#state-machine #ebnf #rules #generated #parser #token

state_machine_parser

A parser based on a state machine generated from EBNF rules

1 unstable release

0.1.0 Sep 5, 2024

#348 in Programming languages

MIT license

43KB
696 lines

state_machine_parser

A parser based on a state machine generated from EBNF rules.

Usage

[dependencies]
state_machine_parser = "0.1.0"

Quickstart

use std::collections::HashMap;
use state_machine_parser::{compile_bnf_rules, debug_print_match_record, MatchRecord, StateMachineParser, StateManager, Token, TokenType};

/// EBNF rules for arithmetic over numbers and the four operators.
///
/// `Expression` is composed of `MultiplicationExpression` items, so `*` and `/`
/// group more tightly than `+` and `-`. The text starts with a newline and
/// holds one rule per line, each terminated by `;`.
const RULE: &str = concat!(
    "\n",
    "MultiplicationExpression = Number {(OperatorMul | OperatorDiv) Number};\n",
    "Expression               = MultiplicationExpression {(OperatorAdd | OperatorSub) MultiplicationExpression};\n",
);

/// The kinds of token that can appear in a numeric expression.
///
/// Each variant name is also used as a terminal name in `RULE`.
/// The enum carries no data, so it is `Copy` as well as `Clone`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum NumericExpressionTokenType {
    /// A numeric literal.
    Number,
    /// The addition operator.
    OperatorAdd,
    /// The subtraction operator.
    OperatorSub,
    /// The multiplication operator.
    OperatorMul,
    /// The division operator.
    OperatorDiv,
}

// Registers the enum as a token type for the parser crate; no methods are
// implemented here.
impl TokenType for NumericExpressionTokenType {}

/// Lookup table from a token name (as a `Vec<char>`) to its token type.
///
/// Starts out as `None`; `main` stores the populated table here before the
/// rules are compiled.
static mut TOKEN_TYPE_CONVERTER: Option<HashMap<Vec<char>, NumericExpressionTokenType>> = None;

impl TryFrom<Vec<char>> for NumericExpressionTokenType {
    type Error = ();

    /// Resolves a token name to its token type via `TOKEN_TYPE_CONVERTER`.
    ///
    /// # Errors
    ///
    /// Returns `Err(())` when `value` is not a key of the table.
    ///
    /// # Panics
    ///
    /// Panics if `TOKEN_TYPE_CONVERTER` has not been initialised yet.
    fn try_from(value: Vec<char>) -> Result<Self, Self::Error> {
        // SAFETY: the only write to the static in this example is the single
        // assignment at the start of `main`, which completes before any
        // conversion is requested, and the example starts no other threads.
        // Reading through a raw pointer avoids taking a reference directly
        // from the `static mut`, which newer compilers lint against.
        let table: &Option<HashMap<Vec<char>, NumericExpressionTokenType>> =
            unsafe { &*std::ptr::addr_of!(TOKEN_TYPE_CONVERTER) };

        table
            .as_ref()
            .expect("TOKEN_TYPE_CONVERTER must be initialised before converting token names")
            .get(&value)
            .cloned()
            .ok_or(())
    }
}

/// One input token: its kind plus a numeric payload.
#[derive(Debug)]
struct NumericExpressionToken {
    /// The kind of this token.
    token_type: NumericExpressionTokenType,
    /// The number carried by a `Number` token; operator tokens store 0.
    value: usize,
}

impl NumericExpressionToken {
    /// Creates a `Number` token carrying `number`.
    fn number(number: usize) -> Self {
        let token_type = NumericExpressionTokenType::Number;
        NumericExpressionToken {
            token_type,
            value: number,
        }
    }

    /// Creates a token of kind `operator` with a payload of 0.
    fn operator(operator: NumericExpressionTokenType) -> Self {
        NumericExpressionToken {
            token_type: operator,
            value: 0,
        }
    }
}

/// Displays a token using its derived `Debug` representation.
impl std::fmt::Display for NumericExpressionToken {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Render with a plain `{:?}` spec regardless of the flags set on `f`.
        f.write_fmt(format_args!("{:?}", self))
    }
}

impl Token<NumericExpressionTokenType> for NumericExpressionToken {
    fn token_type(&self) -> &NumericExpressionTokenType {
        &self.token_type
    }
}

/// Fills the token table, compiles `RULE`, parses a fixed token sequence
/// starting from the `Expression` rule and prints the resulting match records.
fn main() {
    // Token names are keyed as `Vec<char>`, matching the table's key type.
    let name = |text: &str| text.chars().collect::<Vec<char>>();

    let converter: HashMap<Vec<char>, NumericExpressionTokenType> = HashMap::from([
        (name("Number"), NumericExpressionTokenType::Number),
        (name("OperatorAdd"), NumericExpressionTokenType::OperatorAdd),
        (name("OperatorSub"), NumericExpressionTokenType::OperatorSub),
        (name("OperatorMul"), NumericExpressionTokenType::OperatorMul),
        (name("OperatorDiv"), NumericExpressionTokenType::OperatorDiv),
    ]);

    // The table is stored first, ahead of compiling the rules.
    // NOTE(review): presumably `compile_bnf_rules` resolves token names through
    // the `TryFrom<Vec<char>>` impl, which reads this table — confirm.
    unsafe {
        TOKEN_TYPE_CONVERTER = Some(converter);
    }

    let state_manager: StateManager<NumericExpressionTokenType> = compile_bnf_rules(RULE).unwrap();

    // Token sequence for `1 * 2 + 3 * 4`.
    let number = NumericExpressionToken::number;
    let operator = NumericExpressionToken::operator;
    let expression: Vec<NumericExpressionToken> = vec![
        number(1),
        operator(NumericExpressionTokenType::OperatorMul),
        number(2),
        operator(NumericExpressionTokenType::OperatorAdd),
        number(3),
        operator(NumericExpressionTokenType::OperatorMul),
        number(4),
    ];

    // Look up the id of the rule to start parsing from.
    let start_rule_name: Vec<char> = name("Expression");
    let start_rule: usize = *state_manager.rule_ids.get(&start_rule_name).unwrap();

    let match_records: Vec<MatchRecord> = StateMachineParser::new(&state_manager)
        .parse(&expression, start_rule)
        .unwrap();
    debug_print_match_record(&expression, &match_records, &state_manager.rule_names);
}

Running the above code produces the following output:

{ Expression
    { MultiplicationExpression
        NumericExpressionToken { token_type: Number, value: 1 }
        NumericExpressionToken { token_type: OperatorMul, value: 0 }
        NumericExpressionToken { token_type: Number, value: 2 }
    } MultiplicationExpression
    NumericExpressionToken { token_type: OperatorAdd, value: 0 }
    { MultiplicationExpression
        NumericExpressionToken { token_type: Number, value: 3 }
        NumericExpressionToken { token_type: OperatorMul, value: 0 }
        NumericExpressionToken { token_type: Number, value: 4 }
    } MultiplicationExpression
} Expression

EBNF

Each EBNF rule has four parts: a left-hand side, a right-hand side, the "=" character separating these two sides, and the ";" character marking the end of the rule. The left-hand side is the name of the rule and the right-hand side is the description of the rule. The four description forms are explained below.

Form Semantic
Sequence Items appear left–to–right, their order is important.
Choice Alternative items are enclosed between "(" and ")" (parentheses) and separated by a "|" (stroke), one item is chosen from this list of alternatives, their order is unimportant.
Option The optional item is enclosed between "[" and "]" (square–brackets), the item can be either included or discarded.
Repetition The repeatable item is enclosed between "{" and "}" (curly–braces), the item can be repeated zero or more times.

No runtime deps