1 unstable release
0.1.0 | Sep 5, 2024 |
---|
#348 in Programming languages
43KB
696 lines
state_machine_parser
The parser based on state machine generated by EBNF rules.
Usage
[dependencies]
state_machine_parser = "0.1.0"
Quickstart
use std::collections::HashMap;
use state_machine_parser::{compile_bnf_rules, debug_print_match_record, MatchRecord, StateMachineParser, StateManager, Token, TokenType};
const RULE: &str = "
MultiplicationExpression = Number {(OperatorMul | OperatorDiv) Number};
Expression = MultiplicationExpression {(OperatorAdd | OperatorSub) MultiplicationExpression};
";
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
enum NumericExpressionTokenType {
Number,
OperatorAdd,
OperatorSub,
OperatorMul,
OperatorDiv,
}
impl TokenType for NumericExpressionTokenType {}
static mut TOKEN_TYPE_CONVERTER: Option<HashMap<Vec<char>, NumericExpressionTokenType>> = None;
impl TryFrom<Vec<char>> for NumericExpressionTokenType {
type Error = ();
fn try_from(value: Vec<char>) -> Result<Self, Self::Error> {
match unsafe{TOKEN_TYPE_CONVERTER.as_ref().unwrap()}.get(&value) {
Some(t) => Ok(t.clone()),
None => Err(())
}
}
}
#[derive(Debug)]
struct NumericExpressionToken {
token_type: NumericExpressionTokenType,
value: usize,
}
impl NumericExpressionToken {
fn number(number: usize) -> Self {
Self { token_type: NumericExpressionTokenType::Number, value: number }
}
fn operator(operator: NumericExpressionTokenType) -> Self {
Self { token_type: operator, value: 0 }
}
}
impl std::fmt::Display for NumericExpressionToken {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}", self)
}
}
impl Token<NumericExpressionTokenType> for NumericExpressionToken {
fn token_type(&self) -> &NumericExpressionTokenType {
&self.token_type
}
}
fn main() {
unsafe {
TOKEN_TYPE_CONVERTER = Some(HashMap::from([
("Number".chars().collect::<Vec<char>>(), NumericExpressionTokenType::Number),
("OperatorAdd".chars().collect::<Vec<char>>(), NumericExpressionTokenType::OperatorAdd),
("OperatorSub".chars().collect::<Vec<char>>(), NumericExpressionTokenType::OperatorSub),
("OperatorMul".chars().collect::<Vec<char>>(), NumericExpressionTokenType::OperatorMul),
("OperatorDiv".chars().collect::<Vec<char>>(), NumericExpressionTokenType::OperatorDiv),
]));
}
let state_manager: StateManager<NumericExpressionTokenType> = compile_bnf_rules(RULE).unwrap();
let expression: Vec<NumericExpressionToken> = vec![
NumericExpressionToken::number(1),
NumericExpressionToken::operator(NumericExpressionTokenType::OperatorMul),
NumericExpressionToken::number(2),
NumericExpressionToken::operator(NumericExpressionTokenType::OperatorAdd),
NumericExpressionToken::number(3),
NumericExpressionToken::operator(NumericExpressionTokenType::OperatorMul),
NumericExpressionToken::number(4),
];
let start_rule: usize = *state_manager.rule_ids.get(&"Expression".chars().collect::<Vec<char>>()).unwrap();
let match_records: Vec<MatchRecord> = StateMachineParser::new(&state_manager).parse(&expression, start_rule).unwrap();
debug_print_match_record(&expression, &match_records, &state_manager.rule_names);
}
Run the above code and you can get the output:
{ Expression
{ MultiplicationExpression
NumericExpressionToken { token_type: Number, value: 1 }
NumericExpressionToken { token_type: OperatorMul, value: 0 }
NumericExpressionToken { token_type: Number, value: 2 }
} MultiplicationExpression
NumericExpressionToken { token_type: OperatorAdd, value: 0 }
{ MultiplicationExpression
NumericExpressionToken { token_type: Number, value: 3 }
NumericExpressionToken { token_type: OperatorMul, value: 0 }
NumericExpressionToken { token_type: Number, value: 4 }
} MultiplicationExpression
} Expression
EBNF
Each EBNF rule has four parts: a left-hand side, a right-hand side, the "=" character separating these two sides and the ";" character marking the end of rule. The left-hand side is the name of the rule and the right-hand side is the description of the rule. The four description forms is explained below.
Form | Semantic |
---|---|
Sequence | Items appear left–to–right, their order in important. |
Choice | Alternative items are enclosed between "(" and ")" (parenthesis) and separated by a "|" (stroke), one item is chosen from this list of alternatives, their order is unimportant. |
Option | The optional item is enclosed between "[" and "]" (square–brackets), the item can be either included or discarded. |
Repetition | The repeatable item is enclosed between "{" and "}" (curly–braces), the item can be repeated zero or more times. |