20 unstable releases (5 breaking)
0.6.0 | Apr 12, 2020 |
---|---|
0.5.1 | Oct 20, 2019 |
0.4.2 | Jul 12, 2019 |
0.3.1 | Mar 30, 2019 |
0.2.1 | Nov 25, 2018 |
#502 in Text processing
53,016 downloads per month
Used in 10 crates
(2 directly)
27KB
296 lines
uwl
A Unicode-aware lexer, as it was called in its initial form.
This crate supplies a stream over a string source for manipulating its byte contents, either directly with the bytes themselves, or through their Unicode codepoint character representations.
License
Licensed under either of
- Apache License, Version 2.0 (LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0)
- MIT license (LICENSE-MIT or http://opensource.org/licenses/MIT)
at your option.
Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions.
lib.rs:
A stream designed for the individual manipulation of bytes and Unicode codepoint characters.
Example
Using the stream for creating a lexer to tokenize the English language.
use uwl::Stream;
/// The category assigned to each token produced by `lex`.
#[derive(Debug, PartialEq)]
enum TokenKind {
    /// A run of ASCII alphabetic characters.
    Ident,
    /// A run of ASCII digits.
    Number,
    /// The `?` character.
    Question,
    /// The `!` character.
    Exclamation,
    /// The `,` character.
    Comma,
    /// The `.` character.
    Point,
    /// Anything the lexer does not recognise.
    Illegal,
}
/// A single lexed token: its category plus the slice of input it was read from.
#[derive(Debug, PartialEq)]
struct Token<'a> {
    /// Which category of token this is.
    kind: TokenKind,
    /// The literal text of the token, borrowed for the lifetime `'a`.
    lit: &'a str,
}
impl<'a> Token<'a> {
fn new(kind: TokenKind, lit: &'a str) -> Self {
Self { kind, lit }
}
}
/// Reads the next token from `stream`, returning `None` once `stream.current()`
/// has nothing left to offer.
///
/// ASCII whitespace in front of the token is skipped. A run of ASCII digits
/// becomes a `Number`, a run of ASCII letters becomes an `Ident`, and each of
/// `?`, `!`, `,` and `.` becomes its own one-character token. Any other
/// character is returned whole as a single `Illegal` token, including
/// characters that occupy more than one byte.
fn lex<'a>(stream: &mut Stream<'a>) -> Option<Token<'a>> {
    let mut b: u8 = stream.current()?;

    if b.is_ascii_whitespace() {
        // Ignore whitespace, then look at whatever follows it. After the run
        // has been consumed the current byte (if any) is not whitespace, so a
        // single re-read is enough and no recursion is needed.
        stream.take_while(|b| b.is_ascii_whitespace());
        b = stream.current()?;
    }

    if b.is_ascii_digit() {
        let lit = stream.take_while(|b| b.is_ascii_digit());
        return Some(Token::new(TokenKind::Number, lit));
    }

    if b.is_ascii_alphabetic() {
        let lit = stream.take_while(|b| b.is_ascii_alphabetic());
        return Some(Token::new(TokenKind::Ident, lit));
    }

    let kind = match b {
        b'?' => TokenKind::Question,
        b'!' => TokenKind::Exclamation,
        b',' => TokenKind::Comma,
        b'.' => TokenKind::Point,
        // An invalid token
        _ => TokenKind::Illegal,
    };

    // Slice by the width of the whole character rather than a fixed `[..1]`:
    // cutting a `str` in the middle of a multi-byte character panics, which
    // is what happened for non-ASCII input reaching the `Illegal` arm.
    let rest = stream.rest();
    let width = rest.chars().next()?.len_utf8();
    let lit = &rest[..width];

    // Step past every byte of the character so the cursor lands on the start
    // of the next one (a single step for the ASCII punctuation above).
    for _ in 0..width {
        stream.next();
    }

    Some(Token::new(kind, lit))
}
fn main() {
let mut stream = Stream::new("Hello, world! ...world? Hello?");
assert_eq!(lex(&mut stream), Some(Token::new(TokenKind::Ident, "Hello")));
assert_eq!(lex(&mut stream), Some(Token::new(TokenKind::Comma, ",")));
assert_eq!(lex(&mut stream), Some(Token::new(TokenKind::Ident, "world")));
assert_eq!(lex(&mut stream), Some(Token::new(TokenKind::Exclamation, "!")));
assert_eq!(lex(&mut stream), Some(Token::new(TokenKind::Point, ".")));
assert_eq!(lex(&mut stream), Some(Token::new(TokenKind::Point, ".")));
assert_eq!(lex(&mut stream), Some(Token::new(TokenKind::Point, ".")));
assert_eq!(lex(&mut stream), Some(Token::new(TokenKind::Ident, "world")));
assert_eq!(lex(&mut stream), Some(Token::new(TokenKind::Question, "?")));
assert_eq!(lex(&mut stream), Some(Token::new(TokenKind::Ident, "Hello")));
assert_eq!(lex(&mut stream), Some(Token::new(TokenKind::Question, "?")));
// Reached the end
assert_eq!(lex(&mut stream), None);
}