[lexer] Rewrite the Lexer!
Implement structs for each state in the lexer. The State objects are responsible for determining how to handle a character and for emitting a Result that tells the driver (the Lexer struct) how to proceed.
This commit is contained in:
parent
d0441965eb
commit
b0b4699476
4 changed files with 135 additions and 70 deletions
111
lexer/src/lib.rs
111
lexer/src/lib.rs
|
@ -2,29 +2,19 @@
|
||||||
* Eryn Wells <eryn@erynwells.me>
|
* Eryn Wells <eryn@erynwells.me>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
use std::iter::Peekable;
|
|
||||||
use chars::Lexable;
|
|
||||||
|
|
||||||
mod chars;
|
mod chars;
|
||||||
mod error;
|
mod error;
|
||||||
|
mod states;
|
||||||
mod token;
|
mod token;
|
||||||
|
|
||||||
pub use error::Error;
|
pub use error::Error;
|
||||||
pub use token::{Lex, Token};
|
pub use token::{Lex, Token};
|
||||||
|
|
||||||
|
use std::iter::Peekable;
|
||||||
|
use states::*;
|
||||||
|
|
||||||
pub type Result = std::result::Result<Lex, Error>;
|
pub type Result = std::result::Result<Lex, Error>;
|
||||||
|
|
||||||
#[derive(Debug, Eq, PartialEq)]
|
|
||||||
enum Resume { Here, AtNext }
|
|
||||||
|
|
||||||
#[derive(Debug, Eq, PartialEq)]
|
|
||||||
enum IterationResult {
|
|
||||||
Finish,
|
|
||||||
Continue,
|
|
||||||
Emit(Token, Resume),
|
|
||||||
Error(Error),
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct Lexer<T> where T: Iterator<Item=char> {
|
pub struct Lexer<T> where T: Iterator<Item=char> {
|
||||||
input: Peekable<T>,
|
input: Peekable<T>,
|
||||||
line: usize,
|
line: usize,
|
||||||
|
@ -39,14 +29,6 @@ impl<T> Lexer<T> where T: Iterator<Item=char> {
|
||||||
offset: 0
|
offset: 0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn emit(&self, token: Token, resume: Resume) -> IterationResult {
|
|
||||||
IterationResult::Emit(token, resume)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn fail(&self, msg: String) -> IterationResult {
|
|
||||||
IterationResult::Error(Error::new(msg))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T> Lexer<T> where T: Iterator<Item=char> {
|
impl<T> Lexer<T> where T: Iterator<Item=char> {
|
||||||
|
@ -66,59 +48,48 @@ impl<T> Iterator for Lexer<T> where T: Iterator<Item=char> {
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
let mut buffer = String::new();
|
let mut buffer = String::new();
|
||||||
|
let mut state: Box<states::State> = Box::new(states::Begin{});
|
||||||
|
let mut out: Option<Self::Item> = None;
|
||||||
loop {
|
loop {
|
||||||
let peek = self.input.peek().map(char::clone);
|
let peek = self.input.peek().map(char::clone);
|
||||||
let result = if buffer.is_empty() {
|
println!("lexing {:?} in state {:?}, buffer = {:?}", peek, state, buffer);
|
||||||
match peek {
|
match peek {
|
||||||
Some(c) if c.is_left_paren() => {
|
// TODO: Give the current state a chance to react.
|
||||||
buffer.push(c);
|
None => match state.none() {
|
||||||
self.emit(Token::LeftParen, Resume::AtNext)
|
Ok(None) => break,
|
||||||
|
Ok(Some(token)) => {
|
||||||
|
out = Some(Ok(Lex::new(token, &buffer, self.line, self.offset)));
|
||||||
|
break;
|
||||||
},
|
},
|
||||||
Some(c) if c.is_right_paren() => {
|
Err(msg) => panic!("{}", msg)
|
||||||
buffer.push(c);
|
},
|
||||||
self.emit(Token::RightParen, Resume::AtNext)
|
Some(c) => {
|
||||||
},
|
let result = state.lex(c);
|
||||||
Some(c) if c.is_whitespace() => {
|
match result {
|
||||||
self.handle_whitespace(c);
|
StateResult::Continue => {
|
||||||
IterationResult::Continue
|
buffer.push(c);
|
||||||
},
|
self.input.next();
|
||||||
Some(c) if c.is_identifier_initial() => {
|
},
|
||||||
buffer.push(c);
|
StateResult::Advance { to } => {
|
||||||
IterationResult::Continue
|
buffer.push(c);
|
||||||
},
|
self.input.next();
|
||||||
Some(c) => self.fail(format!("Invalid character: {}", c)),
|
state = to;
|
||||||
// We found EOF and there's no pending string, so just finish.
|
},
|
||||||
None => IterationResult::Finish,
|
StateResult::Emit(token, resume) => {
|
||||||
|
if resume == Resume::AtNext {
|
||||||
|
buffer.push(c);
|
||||||
|
self.input.next();
|
||||||
|
}
|
||||||
|
out = Some(Ok(Lex::new(token, &buffer, self.line, self.offset)));
|
||||||
|
break;
|
||||||
|
},
|
||||||
|
StateResult::Fail { msg } => {
|
||||||
|
panic!("{}", msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
|
||||||
match peek {
|
|
||||||
Some(c) if c.is_identifier_subsequent() => {
|
|
||||||
buffer.push(c);
|
|
||||||
IterationResult::Continue
|
|
||||||
}
|
|
||||||
Some(c) if c.is_identifier_delimiter() =>
|
|
||||||
self.emit(Token::Id, Resume::Here),
|
|
||||||
Some(c) => self.fail(format!("Invalid character: {}", c)),
|
|
||||||
// Found EOF. Emit what we have and finish.
|
|
||||||
// Note: the Resume argument doesn't matter in this case since the input
|
|
||||||
// iterator will always be None from here on.
|
|
||||||
None => self.emit(Token::Id, Resume::Here),
|
|
||||||
}
|
|
||||||
};
|
|
||||||
match result {
|
|
||||||
IterationResult::Finish => break,
|
|
||||||
IterationResult::Continue => self.input.next(),
|
|
||||||
IterationResult::Emit(token, resume) => {
|
|
||||||
if resume == Resume::AtNext {
|
|
||||||
self.input.next();
|
|
||||||
}
|
|
||||||
let lex = Lex::new(token, &buffer, self.line, self.offset);
|
|
||||||
return Some(Ok(lex))
|
|
||||||
},
|
|
||||||
IterationResult::Error(err) => return Some(Err(err)),
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
None
|
out
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
31
lexer/src/states/begin.rs
Normal file
31
lexer/src/states/begin.rs
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
/* lexer/src/states/begin.rs
|
||||||
|
* Eryn Wells <eryn@erynwells.me>
|
||||||
|
*/
|
||||||
|
|
||||||
|
use chars::Lexable;
|
||||||
|
use token::Token;
|
||||||
|
use states::{Resume, State, StateResult};
|
||||||
|
use states::id::IdSub;
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct Begin;
|
||||||
|
|
||||||
|
impl State for Begin {
|
||||||
|
fn lex(&mut self, c: char) -> StateResult {
|
||||||
|
match c {
|
||||||
|
c if c.is_left_paren() => StateResult::Emit(Token::LeftParen, Resume::AtNext),
|
||||||
|
c if c.is_right_paren() => StateResult::Emit(Token::RightParen, Resume::AtNext),
|
||||||
|
// TODO: Figure out some way to track newlines.
|
||||||
|
c if c.is_whitespace() => StateResult::Continue,
|
||||||
|
c if c.is_identifier_initial() => StateResult::Advance { to: Box::new(IdSub{}) },
|
||||||
|
_ => {
|
||||||
|
let msg = format!("Invalid character: {}", c);
|
||||||
|
StateResult::Fail { msg }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn none(&mut self) -> Result<Option<Token>, String> {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
}
|
27
lexer/src/states/id.rs
Normal file
27
lexer/src/states/id.rs
Normal file
|
@ -0,0 +1,27 @@
|
||||||
|
/* lexer/src/states/id.rs
|
||||||
|
* Eryn Wells <eryn@erynwells.me>
|
||||||
|
*/
|
||||||
|
|
||||||
|
use chars::Lexable;
|
||||||
|
use states::{Resume, State, StateResult};
|
||||||
|
use token::Token;
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct IdSub;
|
||||||
|
|
||||||
|
impl State for IdSub {
|
||||||
|
fn lex(&mut self, c: char) -> StateResult {
|
||||||
|
match c {
|
||||||
|
c if c.is_identifier_subsequent() => StateResult::Continue,
|
||||||
|
c if c.is_identifier_delimiter() => StateResult::Emit(Token::Id, Resume::Here),
|
||||||
|
_ => {
|
||||||
|
let msg = format!("Invalid character: {}", c);
|
||||||
|
StateResult::Fail { msg }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn none(&mut self) -> Result<Option<Token>, String> {
|
||||||
|
Ok(Some(Token::Id))
|
||||||
|
}
|
||||||
|
}
|
36
lexer/src/states/mod.rs
Normal file
36
lexer/src/states/mod.rs
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
/* lexer/src/states/mod.rs
|
||||||
|
* Eryn Wells <eryn@erynwells.me>
|
||||||
|
*/
|
||||||
|
|
||||||
|
mod begin;
|
||||||
|
mod id;
|
||||||
|
|
||||||
|
pub use self::begin::Begin;
|
||||||
|
|
||||||
|
use std::fmt::Debug;
|
||||||
|
use token::Token;
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub enum StateResult {
|
||||||
|
/// Consume the character, remain on this state.
|
||||||
|
Continue,
|
||||||
|
/// Consume the character, advance to the provided state.
|
||||||
|
Advance { to: Box<State> },
|
||||||
|
/// Emit a Lex with the provided Token and the accumulated buffer. The Resume value indicates
|
||||||
|
/// whether to revisit the current input character or advance to the next one.
|
||||||
|
Emit(Token, Resume),
|
||||||
|
Fail { msg: String }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Where the input should be positioned once a token has been emitted.
///
/// This is a plain fieldless tag, so it derives `Clone` and `Copy`; callers
/// can compare it or pass it by value without giving up their own copy.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Resume {
    /// Revisit the current input character in the next state.
    Here,
    /// Advance the input to the next character before beginning the next token.
    AtNext,
}
|
||||||
|
|
||||||
|
/// A single state of the lexer's state machine. The driver (the `Lexer`
/// struct) feeds the current state one character at a time and acts on the
/// value the state returns.
pub trait State: Debug {
|
||||||
|
/// Handle the input character `c` and report, via a `StateResult`, how the
/// driver should proceed.
fn lex(&mut self, c: char) -> StateResult;
|
||||||
|
/// Called by the driver when the input is exhausted. `Ok(None)` means there
/// is nothing left to emit, `Ok(Some(token))` asks for one final token built
/// from the accumulated buffer, and `Err` carries an error message.
fn none(&mut self) -> Result<Option<Token>, String>;
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue