[lexer] Rewrite the Lexer!
Implement structs for each state in the lexer. The State objects are responsible for determining how to handle a character and for emitting a Result that tells the driver (the Lexer struct) how to proceed.
This commit is contained in:
parent
d0441965eb
commit
b0b4699476
4 changed files with 135 additions and 70 deletions
111
lexer/src/lib.rs
111
lexer/src/lib.rs
|
@ -2,29 +2,19 @@
|
|||
* Eryn Wells <eryn@erynwells.me>
|
||||
*/
|
||||
|
||||
use std::iter::Peekable;
|
||||
use chars::Lexable;
|
||||
|
||||
mod chars;
|
||||
mod error;
|
||||
mod states;
|
||||
mod token;
|
||||
|
||||
pub use error::Error;
|
||||
pub use token::{Lex, Token};
|
||||
|
||||
use std::iter::Peekable;
|
||||
use states::*;
|
||||
|
||||
pub type Result = std::result::Result<Lex, Error>;
|
||||
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
enum Resume { Here, AtNext }
|
||||
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
enum IterationResult {
|
||||
Finish,
|
||||
Continue,
|
||||
Emit(Token, Resume),
|
||||
Error(Error),
|
||||
}
|
||||
|
||||
pub struct Lexer<T> where T: Iterator<Item=char> {
|
||||
input: Peekable<T>,
|
||||
line: usize,
|
||||
|
@ -39,14 +29,6 @@ impl<T> Lexer<T> where T: Iterator<Item=char> {
|
|||
offset: 0
|
||||
}
|
||||
}
|
||||
|
||||
fn emit(&self, token: Token, resume: Resume) -> IterationResult {
|
||||
IterationResult::Emit(token, resume)
|
||||
}
|
||||
|
||||
fn fail(&self, msg: String) -> IterationResult {
|
||||
IterationResult::Error(Error::new(msg))
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Lexer<T> where T: Iterator<Item=char> {
|
||||
|
@ -66,59 +48,48 @@ impl<T> Iterator for Lexer<T> where T: Iterator<Item=char> {
|
|||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let mut buffer = String::new();
|
||||
let mut state: Box<states::State> = Box::new(states::Begin{});
|
||||
let mut out: Option<Self::Item> = None;
|
||||
loop {
|
||||
let peek = self.input.peek().map(char::clone);
|
||||
let result = if buffer.is_empty() {
|
||||
match peek {
|
||||
Some(c) if c.is_left_paren() => {
|
||||
buffer.push(c);
|
||||
self.emit(Token::LeftParen, Resume::AtNext)
|
||||
println!("lexing {:?} in state {:?}, buffer = {:?}", peek, state, buffer);
|
||||
match peek {
|
||||
// TODO: Give the current state a chance to react.
|
||||
None => match state.none() {
|
||||
Ok(None) => break,
|
||||
Ok(Some(token)) => {
|
||||
out = Some(Ok(Lex::new(token, &buffer, self.line, self.offset)));
|
||||
break;
|
||||
},
|
||||
Some(c) if c.is_right_paren() => {
|
||||
buffer.push(c);
|
||||
self.emit(Token::RightParen, Resume::AtNext)
|
||||
},
|
||||
Some(c) if c.is_whitespace() => {
|
||||
self.handle_whitespace(c);
|
||||
IterationResult::Continue
|
||||
},
|
||||
Some(c) if c.is_identifier_initial() => {
|
||||
buffer.push(c);
|
||||
IterationResult::Continue
|
||||
},
|
||||
Some(c) => self.fail(format!("Invalid character: {}", c)),
|
||||
// We found EOF and there's no pending string, so just finish.
|
||||
None => IterationResult::Finish,
|
||||
Err(msg) => panic!("{}", msg)
|
||||
},
|
||||
Some(c) => {
|
||||
let result = state.lex(c);
|
||||
match result {
|
||||
StateResult::Continue => {
|
||||
buffer.push(c);
|
||||
self.input.next();
|
||||
},
|
||||
StateResult::Advance { to } => {
|
||||
buffer.push(c);
|
||||
self.input.next();
|
||||
state = to;
|
||||
},
|
||||
StateResult::Emit(token, resume) => {
|
||||
if resume == Resume::AtNext {
|
||||
buffer.push(c);
|
||||
self.input.next();
|
||||
}
|
||||
out = Some(Ok(Lex::new(token, &buffer, self.line, self.offset)));
|
||||
break;
|
||||
},
|
||||
StateResult::Fail { msg } => {
|
||||
panic!("{}", msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
match peek {
|
||||
Some(c) if c.is_identifier_subsequent() => {
|
||||
buffer.push(c);
|
||||
IterationResult::Continue
|
||||
}
|
||||
Some(c) if c.is_identifier_delimiter() =>
|
||||
self.emit(Token::Id, Resume::Here),
|
||||
Some(c) => self.fail(format!("Invalid character: {}", c)),
|
||||
// Found EOF. Emit what we have and finish.
|
||||
// Note: the Resume argument doesn't matter in this case since the input
|
||||
// iterator will always be None from here on.
|
||||
None => self.emit(Token::Id, Resume::Here),
|
||||
}
|
||||
};
|
||||
match result {
|
||||
IterationResult::Finish => break,
|
||||
IterationResult::Continue => self.input.next(),
|
||||
IterationResult::Emit(token, resume) => {
|
||||
if resume == Resume::AtNext {
|
||||
self.input.next();
|
||||
}
|
||||
let lex = Lex::new(token, &buffer, self.line, self.offset);
|
||||
return Some(Ok(lex))
|
||||
},
|
||||
IterationResult::Error(err) => return Some(Err(err)),
|
||||
};
|
||||
}
|
||||
None
|
||||
out
|
||||
}
|
||||
}
|
||||
|
|
31
lexer/src/states/begin.rs
Normal file
31
lexer/src/states/begin.rs
Normal file
|
@ -0,0 +1,31 @@
|
|||
/* lexer/src/states/begin.rs
|
||||
* Eryn Wells <eryn@erynwells.me>
|
||||
*/
|
||||
|
||||
use chars::Lexable;
|
||||
use token::Token;
|
||||
use states::{Resume, State, StateResult};
|
||||
use states::id::IdSub;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Begin;
|
||||
|
||||
impl State for Begin {
|
||||
fn lex(&mut self, c: char) -> StateResult {
|
||||
match c {
|
||||
c if c.is_left_paren() => StateResult::Emit(Token::LeftParen, Resume::AtNext),
|
||||
c if c.is_right_paren() => StateResult::Emit(Token::RightParen, Resume::AtNext),
|
||||
// TODO: Figure out some way to track newlines.
|
||||
c if c.is_whitespace() => StateResult::Continue,
|
||||
c if c.is_identifier_initial() => StateResult::Advance { to: Box::new(IdSub{}) },
|
||||
_ => {
|
||||
let msg = format!("Invalid character: {}", c);
|
||||
StateResult::Fail { msg }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn none(&mut self) -> Result<Option<Token>, String> {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
27
lexer/src/states/id.rs
Normal file
27
lexer/src/states/id.rs
Normal file
|
@ -0,0 +1,27 @@
|
|||
/* lexer/src/states/id.rs
|
||||
* Eryn Wells <eryn@erynwells.me>
|
||||
*/
|
||||
|
||||
use chars::Lexable;
|
||||
use states::{Resume, State, StateResult};
|
||||
use token::Token;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct IdSub;
|
||||
|
||||
impl State for IdSub {
|
||||
fn lex(&mut self, c: char) -> StateResult {
|
||||
match c {
|
||||
c if c.is_identifier_subsequent() => StateResult::Continue,
|
||||
c if c.is_identifier_delimiter() => StateResult::Emit(Token::Id, Resume::Here),
|
||||
_ => {
|
||||
let msg = format!("Invalid character: {}", c);
|
||||
StateResult::Fail { msg }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn none(&mut self) -> Result<Option<Token>, String> {
|
||||
Ok(Some(Token::Id))
|
||||
}
|
||||
}
|
36
lexer/src/states/mod.rs
Normal file
36
lexer/src/states/mod.rs
Normal file
|
@ -0,0 +1,36 @@
|
|||
/* lexer/src/states/mod.rs
|
||||
* Eryn Wells <eryn@erynwells.me>
|
||||
*/
|
||||
|
||||
mod begin;
|
||||
mod id;
|
||||
|
||||
pub use self::begin::Begin;
|
||||
|
||||
use std::fmt::Debug;
|
||||
use token::Token;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum StateResult {
|
||||
/// Consume the character, remain on this state.
|
||||
Continue,
|
||||
/// Consume the character, advance to the provided state.
|
||||
Advance { to: Box<State> },
|
||||
/// Emit a Lex with the provided Token and the accumulated buffer. The Resume value indicates
|
||||
/// whether to revisit the current input character or advance to the next one.
|
||||
Emit(Token, Resume),
|
||||
Fail { msg: String }
|
||||
}
|
||||
|
||||
/// Where the lexer should pick up the input after a token has been emitted.
///
/// This is a fieldless marker that is compared by value, so it is `Copy`: callers can inspect
/// or pass it along without giving up ownership.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Resume {
    /// Revisit the current input character in the next state.
    Here,
    /// Advance the input to the next character before beginning the next token.
    AtNext,
}
|
||||
|
||||
pub trait State: Debug {
|
||||
fn lex(&mut self, c: char) -> StateResult;
|
||||
fn none(&mut self) -> Result<Option<Token>, String>;
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue