From e139cf0c6bb3b6db1376caeed157460d66fb03ff Mon Sep 17 00:00:00 2001 From: Eryn Wells Date: Tue, 4 Sep 2018 17:58:45 -0700 Subject: [PATCH] [lexer] Lex and discard whitespace Closes #12. --- lexer/src/lib.rs | 48 +++++++++++++++++++++++++--------- lexer/src/states/begin.rs | 11 +++++++- lexer/src/states/mod.rs | 3 +++ lexer/src/states/whitespace.rs | 30 +++++++++++++++++++++ lexer/src/token.rs | 8 +++++- 5 files changed, 86 insertions(+), 14 deletions(-) create mode 100644 lexer/src/states/whitespace.rs diff --git a/lexer/src/lib.rs b/lexer/src/lib.rs index 449afc3..393a6de 100644 --- a/lexer/src/lib.rs +++ b/lexer/src/lib.rs @@ -3,7 +3,7 @@ */ use std::iter::Peekable; -use states::*; +use states::{Begin, Resume, StateResult}; mod chars; mod error; @@ -16,8 +16,12 @@ pub use token::{Lex, Token}; pub type Result = std::result::Result; pub struct Lexer where T: Iterator { + /// The input stream. input: Peekable, + + /// Current line number. line: usize, + /// Character offset from the start of the input. 
offset: usize, } @@ -32,15 +36,28 @@ impl Lexer where T: Iterator { fn next(&mut self) -> Option { let out = self.input.next(); + if let Some(c) = out { + self.update_offsets(c); + } out } - fn handle_whitespace(&mut self, c: char) { - if c == '\n' { - self.line += 1; - self.offset = 1; - } else { - self.offset += 1; + fn handle_error(&self, err: Error) { + panic!("{}:{}: {}", self.line, self.offset, err.msg()) + } + + fn prepare_offsets(&mut self) { } + + fn update_offsets(&mut self, c: char) { + // A newline bumps the line and resets the offset to 0; any other character advances the offset by exactly one. + match c { + '\n' => { + self.line += 1; + self.offset = 0; + }, + _ => { + self.offset += 1; + }, } } } @@ -49,8 +66,10 @@ impl Iterator for Lexer where T: Iterator { type Item = Result; fn next(&mut self) -> Option { + self.prepare_offsets(); + let mut buffer = String::new(); - let mut state: Box = Box::new(states::Begin{}); + let mut state: Box = Box::new(Begin::new()); let mut out: Option = None; loop { let peek = self.input.peek().map(char::clone); @@ -62,7 +81,7 @@ impl Iterator for Lexer where T: Iterator { out = Some(Ok(Lex::new(token, &buffer, self.line, self.offset))); break; }, - Err(err) => panic!("{}:{}: {}", self.line, self.offset, err.msg()) + Err(err) => self.handle_error(err) }, Some(c) => { let result = state.lex(c); @@ -76,6 +95,13 @@ impl Iterator for Lexer where T: Iterator { self.next(); state = to; }, + StateResult::Discard(resume) => { + buffer.clear(); + state = Box::new(Begin::new()); + if resume == Resume::AtNext { + self.next(); + } + }, StateResult::Emit(token, resume) => { if resume == Resume::AtNext { buffer.push(c); @@ -84,9 +110,7 @@ impl Iterator for Lexer where T: Iterator { out = Some(Ok(Lex::new(token, &buffer, self.line, self.offset))); break; }, - StateResult::Fail(err) => { - panic!("{}:{}: {}", self.line, self.offset, err.msg()); - } + StateResult::Fail(err) => self.handle_error(err), } }, } diff --git a/lexer/src/states/begin.rs b/lexer/src/states/begin.rs index d8770ee..4231cc6 100644 --- 
a/lexer/src/states/begin.rs +++ b/lexer/src/states/begin.rs @@ -9,13 +9,22 @@ use states::{Resume, State, StateResult}; use states::id::IdSub; use states::hash::Hash; use states::number::{Builder, Digit}; +use states::whitespace::Whitespace; #[derive(Debug)] pub struct Begin; +impl Begin { + pub fn new() -> Begin { + Begin{} + } +} + impl State for Begin { fn lex(&mut self, c: char) -> StateResult { - if c.is_left_paren() { + if c.is_whitespace() { + StateResult::advance(Box::new(Whitespace::new())) + } else if c.is_left_paren() { StateResult::Emit(Token::LeftParen, Resume::AtNext) } else if c.is_right_paren() { StateResult::Emit(Token::RightParen, Resume::AtNext) diff --git a/lexer/src/states/mod.rs b/lexer/src/states/mod.rs index 33f0360..d4fd04f 100644 --- a/lexer/src/states/mod.rs +++ b/lexer/src/states/mod.rs @@ -11,6 +11,7 @@ mod bool; mod hash; mod number; mod id; +mod whitespace; pub use self::begin::Begin; @@ -20,6 +21,8 @@ pub enum StateResult { Continue, /// Consume the character, advance to the provided state. Advance { to: Box }, + /// Discard the input consumed to this point. Resume as specified. + Discard(Resume), /// Emit a Lex with the provided Token and the accumulated buffer. The Resume value indicates /// whether to revisit the current input character or advance to the next one. 
Emit(Token, Resume), diff --git a/lexer/src/states/whitespace.rs b/lexer/src/states/whitespace.rs new file mode 100644 index 0000000..556590d --- /dev/null +++ b/lexer/src/states/whitespace.rs @@ -0,0 +1,30 @@ +/* lexer/src/states/whitespace.rs + * Eryn Wells + */ + +use error::Error; +use states::{Resume, State, StateResult}; +use token::Token; + +#[derive(Debug)] +pub struct Whitespace; + +impl Whitespace { + pub fn new() -> Whitespace { + Whitespace{} + } +} + +impl State for Whitespace { + fn lex(&mut self, c: char) -> StateResult { + if c.is_whitespace() { + StateResult::Continue + } else { + StateResult::Discard(Resume::Here) + } + } + + fn none(&mut self) -> Result, Error> { + Ok(None) + } +} diff --git a/lexer/src/token.rs b/lexer/src/token.rs index c1008d0..0453169 100644 --- a/lexer/src/token.rs +++ b/lexer/src/token.rs @@ -2,7 +2,7 @@ * Eryn Wells */ -#[derive(Debug, Eq, PartialEq)] +#[derive(Debug, Eq)] pub struct Lex { token: Token, value: String, @@ -32,3 +32,9 @@ impl Lex { pub fn token(&self) -> Token { self.token } pub fn value(&self) -> &str { self.value.as_str() } } + +impl PartialEq for Lex { + fn eq(&self, rhs: &Lex) -> bool { + self.token == rhs.token && self.value == rhs.value + } +}