From fc947280aeb7665a8c19d2ab67cde2609c37a4ca Mon Sep 17 00:00:00 2001 From: Eryn Wells Date: Sat, 15 Apr 2017 09:37:12 -0700 Subject: [PATCH] Move the lexer to its own sibillexer module Lots of failing tests right now, unfortunately. :( --- lexer/Cargo.toml | 7 + {sibil/src/lexer => lexer/src}/char.rs | 2 +- {sibil/src/lexer => lexer/src}/charset.rs | 0 sibil/src/lexer/mod.rs => lexer/src/lexer.rs | 241 ++----------------- lexer/src/lib.rs | 169 +++++++++++++ lexer/src/named_char.rs | 45 ++++ {sibil/src/lexer => lexer/src}/number.rs | 38 ++- {sibil/src/lexer => lexer/src}/str.rs | 0 {sibil/src/lexer => lexer/src}/token.rs | 6 +- 9 files changed, 264 insertions(+), 244 deletions(-) create mode 100644 lexer/Cargo.toml rename {sibil/src/lexer => lexer/src}/char.rs (99%) rename {sibil/src/lexer => lexer/src}/charset.rs (100%) rename sibil/src/lexer/mod.rs => lexer/src/lexer.rs (68%) create mode 100644 lexer/src/lib.rs create mode 100644 lexer/src/named_char.rs rename {sibil/src/lexer => lexer/src}/number.rs (78%) rename {sibil/src/lexer => lexer/src}/str.rs (100%) rename {sibil/src/lexer => lexer/src}/token.rs (84%) diff --git a/lexer/Cargo.toml b/lexer/Cargo.toml new file mode 100644 index 0000000..f8eb105 --- /dev/null +++ b/lexer/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "sibillexer" +version = "0.1.0" +authors = ["Eryn Wells "] + +[dependencies] +sibiltypes = { path = "../types" } diff --git a/sibil/src/lexer/char.rs b/lexer/src/char.rs similarity index 99% rename from sibil/src/lexer/char.rs rename to lexer/src/char.rs index ebf9ab7..d134a18 100644 --- a/sibil/src/lexer/char.rs +++ b/lexer/src/char.rs @@ -2,7 +2,7 @@ * Eryn Wells */ -use lexer::charset; +use charset; pub trait Lexable { fn is_character_leader(&self) -> bool; diff --git a/sibil/src/lexer/charset.rs b/lexer/src/charset.rs similarity index 100% rename from sibil/src/lexer/charset.rs rename to lexer/src/charset.rs diff --git a/sibil/src/lexer/mod.rs b/lexer/src/lexer.rs similarity index 68% rename from sibil/src/lexer/mod.rs rename to lexer/src/lexer.rs index d3742e6..a60f778 100644 --- a/sibil/src/lexer/mod.rs +++ b/lexer/src/lexer.rs @@ -2,69 +2,19 @@ * Eryn Wells */ -pub mod token; -pub use self::token::Lex; -pub use self::token::Token; - -mod char; -mod charset; -mod number; -mod str; - -mod named_char { - use std::collections::HashSet; - use types::Char; - - const ALARM: &'static str = "alarm"; - const BACKSPACE: &'static str = "backspace"; - const DELETE: &'static str = "delete"; - const ESCAPE: &'static str = "escape"; - const NEWLINE: &'static str = "newline"; - const NULL: &'static str = "null"; - const RETURN: &'static str = "return"; - const SPACE: &'static str = "space"; - const TAB: &'static str = "tab"; - - pub fn set() -> HashSet<&'static str> { - let mut set: HashSet<&'static str> = HashSet::new(); - set.insert(ALARM); - set.insert(BACKSPACE); - set.insert(DELETE); - set.insert(ESCAPE); - set.insert(NEWLINE); - set.insert(NULL); - set.insert(RETURN); - set.insert(SPACE); - set.insert(TAB); - set - } - - pub fn char_named_by(named: &str) -> Char { - Char::new(match named { - ALARM => '\x07', - BACKSPACE => '\x08', - DELETE => '\x7F', - ESCAPE => '\x1B', - NEWLINE => '\n', - NULL => '\0', - RETURN => '\r', - SPACE => ' ', - TAB => '\t', - _ => panic!("char_named_by called with invalid named char string") - }) - } -} - use std::collections::HashSet; +use sibiltypes::{Bool, Char}; -use types::{Bool, Char}; -use self::char::Lexable; -use self::number::Exactness; -use self::number::NumberBuilder; -use self::number::Radix; -use self::number::Sign; -use self::str::CharAt; -use self::str::RelativeIndexable; +use char::Lexable; +use named_char; +use number::Exactness; +use number::NumberBuilder; +use number::Radix; +use number::Sign; +use str::CharAt; +use str::RelativeIndexable; +use token::Lex; +use token::Token; type StateResult = Result, String>; @@ -265,7 +215,7 @@ impl Lexer { if candidates.len() > 0 { self.state = State::NamedChar(candidates, lower_c); } else { - return self.token_result(Token::Character(Char::new(c))); + return self.token_result(Token::Character(Char(c))); } Ok(None) } @@ -280,7 +230,7 @@ impl Lexer { if c.is_identifier_delimiter() || c.is_eof() { if progress.len() == 1 { self.retract(); - return self.token_result(Token::Character(Char::new(progress.chars().next().unwrap()))); + return self.token_result(Token::Character(Char(progress.chars().next().unwrap()))); } else { return self.generic_error(c); @@ -337,7 +287,7 @@ impl Lexer { fn state_hash(&mut self, c: char) -> StateResult { if c.is_boolean_true() || c.is_boolean_false() { self.advance(); - return self.token_result(Token::Boolean(Bool::new(c.is_boolean_true()))); + return self.token_result(Token::Boolean(Bool(c.is_boolean_true()))); } else if c.is_left_paren() { self.advance(); @@ -580,166 +530,3 @@ impl HasResult for StateResult { } } } - -// -// UNIT TESTING -// - -#[cfg(test)] -mod tests { - use types::{Bool, Char, Number}; - use std::iter::Iterator; - use super::*; - - #[test] - fn finds_parens() { - check_single_token("(", Token::LeftParen); - check_single_token(")", Token::RightParen); - check_single_token("#(", Token::LeftVectorParen); - } - - #[test] - fn finds_characters() { - check_single_token("#\\a", Token::Character(Char::new('a'))); - check_single_token("#\\n", Token::Character(Char::new('n'))); - check_single_token("#\\s", Token::Character(Char::new('s'))); - } - - #[test] - fn finds_named_characters() { - check_single_token("#\\newline", Token::Character(Char::new('\n'))); - check_single_token("#\\null", Token::Character(Char::new('\0'))); - check_single_token("#\\space", Token::Character(Char::new(' '))); - } - - #[test] - fn finds_dots() { - check_single_token(".", Token::Dot); - - let mut lexer = Lexer::new("abc . abc"); - assert_next_token(&mut lexer, &Token::Id(String::from("abc"))); - assert_next_token(&mut lexer, &Token::Dot); - assert_next_token(&mut lexer, &Token::Id(String::from("abc"))); - } - - #[test] - fn finds_identifiers() { - let tok = |s: &str| { check_single_token(s, Token::Id(String::from(s))); }; - tok("abc"); - tok("number?"); - tok("+"); - tok("-"); - } - - #[test] - fn finds_booleans() { - check_single_token("#t", Token::Boolean(Bool::new(true))); - check_single_token("#f", Token::Boolean(Bool::new(false))); - } - - #[test] - fn finds_comments() { - let s = "; a comment"; - check_single_token(s, Token::Comment(String::from(s))); - } - - #[test] - fn finds_escaped_characters_in_strings() { - check_single_token("\"\\\\\"", Token::String(String::from("\\"))); - check_single_token("\"\\\"\"", Token::String(String::from("\""))); - check_single_token("\"\\n\"", Token::String(String::from("\n"))); - } - - #[test] - fn finds_numbers() { - check_single_token("34", Token::Number(Number::from_float(34.0))); - check_single_token(".34", Token::Number(Number::from_float(0.34))); - check_single_token("0.34", Token::Number(Number::from_float(0.34))); - } - - #[test] - fn finds_negative_numbers() { - check_single_token("-3", Token::Number(Number::from_int(-3))); - check_single_token("-0", Token::Number(Number::from_int(-0))); - check_single_token("-0.56", Token::Number(Number::from_float(-0.56))); - check_single_token("-3.14159", Token::Number(Number::from_float(-3.14159))); - } - - #[test] - fn finds_bin_numbers() { - check_single_token("#b0", Token::Number(Number::from_int(0b0))); - check_single_token("#b01011", Token::Number(Number::from_int(0b01011))); - } - - #[test] - fn finds_dec_numbers() { - check_single_token("34", Token::Number(Number::from_float(34.0))); - check_single_token("#d89", Token::Number(Number::from_int(89))); - } - - #[test] - fn finds_oct_numbers() { - check_single_token("#o45", Token::Number(Number::from_int(0o45))); - } - - #[test] - fn finds_exact_numbers() { - check_single_token("#e45", Token::Number(Number::from_int(45))); - check_single_token("#e-45", Token::Number(Number::from_int(-45))); - } - - #[test] - fn finds_hex_numbers() { - check_single_token("#h4A65", Token::Number(Number::from_int(0x4A65))); - } - - #[test] - fn finds_quote() { - check_single_token("'", Token::Quote); - } - - #[test] - fn finds_strings() { - check_single_token("\"\"", Token::String(String::from(""))); - check_single_token("\"abc\"", Token::String(String::from("abc"))); - } - - #[test] - fn lexes_simple_expression() { - check_tokens("(+ 3.4 6.8)", vec![ - Token::LeftParen, - Token::Id(String::from("+")), - Token::Number(Number::from_float(3.4)), - Token::Number(Number::from_float(6.8)), - Token::RightParen]); - } - - #[test] - fn lexes_quoted_identifier() { - check_tokens("'abc", vec![Token::Quote, Token::Id(String::from("abc"))]); - } - - fn check_single_token(input: &str, expected: Token) { - let mut lexer = Lexer::new(input); - assert_next_token(&mut lexer, &expected); - } - - fn check_tokens(input: &str, expected: Vec) { - let lexer = Lexer::new(input); - let mut expected_iter = expected.iter(); - for lex in lexer { - if let Some(expected_token) = expected_iter.next() { - assert_eq!(lex.token, *expected_token); - } - else { - assert!(false, "Found a token we didn't expect: {:?}", lex.token); - } - } - // TODO: Check that all expected tokens are consumed. - } - - fn assert_next_token(lexer: &mut Lexer, expected: &Token) { - let lex = lexer.next().unwrap(); - assert_eq!(lex.token, *expected); - } -} diff --git a/lexer/src/lib.rs b/lexer/src/lib.rs new file mode 100644 index 0000000..4ca73eb --- /dev/null +++ b/lexer/src/lib.rs @@ -0,0 +1,169 @@ +extern crate sibiltypes; + +mod char; +mod charset; +mod lexer; +mod named_char; +mod number; +mod str; +mod token; + +#[cfg(test)] +mod tests { + use sibiltypes::{Bool, Char, Number}; + use std::iter::Iterator; + use lexer::Lexer; + use token::Token; + + #[test] + fn finds_parens() { + check_single_token("(", Token::LeftParen); + check_single_token(")", Token::RightParen); + check_single_token("#(", Token::LeftVectorParen); + } + + #[test] + fn finds_characters() { + check_single_token("#\\a", Token::Character(Char('a'))); + check_single_token("#\\n", Token::Character(Char('n'))); + check_single_token("#\\s", Token::Character(Char('s'))); + } + + #[test] + fn finds_named_characters() { + check_single_token("#\\newline", Token::Character(Char('\n'))); + check_single_token("#\\null", Token::Character(Char('\0'))); + check_single_token("#\\space", Token::Character(Char(' '))); + } + + #[test] + fn finds_dots() { + check_single_token(".", Token::Dot); + + let mut lexer = Lexer::new("abc . abc"); + assert_next_token(&mut lexer, &Token::Id(String::from("abc"))); + assert_next_token(&mut lexer, &Token::Dot); + assert_next_token(&mut lexer, &Token::Id(String::from("abc"))); + } + + #[test] + fn finds_identifiers() { + let tok = |s: &str| { check_single_token(s, Token::Id(String::from(s))); }; + tok("abc"); + tok("number?"); + tok("+"); + tok("-"); + } + + #[test] + fn finds_booleans() { + check_single_token("#t", Token::Boolean(Bool(true))); + check_single_token("#f", Token::Boolean(Bool(false))); + } + + #[test] + fn finds_comments() { + let s = "; a comment"; + check_single_token(s, Token::Comment(String::from(s))); + } + + #[test] + fn finds_escaped_characters_in_strings() { + check_single_token("\"\\\\\"", Token::String(String::from("\\"))); + check_single_token("\"\\\"\"", Token::String(String::from("\""))); + check_single_token("\"\\n\"", Token::String(String::from("\n"))); + } + + #[test] + fn finds_numbers() { + check_single_token("34", Token::Number(Number::from_int(34, true))); + check_single_token(".34", Token::Number(Number::from_float(0.34, false))); + check_single_token("0.34", Token::Number(Number::from_float(0.34, false))); + } + + #[test] + fn finds_negative_numbers() { + check_single_token("-3", Token::Number(Number::from_int(-3, true))); + check_single_token("-0", Token::Number(Number::from_int(-0, true))); + check_single_token("-0.56", Token::Number(Number::from_float(-0.56, false))); + check_single_token("-3.14159", Token::Number(Number::from_float(-3.14159, false))); + } + + #[test] + fn finds_bin_numbers() { + check_single_token("#b0", Token::Number(Number::from_int(0b0, true))); + check_single_token("#b01011", Token::Number(Number::from_int(0b01011, true))); + } + + #[test] + fn finds_dec_numbers() { + check_single_token("34", Token::Number(Number::from_int(34, true))); + check_single_token("#d89", Token::Number(Number::from_int(89, true))); + } + + #[test] + fn finds_oct_numbers() { + check_single_token("#o45", Token::Number(Number::from_int(0o45, true))); + } + + #[test] + fn finds_exact_numbers() { + check_single_token("#e45", Token::Number(Number::from_int(45, true))); + check_single_token("#e-45", Token::Number(Number::from_int(-45, true))); + } + + #[test] + fn finds_hex_numbers() { + check_single_token("#h4A65", Token::Number(Number::from_int(0x4A65, true))); + } + + #[test] + fn finds_quote() { + check_single_token("'", Token::Quote); + } + + #[test] + fn finds_strings() { + check_single_token("\"\"", Token::String(String::from(""))); + check_single_token("\"abc\"", Token::String(String::from("abc"))); + } + + #[test] + fn lexes_simple_expression() { + check_tokens("(+ 3.4 6.8)", vec![ + Token::LeftParen, + Token::Id(String::from("+")), + Token::Number(Number::from_float(3.4, false)), + Token::Number(Number::from_float(6.8, false)), + Token::RightParen]); + } + + #[test] + fn lexes_quoted_identifier() { + check_tokens("'abc", vec![Token::Quote, Token::Id(String::from("abc"))]); + } + + fn check_single_token(input: &str, expected: Token) { + let mut lexer = Lexer::new(input); + assert_next_token(&mut lexer, &expected); + } + + fn check_tokens(input: &str, expected: Vec) { + let lexer = Lexer::new(input); + let mut expected_iter = expected.iter(); + for lex in lexer { + if let Some(expected_token) = expected_iter.next() { + assert_eq!(lex.token, *expected_token); + } + else { + assert!(false, "Found a token we didn't expect: {:?}", lex.token); + } + } + // TODO: Check that all expected tokens are consumed. + } + + fn assert_next_token(lexer: &mut Lexer, expected: &Token) { + let lex = lexer.next().unwrap(); + assert_eq!(lex.token, *expected); + } +} diff --git a/lexer/src/named_char.rs b/lexer/src/named_char.rs new file mode 100644 index 0000000..36675e9 --- /dev/null +++ b/lexer/src/named_char.rs @@ -0,0 +1,45 @@ +/* lexer/src/named_char.rs + * Eryn Wells + */ + +use std::collections::HashSet; +use sibiltypes::Char; + +const ALARM: &'static str = "alarm"; +const BACKSPACE: &'static str = "backspace"; +const DELETE: &'static str = "delete"; +const ESCAPE: &'static str = "escape"; +const NEWLINE: &'static str = "newline"; +const NULL: &'static str = "null"; +const RETURN: &'static str = "return"; +const SPACE: &'static str = "space"; +const TAB: &'static str = "tab"; + +pub fn set() -> HashSet<&'static str> { + let mut set: HashSet<&'static str> = HashSet::new(); + set.insert(ALARM); + set.insert(BACKSPACE); + set.insert(DELETE); + set.insert(ESCAPE); + set.insert(NEWLINE); + set.insert(NULL); + set.insert(RETURN); + set.insert(SPACE); + set.insert(TAB); + set +} + +pub fn char_named_by(named: &str) -> Char { + Char(match named { + ALARM => '\x07', + BACKSPACE => '\x08', + DELETE => '\x7F', + ESCAPE => '\x1B', + NEWLINE => '\n', + NULL => '\0', + RETURN => '\r', + SPACE => ' ', + TAB => '\t', + _ => panic!("char_named_by called with invalid named char string") + }) +} diff --git a/sibil/src/lexer/number.rs b/lexer/src/number.rs similarity index 78% rename from sibil/src/lexer/number.rs rename to lexer/src/number.rs index 5c2c8ec..91c6833 100644 --- a/sibil/src/lexer/number.rs +++ b/lexer/src/number.rs @@ -2,7 +2,7 @@ * Eryn Wells */ -use types::Number; +use sibiltypes::Number; #[derive(Debug)] pub enum Radix { Bin, Oct, Dec, Hex } @@ -67,9 +67,20 @@ impl NumberBuilder { pub fn resolve(&self) -> Number { // TODO: Convert fields to Number type. - let value = if self.point > 0 { self.value / 10u32.pow(self.point) as f64 } else { self.value }; - let value = if self.sign == Sign::Neg { value * -1.0 } else { value }; - Number::from_float(value) + let value = if self.point > 0 { + self.value / 10u32.pow(self.point) as f64 + } + else { + self.value + }; + let value = if self.sign == Sign::Neg { + value * -1.0 + } + else { + value + }; + // TODO: Use an integer if we can. + Number::from_float(value, self.exact == Exactness::Exact) } pub fn radix_value(&self) -> u32 { @@ -133,42 +144,43 @@ impl Exactness { #[cfg(test)] mod tests { + use sibiltypes::Number; use super::*; #[test] fn builds_integers() { let mut b = NumberBuilder::new(); b.extend_value('3'); - assert_eq!(b.resolve().value, 3.0); + assert_eq!(b.resolve(), Number::from_int(3, true)); b.extend_value('4'); - assert_eq!(b.resolve().value, 34.0); + assert_eq!(b.resolve(), Number::from_int(34, true)); } #[test] fn builds_negative_integers() { let num = NumberBuilder::new().sign(Sign::Neg).extend_value('3').resolve(); - assert_eq!(num.value, -3.0); + assert_eq!(num, Number::from_int(-3, true)); } #[test] fn builds_pointy_numbers() { let mut b = NumberBuilder::new(); b.extend_value('5'); - assert_eq!(b.resolve().value, 5.0); + assert_eq!(b.resolve(), Number::from_int(5, true)); b.extend_decimal_value('3'); - assert_eq!(b.resolve().value, 5.3); + assert_eq!(b.resolve(), Number::from_float(5.3, false)); b.extend_decimal_value('4'); - assert_eq!(b.resolve().value, 5.34); + assert_eq!(b.resolve(), Number::from_float(5.34, false)); } #[test] fn builds_hex() { let mut b = NumberBuilder::new(); b.radix(Radix::Hex).extend_value('4'); - assert_eq!(b.resolve().value, 0x4 as f64); + assert_eq!(b.resolve(), Number::from_int(0x4, true)); b.extend_value('A'); - assert_eq!(b.resolve().value, 0x4A as f64); + assert_eq!(b.resolve(), Number::from_int(0x4A, true)); b.extend_value('6'); - assert_eq!(b.resolve().value, 0x4A6 as f64); + assert_eq!(b.resolve(), Number::from_int(0x4A6, true)); } } diff --git a/sibil/src/lexer/str.rs b/lexer/src/str.rs similarity index 100% rename from sibil/src/lexer/str.rs rename to lexer/src/str.rs diff --git a/sibil/src/lexer/token.rs b/lexer/src/token.rs similarity index 84% rename from sibil/src/lexer/token.rs rename to lexer/src/token.rs index e96ad39..913fe65 100644 --- a/sibil/src/lexer/token.rs +++ b/lexer/src/token.rs @@ -2,7 +2,7 @@ * Eryn Wells */ -use types::{Bool, Char, Number}; +use sibiltypes::{Bool, Char, Number}; #[derive(Debug, PartialEq)] pub enum Token { @@ -19,8 +19,8 @@ pub enum Token { String(String), } -/// A Lex is a Token extracted from a specific position in an input. It contains useful information about the token's -/// place. +/// A Lex is a Token extracted from a specific position in an input stream. It +/// contains useful information about the token's place. #[derive(Debug)] pub struct Lex { pub token: Token,