sibil/src/lexer/mod.rs

550 lines
16 KiB
Rust
Raw Normal View History

2016-12-24 09:05:10 -07:00
/* lexer.rs
* Eryn Wells <eryn@erynwells.me>
*/
pub mod token;
mod char;
mod charset;
mod number;
mod str;
2016-12-23 17:45:37 -07:00
use self::char::Lexable;
2016-12-27 10:08:44 -07:00
use self::number::Exactness;
use self::number::NumberBuilder;
use self::number::Radix;
2016-12-27 09:51:24 -07:00
use self::number::Sign;
use self::str::CharAt;
use self::str::RelativeIndexable;
use self::token::Lex;
use self::token::Token;
2016-12-27 10:47:18 -07:00
type StateResult = Result<Option<Token>, String>;
2016-12-27 12:10:15 -07:00
/// Lets the lexer's driver loop ask a state-handler result whether it carries a finished token.
trait HasResult {
    /// Returns `true` when a token is available, `false` otherwise.
    fn has_token(&self) -> bool;
}
/// States of the lexer's state machine. The lexer dispatches on the current state for every
/// character it examines.
#[derive(Debug)]
enum State {
    /// Inside a comment; ends at a newline or at end of input.
    Comment,
    /// Between tokens; the next character decides which state to enter.
    Initial,
    /// Inside an identifier.
    Identifier,
    /// A `.` has been seen; it is either a lone dot token or the start of a decimal number.
    Dot,
    /// A `#` has been seen; a boolean, vector paren, or number prefix follows.
    Hash,
    /// Reading the digits of a number before any decimal point.
    Number,
    /// An exactness prefix has just been read.
    NumberExactness,
    /// Reading the digits after a decimal point.
    NumberDecimal,
    /// A radix prefix has just been read.
    NumberRadix,
    /// A sign has been read after a number prefix; digits or a decimal point must follow.
    NumberSign,
    /// A sign was the first character of the token; it is either a number or an identifier.
    Sign,
    /// Inside a string literal.
    String,
}
2016-12-20 17:52:29 -08:00
/// A lexer over an owned copy of the source text. Tokens are produced through the `Iterator`
/// implementation.
pub struct Lexer {
    /// The text being lexed.
    input: String,
    /// Byte index into `input` where the token currently being scanned starts.
    begin: usize,
    /// Byte index into `input` of the character currently being examined.
    forward: usize,
    /// Current line number; starts at 1 and is bumped for every newline the lexer handles.
    line: usize,
    /// Position within the current line; starts at 1 and follows the forward pointer.
    line_offset: usize,
    /// Current state of the state machine.
    state: State,
    /// Accumulates sign, radix, exactness, and digits while a number is being scanned.
    number_builder: NumberBuilder,
}
impl Lexer {
    /// Creates a lexer over a copy of `input`, positioned at the start of the text
    /// (line 1, offset 1) and in the `Initial` state.
    pub fn new(input: &str) -> Lexer {
        Lexer {
            input: input.to_string(),
            begin: 0,
            forward: 0,
            line: 1,
            line_offset: 1,
            state: State::Initial,
            number_builder: NumberBuilder::new(),
        }
    }
}
impl Lexer {
fn begin_lexing(&mut self) {
self.forward = self.begin;
self.state = State::Initial;
}
/// Advance the forward pointer to the next character.
fn advance(&mut self) {
2016-12-23 17:46:28 -07:00
self.forward = self.input.index_after(self.forward);
self.line_offset += 1;
2016-12-23 17:46:28 -07:00
println!("> forward={}", self.forward);
}
/// Retract the forward pointer to the previous character.
fn retract(&mut self) {
2016-12-23 17:46:28 -07:00
self.forward = self.input.index_before(self.forward);
self.line_offset -= 1;
2016-12-23 17:46:28 -07:00
println!("< forward={}", self.forward);
}
2016-12-23 17:53:28 -07:00
/// Advance the begin pointer to prepare for the next iteration.
2016-12-23 17:46:28 -07:00
fn advance_begin(&mut self) {
self.begin = self.input.index_after(self.forward);
self.forward = self.begin;
println!("> begin={}, forward={}", self.begin, self.forward);
2016-12-23 17:46:28 -07:00
}
/// Update lexer state when it encounters a newline.
2016-12-25 13:50:34 -07:00
fn handle_newline(&mut self) {
self.line += 1;
self.line_offset = 1;
2016-12-25 13:50:34 -07:00
}
2016-12-24 09:05:10 -07:00
/// Get the substring between the two input indexes. This is the value to give to a new Token instance.
2016-12-23 17:46:28 -07:00
fn value(&self) -> String {
self.input[self.begin .. self.forward].to_string()
}
2016-12-27 10:57:16 -07:00
fn error_string(&self, message: String) -> String {
2016-12-27 10:57:16 -07:00
format!("{}:{}: {}", self.line, self.line_offset, message)
}
2016-12-27 11:18:15 -07:00
fn token_result(&self, token: Token) -> StateResult {
Ok(Some(token))
}
2016-12-27 11:23:56 -07:00
fn generic_error(&self, c: char) -> StateResult {
Err(self.error_string(format!("Invalid token character: {}", c)))
2016-12-27 11:23:56 -07:00
}
}
impl Lexer {
2016-12-25 20:49:46 -07:00
// TODO: Use std::result::Result for these state_* methods.
// https://doc.rust-lang.org/1.14.0/core/result/enum.Result.html
2016-12-23 17:53:28 -07:00
/// Handle self.state == State::Initial
fn state_initial(&mut self, c: char) -> StateResult {
2016-12-23 17:46:28 -07:00
if c.is_left_paren() {
return self.token_result(Token::LeftParen(c.to_string()));
2016-12-23 17:46:28 -07:00
}
else if c.is_right_paren() {
return self.token_result(Token::RightParen(c.to_string()));
2016-12-23 17:46:28 -07:00
}
2016-12-25 20:54:47 -07:00
else if c.is_dot() {
2016-12-26 11:51:03 -07:00
self.state = State::Dot;
self.advance();
2016-12-25 20:54:47 -07:00
}
2016-12-25 15:03:18 -07:00
else if c.is_hash() {
self.state = State::Hash;
self.advance();
}
else if c.is_string_quote() {
self.state = State::String;
self.advance();
}
2016-12-24 10:29:10 -07:00
2016-12-27 09:51:24 -07:00
else if let Some(sign) = Sign::from_char(c) {
2016-12-27 10:08:44 -07:00
self.number_builder = NumberBuilder::new();
self.number_builder.sign(sign);
self.state = State::Sign;
self.advance();
2016-12-24 09:07:38 -07:00
}
2016-12-23 17:46:28 -07:00
else if c.is_identifier_initial() {
self.state = State::Identifier;
self.advance();
}
2016-12-24 10:29:10 -07:00
else if c.is_digit(10) {
self.number_builder = NumberBuilder::new();
self.number_builder.extend_value(c);
self.state = State::Number;
self.advance();
}
2016-12-24 09:17:08 -07:00
else if c.is_whitespace() {
if c.is_newline() {
2016-12-25 13:50:34 -07:00
self.handle_newline();
2016-12-24 09:17:08 -07:00
}
self.advance_begin();
2016-12-24 09:17:08 -07:00
}
else if c.is_comment_initial() {
self.state = State::Comment;
self.advance();
}
2016-12-26 08:52:15 -07:00
else {
2016-12-27 11:23:56 -07:00
return self.generic_error(c);
2016-12-26 08:52:15 -07:00
}
Ok(None)
}
2016-12-23 17:53:28 -07:00
/// Handle self.state == State::Identifier
2016-12-27 11:23:56 -07:00
fn state_identifier(&mut self, c: char) -> StateResult {
2016-12-23 17:46:28 -07:00
if c.is_identifier_subsequent() {
2016-12-27 10:08:44 -07:00
// Stay in Identifier state.
2016-12-23 17:46:28 -07:00
self.advance();
}
2016-12-26 09:15:43 -07:00
else if c.is_identifier_delimiter() {
2016-12-27 12:10:15 -07:00
let value = self.value();
2016-12-23 17:46:28 -07:00
self.retract();
2016-12-27 12:10:15 -07:00
return self.token_result(Token::Identifier(value));
2016-12-23 17:46:28 -07:00
}
2016-12-26 08:52:15 -07:00
else {
2016-12-27 11:23:56 -07:00
return self.generic_error(c);
2016-12-26 08:52:15 -07:00
}
2016-12-27 11:23:56 -07:00
Ok(None)
2016-12-20 17:52:29 -08:00
}
2016-12-24 10:29:10 -07:00
2016-12-27 12:10:15 -07:00
/// Handle self.state == State::Dot
2016-12-27 11:36:43 -07:00
fn state_dot(&mut self, c: char) -> StateResult {
2016-12-26 11:51:03 -07:00
if c.is_identifier_delimiter() {
self.retract();
2016-12-27 11:36:43 -07:00
return self.token_result(Token::Dot);
2016-12-26 11:51:03 -07:00
}
else if c.is_digit(10) {
self.number_builder = NumberBuilder::new();
self.number_builder.extend_decimal_value(c);
self.state = State::NumberDecimal;
self.advance();
}
2016-12-26 11:51:03 -07:00
else {
2016-12-27 12:10:15 -07:00
return self.generic_error(c);
2016-12-26 11:51:03 -07:00
}
2016-12-27 11:36:43 -07:00
Ok(None)
2016-12-26 11:51:03 -07:00
}
2016-12-27 12:10:15 -07:00
/// Handle self.state == State::Hash
2016-12-27 11:52:25 -07:00
fn state_hash(&mut self, c: char) -> StateResult {
if c.is_boolean_true() || c.is_boolean_false() {
2016-12-27 10:08:44 -07:00
self.advance();
2016-12-27 11:52:25 -07:00
return self.token_result(Token::Boolean(c.is_boolean_true()));
2016-12-24 10:29:10 -07:00
}
2016-12-25 20:59:21 -07:00
else if c.is_left_paren() {
2016-12-27 10:08:44 -07:00
self.advance();
2016-12-27 11:52:25 -07:00
return self.token_result(Token::LeftVectorParen);
2016-12-27 10:08:44 -07:00
}
else if let Some(radix) = Radix::from_char(c) {
self.number_builder.radix(radix);
self.state = State::NumberRadix;
self.advance();
}
else if let Some(exactness) = Exactness::from_char(c) {
self.number_builder.exact(exactness);
self.state = State::NumberExactness;
self.advance();
2016-12-25 20:59:21 -07:00
}
2016-12-26 08:52:15 -07:00
else {
2016-12-27 12:10:15 -07:00
return self.generic_error(c);
2016-12-26 08:52:15 -07:00
}
2016-12-27 11:52:25 -07:00
Ok(None)
2016-12-24 10:29:10 -07:00
}
2016-12-25 13:50:34 -07:00
2016-12-27 12:10:15 -07:00
/// Handle self.state == State::Number
fn state_number(&mut self, c: char) -> StateResult {
if c.is_digit(self.number_builder.radix_value()) {
self.number_builder.extend_value(c);
self.advance();
}
else if c.is_dot() {
self.state = State::NumberDecimal;
self.advance();
}
else if c.is_identifier_delimiter() {
self.retract();
2016-12-27 12:10:15 -07:00
return self.token_result(Token::Number(self.number_builder.resolve()));
}
else {
2016-12-27 12:10:15 -07:00
return self.generic_error(c);
}
2016-12-27 12:10:15 -07:00
Ok(None)
}
2016-12-27 12:10:15 -07:00
fn state_number_exactness(&mut self, c: char) -> StateResult {
2016-12-27 10:08:44 -07:00
if c.is_hash() {
self.state = State::Hash;
self.advance();
}
else if let Some(sign) = Sign::from_char(c) {
self.number_builder.sign(sign);
self.state = State::NumberSign;
self.advance();
}
2016-12-27 10:08:44 -07:00
else if c.is_digit(self.number_builder.radix_value()) {
self.number_builder.extend_value(c);
self.state = State::Number;
self.advance();
}
else {
2016-12-27 12:10:15 -07:00
return self.generic_error(c);
2016-12-27 10:08:44 -07:00
}
2016-12-27 12:10:15 -07:00
Ok(None)
2016-12-27 10:08:44 -07:00
}
2016-12-27 12:10:15 -07:00
fn state_number_decimal(&mut self, c: char) -> StateResult {
if c.is_digit(Radix::Dec.value()) {
self.number_builder.extend_decimal_value(c);
self.advance();
}
else if c.is_identifier_delimiter() {
self.retract();
2016-12-27 12:10:15 -07:00
return self.token_result(Token::Number(self.number_builder.resolve()));
}
else {
2016-12-27 12:10:15 -07:00
return self.generic_error(c);
}
2016-12-27 12:10:15 -07:00
Ok(None)
}
2016-12-27 12:10:15 -07:00
fn state_number_radix(&mut self, c: char) -> StateResult {
2016-12-27 10:08:44 -07:00
if c.is_digit(self.number_builder.radix_value()) {
self.number_builder.extend_value(c);
self.state = State::Number;
self.advance();
}
else if c.is_dot() {
self.state = State::NumberDecimal;
self.advance();
}
else if c.is_hash() {
self.state = State::Hash;
self.advance();
}
else if let Some(sign) = Sign::from_char(c) {
self.number_builder.sign(sign);
self.state = State::NumberSign;
self.advance();
}
else {
2016-12-27 12:10:15 -07:00
return self.generic_error(c);
2016-12-27 10:08:44 -07:00
}
2016-12-27 12:10:15 -07:00
Ok(None)
2016-12-27 10:08:44 -07:00
}
2016-12-27 12:10:15 -07:00
fn state_number_sign(&mut self, c: char) -> StateResult {
2016-12-27 10:08:44 -07:00
if c.is_digit(self.number_builder.radix_value()) {
self.number_builder.extend_value(c);
self.state = State::Number;
self.advance();
}
else if c.is_dot() {
self.state = State::NumberDecimal;
self.advance();
}
else {
2016-12-27 12:10:15 -07:00
return self.generic_error(c);
2016-12-27 10:08:44 -07:00
}
2016-12-27 12:10:15 -07:00
Ok(None)
2016-12-27 10:08:44 -07:00
}
2016-12-27 12:10:15 -07:00
fn state_sign(&mut self, c: char) -> StateResult {
2016-12-27 10:08:44 -07:00
if c.is_digit(Radix::Dec.value()) {
self.number_builder.extend_value(c);
self.state = State::Number;
self.advance();
}
else if c.is_identifier_delimiter() {
2016-12-27 12:10:15 -07:00
let value = self.value();
2016-12-27 10:08:44 -07:00
self.retract();
2016-12-27 12:10:15 -07:00
return self.token_result(Token::Identifier(value));
2016-12-27 10:08:44 -07:00
}
else {
2016-12-27 12:10:15 -07:00
return self.generic_error(c);
2016-12-27 10:08:44 -07:00
}
2016-12-27 12:10:15 -07:00
Ok(None)
2016-12-27 10:08:44 -07:00
}
2016-12-27 12:10:15 -07:00
fn state_string(&mut self, c: char) -> StateResult {
2016-12-25 15:03:18 -07:00
self.advance();
if c.is_string_quote() {
2016-12-27 12:10:15 -07:00
return self.token_result(Token::String(self.value()));
2016-12-25 15:03:18 -07:00
}
2016-12-27 12:10:15 -07:00
Ok(None)
2016-12-25 15:03:18 -07:00
}
2016-12-27 12:10:15 -07:00
fn state_comment(&mut self, c: char) -> StateResult {
2016-12-25 13:50:34 -07:00
if c.is_newline() {
self.handle_newline();
2016-12-27 12:10:15 -07:00
return self.token_result(Token::Comment(self.value()));
2016-12-25 13:50:34 -07:00
}
else if c.is_eof() {
2016-12-27 12:10:15 -07:00
return self.token_result(Token::Comment(self.value()));
}
2016-12-26 09:15:43 -07:00
self.advance();
2016-12-27 12:10:15 -07:00
Ok(None)
2016-12-25 13:50:34 -07:00
}
2016-12-20 17:38:44 -08:00
}
impl Iterator for Lexer {
type Item = Lex;
2016-12-20 17:38:44 -08:00
fn next(&mut self) -> Option<Lex> {
self.begin_lexing();
2016-12-23 17:46:28 -07:00
if self.begin == self.input.len() {
return None;
}
let mut token: Option<Token> = None;
2016-12-23 17:46:28 -07:00
println!("Lexing '{}'", &self.input[self.begin ..]);
while token.is_none() {
let c = match self.input.char_at(self.forward) {
Some(c) => c,
None => '\0',
};
println!("{:?}! c='{}'", self.state, c);
let previous_forward = self.forward;
2016-12-27 12:10:15 -07:00
let result = match self.state {
State::Initial => self.state_initial(c),
2016-12-27 11:23:56 -07:00
State::Identifier => self.state_identifier(c),
2016-12-27 11:36:43 -07:00
State::Dot => self.state_dot(c),
2016-12-27 11:52:25 -07:00
State::Hash => self.state_hash(c),
2016-12-27 12:10:15 -07:00
State::Number => self.state_number(c),
State::NumberExactness => self.state_number_exactness(c),
State::NumberDecimal => self.state_number_decimal(c),
State::NumberRadix => self.state_number_radix(c),
State::NumberSign => self.state_number_sign(c),
State::Sign => self.state_sign(c),
State::String => self.state_string(c),
State::Comment => self.state_comment(c),
};
assert!(result.has_token() || self.forward != previous_forward, "No lexing progress made!");
if result.has_token() {
token = result.ok().unwrap();
break;
2016-12-23 17:46:28 -07:00
}
}
2016-12-23 17:46:28 -07:00
self.advance_begin();
2016-12-25 14:41:06 -07:00
match token {
Some(t) => Some(Lex::new(t, self.line, self.line_offset)),
2016-12-25 14:41:06 -07:00
None => None,
}
2016-12-20 17:38:44 -08:00
}
}
2016-12-25 14:20:25 -07:00
2016-12-27 12:10:15 -07:00
impl HasResult for StateResult {
    /// Only `Ok(Some(_))` counts as having a token; `Ok(None)` and every `Err` do not.
    fn has_token(&self) -> bool {
        if let Ok(Some(_)) = *self {
            true
        } else {
            false
        }
    }
}
2016-12-25 14:20:25 -07:00
//
// UNIT TESTING
2016-12-25 14:20:25 -07:00
//
#[cfg(test)]
mod tests {
    use std::iter::Iterator;
    use super::*;
    use super::number::*;
    use super::token::*;

    #[test]
    fn finds_parens() {
        check_single_token("(", Token::LeftParen(String::from("(")));
        check_single_token(")", Token::RightParen(String::from(")")));
        check_single_token("#(", Token::LeftVectorParen);
    }

    #[test]
    fn finds_dots() {
        check_single_token(".", Token::Dot);

        let mut lexer = Lexer::new("abc . abc");
        assert_next_token(&mut lexer, &Token::Identifier(String::from("abc")));
        assert_next_token(&mut lexer, &Token::Dot);
        assert_next_token(&mut lexer, &Token::Identifier(String::from("abc")));
    }

    #[test]
    fn finds_identifiers() {
        let tok = |s: &str| { check_single_token(s, Token::Identifier(String::from(s))); };
        tok("abc");
        tok("number?");
        tok("+");
        tok("-");
    }

    #[test]
    fn finds_booleans() {
        check_single_token("#t", Token::Boolean(true));
        check_single_token("#f", Token::Boolean(false));
    }

    #[test]
    fn finds_comments() {
        let s = "; a comment";
        check_single_token(s, Token::Comment(String::from(s)));
    }

    #[test]
    fn finds_strings() {
        // String token values include the surrounding quotes.
        check_single_token("\"\"", Token::String(String::from("\"\"")));
        check_single_token("\"abc\"", Token::String(String::from("\"abc\"")));
    }

    #[test]
    fn finds_numbers() {
        check_single_token(".34", Token::Number(Number::new(0.34)));
        check_single_token("0.34", Token::Number(Number::new(0.34)));
    }

    #[test]
    fn finds_negative_numbers() {
        check_single_token("-3", Token::Number(Number::from_int(-3)));
        check_single_token("-0", Token::Number(Number::from_int(-0)));
        check_single_token("-0.56", Token::Number(Number::new(-0.56)));
        check_single_token("-3.14159", Token::Number(Number::new(-3.14159)));
    }

    #[test]
    fn finds_bin_numbers() {
        check_single_token("#b0", Token::Number(Number::from_int(0b0)));
        check_single_token("#b01011", Token::Number(Number::from_int(0b01011)));
    }

    #[test]
    fn finds_dec_numbers() {
        check_single_token("34", Token::Number(Number::new(34.0)));
        check_single_token("#d89", Token::Number(Number::from_int(89)));
    }

    #[test]
    fn finds_oct_numbers() {
        check_single_token("#o45", Token::Number(Number::from_int(0o45)));
    }

    #[test]
    fn finds_exact_numbers() {
        check_single_token("#e45", Token::Number(Number::from_int(45)));
        check_single_token("#e-45", Token::Number(Number::from_int(-45)));
    }

    #[test]
    fn finds_hex_numbers() {
        check_single_token("#h4A65", Token::Number(Number::from_int(0x4A65)));
    }

    #[test]
    fn lexes_simple_sexpression() {
        let mut lexer = Lexer::new("(+ 3.4 6.8)");
        assert_next_token(&mut lexer, &Token::LeftParen(String::from("(")));
        assert_next_token(&mut lexer, &Token::Identifier(String::from("+")));
        assert_next_token(&mut lexer, &Token::Number(Number::new(3.4)));
        assert_next_token(&mut lexer, &Token::Number(Number::new(6.8)));
        assert_next_token(&mut lexer, &Token::RightParen(String::from(")")));
    }

    /// Lexes `input` and asserts that its first token equals `expected`.
    fn check_single_token(input: &str, expected: Token) {
        let mut lexer = Lexer::new(input);
        assert_next_token(&mut lexer, &expected);
    }

    /// Pulls the next token from `lexer` and asserts that it equals `expected`.
    /// Panics if the lexer has no more tokens.
    fn assert_next_token(lexer: &mut Lexer, expected: &Token) {
        let lex = lexer.next().unwrap();
        assert_eq!(lex.token, *expected);
    }
}