2016-12-24 09:05:10 -07:00
|
|
|
/* lexer.rs
|
|
|
|
* Eryn Wells <eryn@erynwells.me>
|
|
|
|
*/
|
2016-12-19 22:23:27 -08:00
|
|
|
|
2016-12-24 08:47:02 -07:00
|
|
|
pub mod token;
|
2016-12-24 08:57:37 -07:00
|
|
|
mod char;
|
|
|
|
mod charset;
|
2016-12-26 18:23:58 -07:00
|
|
|
mod number;
|
2016-12-24 08:57:37 -07:00
|
|
|
mod str;
|
2016-12-23 17:45:37 -07:00
|
|
|
|
2016-12-24 08:57:37 -07:00
|
|
|
use self::char::Lexable;
|
2016-12-27 10:08:44 -07:00
|
|
|
use self::number::Exactness;
|
2016-12-26 18:23:58 -07:00
|
|
|
use self::number::NumberBuilder;
|
2016-12-27 08:18:22 -07:00
|
|
|
use self::number::Radix;
|
2016-12-27 09:51:24 -07:00
|
|
|
use self::number::Sign;
|
2016-12-24 08:57:37 -07:00
|
|
|
use self::str::CharAt;
|
|
|
|
use self::str::RelativeIndexable;
|
2016-12-25 12:24:04 -07:00
|
|
|
use self::token::Lex;
|
2016-12-24 08:57:37 -07:00
|
|
|
use self::token::Token;
|
2016-12-19 22:23:27 -08:00
|
|
|
|
2016-12-27 10:47:18 -07:00
|
|
|
/// Outcome of running one state handler on one character: `Ok(Some(token))` when a token was
/// completed, `Ok(None)` when no token is complete yet, and `Err(message)` when the handler
/// rejected the character.
type StateResult = Result<Option<Token>, String>;
|
|
|
|
|
2016-12-27 12:10:15 -07:00
|
|
|
/// Query interface for values that may or may not carry a finished token.
trait HasResult {
    /// Returns `true` when the value holds a token.
    fn has_token(&self) -> bool;
}
|
|
|
|
|
2016-12-24 14:03:37 -07:00
|
|
|
#[derive(Debug)]
|
2016-12-22 09:25:31 -08:00
|
|
|
enum State {
|
2016-12-28 12:16:28 -07:00
|
|
|
Character,
|
|
|
|
CharacterNewline(NewlineState),
|
|
|
|
CharacterSpace(SpaceState),
|
2016-12-26 18:23:58 -07:00
|
|
|
Comment,
|
2016-12-22 09:25:31 -08:00
|
|
|
Initial,
|
|
|
|
Identifier,
|
2016-12-26 11:51:03 -07:00
|
|
|
Dot,
|
2016-12-24 10:29:10 -07:00
|
|
|
Hash,
|
2016-12-26 18:23:58 -07:00
|
|
|
Number,
|
2016-12-27 10:08:44 -07:00
|
|
|
NumberExactness,
|
2016-12-26 18:23:58 -07:00
|
|
|
NumberDecimal,
|
2016-12-27 10:08:44 -07:00
|
|
|
NumberRadix,
|
|
|
|
NumberSign,
|
|
|
|
Sign,
|
2016-12-25 15:03:18 -07:00
|
|
|
String,
|
2016-12-28 08:35:02 -07:00
|
|
|
StringEscape,
|
2016-12-22 09:25:31 -08:00
|
|
|
}
|
|
|
|
|
2016-12-28 12:16:28 -07:00
|
|
|
/// Progress through the spelled-out character name `newline`; each variant is named after the
/// prefix that has been matched so far.
#[derive(Clone, PartialEq, Debug)]
enum NewlineState {
    N,
    Ne,
    New,
    Newl,
    Newli,
    Newlin,
    Newline,
}
|
|
|
|
/// Progress through the spelled-out character name `space`; each variant is named after the
/// prefix that has been matched so far.
#[derive(Clone, PartialEq, Debug)]
enum SpaceState {
    S,
    Sp,
    Spa,
    Spac,
    Space,
}
|
|
|
|
|
2016-12-20 17:52:29 -08:00
|
|
|
pub struct Lexer {
|
|
|
|
input: String,
|
2016-12-22 09:25:31 -08:00
|
|
|
begin: usize,
|
|
|
|
forward: usize,
|
2016-12-27 10:53:38 -07:00
|
|
|
line: usize,
|
|
|
|
line_offset: usize,
|
2016-12-22 09:25:31 -08:00
|
|
|
state: State,
|
2016-12-26 18:23:58 -07:00
|
|
|
number_builder: NumberBuilder,
|
2016-12-28 18:01:36 -05:00
|
|
|
string_value: String,
|
2016-12-20 17:38:44 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
impl Lexer {
|
2016-12-25 14:33:58 -07:00
|
|
|
pub fn new(input: &str) -> Lexer {
|
2016-12-22 09:25:31 -08:00
|
|
|
Lexer {
|
2016-12-25 14:33:58 -07:00
|
|
|
input: String::from(input),
|
2016-12-22 09:25:31 -08:00
|
|
|
begin: 0,
|
|
|
|
forward: 0,
|
2016-12-24 09:17:08 -07:00
|
|
|
line: 1,
|
2016-12-27 10:53:38 -07:00
|
|
|
line_offset: 1,
|
2016-12-22 09:25:31 -08:00
|
|
|
state: State::Initial,
|
2016-12-26 18:23:58 -07:00
|
|
|
number_builder: NumberBuilder::new(),
|
2016-12-28 18:01:36 -05:00
|
|
|
string_value: String::new(),
|
2016-12-22 09:25:31 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Lexer {
|
|
|
|
fn begin_lexing(&mut self) {
|
|
|
|
self.forward = self.begin;
|
|
|
|
self.state = State::Initial;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Advance the forward pointer to the next character.
|
|
|
|
fn advance(&mut self) {
|
2016-12-23 17:46:28 -07:00
|
|
|
self.forward = self.input.index_after(self.forward);
|
2016-12-27 10:53:38 -07:00
|
|
|
self.line_offset += 1;
|
2016-12-23 17:46:28 -07:00
|
|
|
println!("> forward={}", self.forward);
|
2016-12-22 09:25:31 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Retract the forward pointer to the previous character.
|
|
|
|
fn retract(&mut self) {
|
2016-12-23 17:46:28 -07:00
|
|
|
self.forward = self.input.index_before(self.forward);
|
2016-12-27 10:53:38 -07:00
|
|
|
self.line_offset -= 1;
|
2016-12-23 17:46:28 -07:00
|
|
|
println!("< forward={}", self.forward);
|
|
|
|
}
|
|
|
|
|
2016-12-23 17:53:28 -07:00
|
|
|
/// Advance the begin pointer to prepare for the next iteration.
|
2016-12-23 17:46:28 -07:00
|
|
|
fn advance_begin(&mut self) {
|
|
|
|
self.begin = self.input.index_after(self.forward);
|
2016-12-24 09:59:35 -07:00
|
|
|
self.forward = self.begin;
|
|
|
|
println!("> begin={}, forward={}", self.begin, self.forward);
|
2016-12-23 17:46:28 -07:00
|
|
|
}
|
|
|
|
|
2016-12-27 10:50:43 -07:00
|
|
|
/// Update lexer state when it encounters a newline.
|
2016-12-25 13:50:34 -07:00
|
|
|
fn handle_newline(&mut self) {
|
|
|
|
self.line += 1;
|
2016-12-27 10:53:38 -07:00
|
|
|
self.line_offset = 1;
|
2016-12-25 13:50:34 -07:00
|
|
|
}
|
|
|
|
|
2016-12-24 09:05:10 -07:00
|
|
|
/// Get the substring between the two input indexes. This is the value to give to a new Token instance.
|
2016-12-23 17:46:28 -07:00
|
|
|
fn value(&self) -> String {
|
|
|
|
self.input[self.begin .. self.forward].to_string()
|
2016-12-22 09:25:31 -08:00
|
|
|
}
|
2016-12-27 10:57:16 -07:00
|
|
|
|
2016-12-27 11:18:06 -07:00
|
|
|
fn error_string(&self, message: String) -> String {
|
2016-12-27 10:57:16 -07:00
|
|
|
format!("{}:{}: {}", self.line, self.line_offset, message)
|
|
|
|
}
|
2016-12-27 11:18:15 -07:00
|
|
|
|
|
|
|
fn token_result(&self, token: Token) -> StateResult {
|
|
|
|
Ok(Some(token))
|
|
|
|
}
|
2016-12-27 11:23:56 -07:00
|
|
|
|
2016-12-27 11:30:08 -07:00
|
|
|
fn generic_error(&self, c: char) -> StateResult {
|
|
|
|
Err(self.error_string(format!("Invalid token character: {}", c)))
|
2016-12-27 11:23:56 -07:00
|
|
|
}
|
2016-12-22 09:25:31 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
impl Lexer {
|
2016-12-23 17:53:28 -07:00
|
|
|
/// Handle self.state == State::Initial
|
2016-12-27 11:18:30 -07:00
|
|
|
fn state_initial(&mut self, c: char) -> StateResult {
|
2016-12-23 17:46:28 -07:00
|
|
|
if c.is_left_paren() {
|
2016-12-27 11:18:30 -07:00
|
|
|
return self.token_result(Token::LeftParen(c.to_string()));
|
2016-12-23 17:46:28 -07:00
|
|
|
}
|
|
|
|
else if c.is_right_paren() {
|
2016-12-27 11:18:30 -07:00
|
|
|
return self.token_result(Token::RightParen(c.to_string()));
|
2016-12-23 17:46:28 -07:00
|
|
|
}
|
2016-12-25 20:54:47 -07:00
|
|
|
else if c.is_dot() {
|
2016-12-26 11:51:03 -07:00
|
|
|
self.state = State::Dot;
|
|
|
|
self.advance();
|
2016-12-25 20:54:47 -07:00
|
|
|
}
|
2016-12-25 15:03:18 -07:00
|
|
|
else if c.is_hash() {
|
|
|
|
self.state = State::Hash;
|
|
|
|
self.advance();
|
|
|
|
}
|
2016-12-28 08:40:31 -07:00
|
|
|
else if c.is_quote() {
|
|
|
|
return self.token_result(Token::Quote);
|
|
|
|
}
|
2016-12-25 15:03:18 -07:00
|
|
|
else if c.is_string_quote() {
|
2016-12-28 18:01:36 -05:00
|
|
|
self.string_value = String::from("");
|
2016-12-25 15:03:18 -07:00
|
|
|
self.state = State::String;
|
|
|
|
self.advance();
|
|
|
|
}
|
2016-12-24 10:29:10 -07:00
|
|
|
|
2016-12-27 09:51:24 -07:00
|
|
|
else if let Some(sign) = Sign::from_char(c) {
|
2016-12-27 10:08:44 -07:00
|
|
|
self.number_builder = NumberBuilder::new();
|
|
|
|
self.number_builder.sign(sign);
|
|
|
|
self.state = State::Sign;
|
|
|
|
self.advance();
|
2016-12-24 09:07:38 -07:00
|
|
|
}
|
2016-12-23 17:46:28 -07:00
|
|
|
else if c.is_identifier_initial() {
|
|
|
|
self.state = State::Identifier;
|
|
|
|
self.advance();
|
|
|
|
}
|
2016-12-24 10:29:10 -07:00
|
|
|
|
2016-12-26 18:23:58 -07:00
|
|
|
else if c.is_digit(10) {
|
|
|
|
self.number_builder = NumberBuilder::new();
|
|
|
|
self.number_builder.extend_value(c);
|
|
|
|
self.state = State::Number;
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
|
2016-12-24 09:17:08 -07:00
|
|
|
else if c.is_whitespace() {
|
|
|
|
if c.is_newline() {
|
2016-12-25 13:50:34 -07:00
|
|
|
self.handle_newline();
|
2016-12-24 09:17:08 -07:00
|
|
|
}
|
2016-12-24 09:59:35 -07:00
|
|
|
self.advance_begin();
|
2016-12-24 09:17:08 -07:00
|
|
|
}
|
2016-12-25 14:20:16 -07:00
|
|
|
|
|
|
|
else if c.is_comment_initial() {
|
|
|
|
self.state = State::Comment;
|
|
|
|
self.advance();
|
|
|
|
}
|
2016-12-26 08:52:15 -07:00
|
|
|
|
|
|
|
else {
|
2016-12-27 11:23:56 -07:00
|
|
|
return self.generic_error(c);
|
2016-12-26 08:52:15 -07:00
|
|
|
}
|
2016-12-27 11:18:30 -07:00
|
|
|
|
|
|
|
Ok(None)
|
2016-12-22 09:25:31 -08:00
|
|
|
}
|
|
|
|
|
2016-12-23 17:53:28 -07:00
|
|
|
/// Handle self.state == State::Identifier
|
2016-12-27 11:23:56 -07:00
|
|
|
fn state_identifier(&mut self, c: char) -> StateResult {
|
2016-12-23 17:46:28 -07:00
|
|
|
if c.is_identifier_subsequent() {
|
2016-12-27 10:08:44 -07:00
|
|
|
// Stay in Identifier state.
|
2016-12-23 17:46:28 -07:00
|
|
|
self.advance();
|
|
|
|
}
|
2016-12-26 09:15:43 -07:00
|
|
|
else if c.is_identifier_delimiter() {
|
2016-12-27 12:10:15 -07:00
|
|
|
let value = self.value();
|
2016-12-23 17:46:28 -07:00
|
|
|
self.retract();
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.token_result(Token::Identifier(value));
|
2016-12-23 17:46:28 -07:00
|
|
|
}
|
2016-12-26 08:52:15 -07:00
|
|
|
else {
|
2016-12-27 11:23:56 -07:00
|
|
|
return self.generic_error(c);
|
2016-12-26 08:52:15 -07:00
|
|
|
}
|
2016-12-27 11:23:56 -07:00
|
|
|
Ok(None)
|
2016-12-20 17:52:29 -08:00
|
|
|
}
|
2016-12-24 10:29:10 -07:00
|
|
|
|
2016-12-28 12:16:28 -07:00
|
|
|
/// Handle self.state == State::Character
|
|
|
|
fn state_character(&mut self, c: char) -> StateResult {
|
|
|
|
self.advance();
|
|
|
|
match c {
|
|
|
|
'n' => self.state = State::CharacterNewline(NewlineState::N),
|
|
|
|
's' => self.state = State::CharacterSpace(SpaceState::S),
|
|
|
|
_ => return self.token_result(Token::Character(c)),
|
|
|
|
}
|
|
|
|
Ok(None)
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Handle self.state == State::CharacterNewline
|
|
|
|
fn state_character_newline(&mut self, c: char) -> StateResult {
|
|
|
|
let substate = match self.state {
|
|
|
|
State::CharacterNewline(ref substate) => Some(substate.clone()),
|
|
|
|
_ => None,
|
|
|
|
}.unwrap();
|
|
|
|
|
|
|
|
// Assume we'll advance...
|
|
|
|
self.advance();
|
|
|
|
if substate == NewlineState::N && (c.is_identifier_delimiter() || c.is_eof()) {
|
|
|
|
return self.token_result(Token::Character('n'));
|
|
|
|
}
|
|
|
|
if let Some(next) = substate.next(c) {
|
|
|
|
match next {
|
|
|
|
NewlineState::Newline => return self.token_result(Token::Character('\n')),
|
|
|
|
_ => self.state = State::CharacterNewline(next),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
// ... but retract if we failed.
|
|
|
|
self.retract();
|
|
|
|
return Err(self.error_string(format!("Invalid character while building #\\newline: '{}'", c)));
|
|
|
|
}
|
|
|
|
Ok(None)
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Handle self.state == State::CharacterNewline
|
|
|
|
fn state_character_space(&mut self, c: char) -> StateResult {
|
|
|
|
let substate = match self.state {
|
|
|
|
State::CharacterSpace(ref substate) => Some(substate.clone()),
|
|
|
|
_ => None,
|
|
|
|
}.unwrap();
|
|
|
|
|
|
|
|
// Assume we'll advance...
|
|
|
|
self.advance();
|
|
|
|
if substate == SpaceState::S && (c.is_identifier_delimiter() || c.is_eof()) {
|
|
|
|
return self.token_result(Token::Character('s'));
|
|
|
|
}
|
|
|
|
if let Some(next) = substate.next(c) {
|
|
|
|
match next {
|
|
|
|
SpaceState::Space => return self.token_result(Token::Character(' ')),
|
|
|
|
_ => self.state = State::CharacterSpace(next),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
// ... but retract if we failed.
|
|
|
|
self.retract();
|
|
|
|
return Err(self.error_string(format!("Invalid character while building #\\space: '{}'", c)));
|
|
|
|
}
|
|
|
|
Ok(None)
|
|
|
|
}
|
|
|
|
|
2016-12-27 12:10:15 -07:00
|
|
|
/// Handle self.state == State::Dot
|
2016-12-27 11:36:43 -07:00
|
|
|
fn state_dot(&mut self, c: char) -> StateResult {
|
2016-12-26 11:51:03 -07:00
|
|
|
if c.is_identifier_delimiter() {
|
|
|
|
self.retract();
|
2016-12-27 11:36:43 -07:00
|
|
|
return self.token_result(Token::Dot);
|
2016-12-26 11:51:03 -07:00
|
|
|
}
|
2016-12-26 18:23:58 -07:00
|
|
|
else if c.is_digit(10) {
|
|
|
|
self.number_builder = NumberBuilder::new();
|
|
|
|
self.number_builder.extend_decimal_value(c);
|
|
|
|
self.state = State::NumberDecimal;
|
|
|
|
self.advance();
|
|
|
|
}
|
2016-12-26 11:51:03 -07:00
|
|
|
else {
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.generic_error(c);
|
2016-12-26 11:51:03 -07:00
|
|
|
}
|
2016-12-27 11:36:43 -07:00
|
|
|
Ok(None)
|
2016-12-26 11:51:03 -07:00
|
|
|
}
|
|
|
|
|
2016-12-27 12:10:15 -07:00
|
|
|
/// Handle self.state == State::Hash
|
2016-12-27 11:52:25 -07:00
|
|
|
fn state_hash(&mut self, c: char) -> StateResult {
|
2016-12-25 12:24:04 -07:00
|
|
|
if c.is_boolean_true() || c.is_boolean_false() {
|
2016-12-27 10:08:44 -07:00
|
|
|
self.advance();
|
2016-12-27 11:52:25 -07:00
|
|
|
return self.token_result(Token::Boolean(c.is_boolean_true()));
|
2016-12-24 10:29:10 -07:00
|
|
|
}
|
2016-12-25 20:59:21 -07:00
|
|
|
else if c.is_left_paren() {
|
2016-12-27 10:08:44 -07:00
|
|
|
self.advance();
|
2016-12-27 11:52:25 -07:00
|
|
|
return self.token_result(Token::LeftVectorParen);
|
2016-12-27 10:08:44 -07:00
|
|
|
}
|
2016-12-28 12:16:28 -07:00
|
|
|
else if c.is_character_leader() {
|
|
|
|
self.state = State::Character;
|
|
|
|
self.advance();
|
|
|
|
}
|
2016-12-27 10:08:44 -07:00
|
|
|
else if let Some(radix) = Radix::from_char(c) {
|
|
|
|
self.number_builder.radix(radix);
|
|
|
|
self.state = State::NumberRadix;
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else if let Some(exactness) = Exactness::from_char(c) {
|
|
|
|
self.number_builder.exact(exactness);
|
|
|
|
self.state = State::NumberExactness;
|
|
|
|
self.advance();
|
2016-12-25 20:59:21 -07:00
|
|
|
}
|
2016-12-26 08:52:15 -07:00
|
|
|
else {
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.generic_error(c);
|
2016-12-26 08:52:15 -07:00
|
|
|
}
|
2016-12-27 11:52:25 -07:00
|
|
|
Ok(None)
|
2016-12-24 10:29:10 -07:00
|
|
|
}
|
2016-12-25 13:50:34 -07:00
|
|
|
|
2016-12-27 12:10:15 -07:00
|
|
|
/// Handle self.state == State::Number
|
|
|
|
fn state_number(&mut self, c: char) -> StateResult {
|
2016-12-26 18:23:58 -07:00
|
|
|
if c.is_digit(self.number_builder.radix_value()) {
|
|
|
|
self.number_builder.extend_value(c);
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else if c.is_dot() {
|
|
|
|
self.state = State::NumberDecimal;
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else if c.is_identifier_delimiter() {
|
|
|
|
self.retract();
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.token_result(Token::Number(self.number_builder.resolve()));
|
2016-12-26 18:23:58 -07:00
|
|
|
}
|
|
|
|
else {
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.generic_error(c);
|
2016-12-26 18:23:58 -07:00
|
|
|
}
|
2016-12-27 12:10:15 -07:00
|
|
|
Ok(None)
|
2016-12-26 18:23:58 -07:00
|
|
|
}
|
|
|
|
|
2016-12-27 12:10:15 -07:00
|
|
|
fn state_number_exactness(&mut self, c: char) -> StateResult {
|
2016-12-27 10:08:44 -07:00
|
|
|
if c.is_hash() {
|
|
|
|
self.state = State::Hash;
|
|
|
|
self.advance();
|
|
|
|
}
|
2016-12-27 10:14:03 -07:00
|
|
|
else if let Some(sign) = Sign::from_char(c) {
|
|
|
|
self.number_builder.sign(sign);
|
|
|
|
self.state = State::NumberSign;
|
|
|
|
self.advance();
|
|
|
|
}
|
2016-12-27 10:08:44 -07:00
|
|
|
else if c.is_digit(self.number_builder.radix_value()) {
|
|
|
|
self.number_builder.extend_value(c);
|
|
|
|
self.state = State::Number;
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else {
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.generic_error(c);
|
2016-12-27 10:08:44 -07:00
|
|
|
}
|
2016-12-27 12:10:15 -07:00
|
|
|
Ok(None)
|
2016-12-27 10:08:44 -07:00
|
|
|
}
|
|
|
|
|
2016-12-27 12:10:15 -07:00
|
|
|
fn state_number_decimal(&mut self, c: char) -> StateResult {
|
2016-12-27 08:18:22 -07:00
|
|
|
if c.is_digit(Radix::Dec.value()) {
|
2016-12-26 18:23:58 -07:00
|
|
|
self.number_builder.extend_decimal_value(c);
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else if c.is_identifier_delimiter() {
|
|
|
|
self.retract();
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.token_result(Token::Number(self.number_builder.resolve()));
|
2016-12-26 18:23:58 -07:00
|
|
|
}
|
|
|
|
else {
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.generic_error(c);
|
2016-12-26 18:23:58 -07:00
|
|
|
}
|
2016-12-27 12:10:15 -07:00
|
|
|
Ok(None)
|
2016-12-26 18:23:58 -07:00
|
|
|
}
|
|
|
|
|
2016-12-27 12:10:15 -07:00
|
|
|
fn state_number_radix(&mut self, c: char) -> StateResult {
|
2016-12-27 10:08:44 -07:00
|
|
|
if c.is_digit(self.number_builder.radix_value()) {
|
|
|
|
self.number_builder.extend_value(c);
|
|
|
|
self.state = State::Number;
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else if c.is_dot() {
|
|
|
|
self.state = State::NumberDecimal;
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else if c.is_hash() {
|
|
|
|
self.state = State::Hash;
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else if let Some(sign) = Sign::from_char(c) {
|
|
|
|
self.number_builder.sign(sign);
|
|
|
|
self.state = State::NumberSign;
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else {
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.generic_error(c);
|
2016-12-27 10:08:44 -07:00
|
|
|
}
|
2016-12-27 12:10:15 -07:00
|
|
|
Ok(None)
|
2016-12-27 10:08:44 -07:00
|
|
|
}
|
|
|
|
|
2016-12-27 12:10:15 -07:00
|
|
|
fn state_number_sign(&mut self, c: char) -> StateResult {
|
2016-12-27 10:08:44 -07:00
|
|
|
if c.is_digit(self.number_builder.radix_value()) {
|
|
|
|
self.number_builder.extend_value(c);
|
|
|
|
self.state = State::Number;
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else if c.is_dot() {
|
|
|
|
self.state = State::NumberDecimal;
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else {
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.generic_error(c);
|
2016-12-27 10:08:44 -07:00
|
|
|
}
|
2016-12-27 12:10:15 -07:00
|
|
|
Ok(None)
|
2016-12-27 10:08:44 -07:00
|
|
|
}
|
|
|
|
|
2016-12-27 12:10:15 -07:00
|
|
|
fn state_sign(&mut self, c: char) -> StateResult {
|
2016-12-27 10:08:44 -07:00
|
|
|
if c.is_digit(Radix::Dec.value()) {
|
|
|
|
self.number_builder.extend_value(c);
|
|
|
|
self.state = State::Number;
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else if c.is_identifier_delimiter() {
|
2016-12-27 12:10:15 -07:00
|
|
|
let value = self.value();
|
2016-12-27 10:08:44 -07:00
|
|
|
self.retract();
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.token_result(Token::Identifier(value));
|
2016-12-27 10:08:44 -07:00
|
|
|
}
|
|
|
|
else {
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.generic_error(c);
|
2016-12-27 10:08:44 -07:00
|
|
|
}
|
2016-12-27 12:10:15 -07:00
|
|
|
Ok(None)
|
2016-12-27 10:08:44 -07:00
|
|
|
}
|
|
|
|
|
2016-12-27 12:10:15 -07:00
|
|
|
fn state_string(&mut self, c: char) -> StateResult {
|
2016-12-25 15:03:18 -07:00
|
|
|
self.advance();
|
|
|
|
if c.is_string_quote() {
|
2016-12-28 18:01:36 -05:00
|
|
|
return self.token_result(Token::String(self.string_value.clone()));
|
2016-12-25 15:03:18 -07:00
|
|
|
}
|
2016-12-28 08:35:02 -07:00
|
|
|
else if c.is_string_escape_leader() {
|
|
|
|
self.state = State::StringEscape;
|
|
|
|
}
|
2016-12-28 18:01:36 -05:00
|
|
|
else {
|
|
|
|
self.string_value.push(c);
|
|
|
|
}
|
2016-12-28 08:35:02 -07:00
|
|
|
Ok(None)
|
|
|
|
}
|
|
|
|
|
|
|
|
fn state_string_escape(&mut self, c: char) -> StateResult {
|
2016-12-28 18:01:36 -05:00
|
|
|
let char_to_push = match c {
|
|
|
|
'0' => '\0',
|
|
|
|
'n' => '\n',
|
|
|
|
't' => '\t',
|
|
|
|
'"' => '"',
|
|
|
|
'\\' => '\\',
|
|
|
|
_ => return Err(self.error_string(format!("Invalid string escape character: {}", c))),
|
|
|
|
};
|
|
|
|
self.string_value.push(char_to_push);
|
|
|
|
self.state = State::String;
|
|
|
|
self.advance();
|
2016-12-27 12:10:15 -07:00
|
|
|
Ok(None)
|
2016-12-25 15:03:18 -07:00
|
|
|
}
|
|
|
|
|
2016-12-27 12:10:15 -07:00
|
|
|
fn state_comment(&mut self, c: char) -> StateResult {
|
2016-12-25 13:50:34 -07:00
|
|
|
if c.is_newline() {
|
|
|
|
self.handle_newline();
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.token_result(Token::Comment(self.value()));
|
2016-12-25 13:50:34 -07:00
|
|
|
}
|
2016-12-25 14:34:11 -07:00
|
|
|
else if c.is_eof() {
|
2016-12-27 12:10:15 -07:00
|
|
|
return self.token_result(Token::Comment(self.value()));
|
2016-12-25 14:34:11 -07:00
|
|
|
}
|
2016-12-26 09:15:43 -07:00
|
|
|
self.advance();
|
2016-12-27 12:10:15 -07:00
|
|
|
Ok(None)
|
2016-12-25 13:50:34 -07:00
|
|
|
}
|
2016-12-20 17:38:44 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
impl Iterator for Lexer {
|
2016-12-25 12:24:04 -07:00
|
|
|
type Item = Lex;
|
2016-12-20 17:38:44 -08:00
|
|
|
|
2016-12-25 12:24:04 -07:00
|
|
|
fn next(&mut self) -> Option<Lex> {
|
2016-12-22 09:25:31 -08:00
|
|
|
self.begin_lexing();
|
2016-12-23 17:46:28 -07:00
|
|
|
if self.begin == self.input.len() {
|
|
|
|
return None;
|
|
|
|
}
|
2016-12-25 14:20:16 -07:00
|
|
|
let mut token: Option<Token> = None;
|
2016-12-23 17:46:28 -07:00
|
|
|
println!("Lexing '{}'", &self.input[self.begin ..]);
|
2016-12-25 14:20:16 -07:00
|
|
|
while token.is_none() {
|
|
|
|
let c = match self.input.char_at(self.forward) {
|
|
|
|
Some(c) => c,
|
|
|
|
None => '\0',
|
|
|
|
};
|
|
|
|
println!("{:?}! c='{}'", self.state, c);
|
2016-12-26 09:15:53 -07:00
|
|
|
let previous_forward = self.forward;
|
2016-12-27 12:10:15 -07:00
|
|
|
let result = match self.state {
|
2016-12-28 12:16:28 -07:00
|
|
|
State::Character => self.state_character(c),
|
|
|
|
State::CharacterNewline(_) => self.state_character_newline(c),
|
|
|
|
State::CharacterSpace(_) => self.state_character_space(c),
|
|
|
|
State::Comment => self.state_comment(c),
|
2016-12-27 11:36:43 -07:00
|
|
|
State::Dot => self.state_dot(c),
|
2016-12-27 11:52:25 -07:00
|
|
|
State::Hash => self.state_hash(c),
|
2016-12-28 12:16:28 -07:00
|
|
|
State::Identifier => self.state_identifier(c),
|
|
|
|
State::Initial => self.state_initial(c),
|
2016-12-27 12:10:15 -07:00
|
|
|
State::Number => self.state_number(c),
|
|
|
|
State::NumberDecimal => self.state_number_decimal(c),
|
2016-12-28 12:16:28 -07:00
|
|
|
State::NumberExactness => self.state_number_exactness(c),
|
2016-12-27 12:10:15 -07:00
|
|
|
State::NumberRadix => self.state_number_radix(c),
|
|
|
|
State::NumberSign => self.state_number_sign(c),
|
|
|
|
State::Sign => self.state_sign(c),
|
|
|
|
State::String => self.state_string(c),
|
2016-12-28 08:35:02 -07:00
|
|
|
State::StringEscape => self.state_string_escape(c),
|
2016-12-27 12:10:15 -07:00
|
|
|
};
|
|
|
|
assert!(result.has_token() || self.forward != previous_forward, "No lexing progress made!");
|
|
|
|
if result.has_token() {
|
|
|
|
token = result.ok().unwrap();
|
|
|
|
break;
|
2016-12-23 17:46:28 -07:00
|
|
|
}
|
2016-12-27 12:13:21 -07:00
|
|
|
else if result.is_err() {
|
|
|
|
assert!(false, "{}", result.err().unwrap());
|
|
|
|
}
|
2016-12-22 09:25:31 -08:00
|
|
|
}
|
2016-12-23 17:46:28 -07:00
|
|
|
self.advance_begin();
|
2016-12-25 14:41:06 -07:00
|
|
|
match token {
|
2016-12-27 10:53:38 -07:00
|
|
|
Some(t) => Some(Lex::new(t, self.line, self.line_offset)),
|
2016-12-25 14:41:06 -07:00
|
|
|
None => None,
|
2016-12-25 14:20:16 -07:00
|
|
|
}
|
2016-12-20 17:38:44 -08:00
|
|
|
}
|
|
|
|
}
|
2016-12-25 14:20:25 -07:00
|
|
|
|
2016-12-27 12:10:15 -07:00
|
|
|
impl HasResult for StateResult {
    /// Returns `true` only for `Ok(Some(_))`; both `Ok(None)` and `Err(_)` carry no token.
    fn has_token(&self) -> bool {
        match *self {
            Ok(Some(_)) => true,
            _ => false,
        }
    }
}
|
|
|
|
|
2016-12-28 12:16:28 -07:00
|
|
|
impl NewlineState {
|
|
|
|
fn next(&self, c: char) -> Option<NewlineState> {
|
|
|
|
match *self {
|
|
|
|
NewlineState::N => match c {
|
|
|
|
'e' => Some(NewlineState::Ne),
|
|
|
|
_ => None,
|
|
|
|
},
|
|
|
|
NewlineState::Ne => match c {
|
|
|
|
'w' => Some(NewlineState::New),
|
|
|
|
_ => None,
|
|
|
|
},
|
|
|
|
NewlineState::New => match c {
|
|
|
|
'l' => Some(NewlineState::Newl),
|
|
|
|
_ => None,
|
|
|
|
},
|
|
|
|
NewlineState::Newl => match c {
|
|
|
|
'i' => Some(NewlineState::Newli),
|
|
|
|
_ => None,
|
|
|
|
},
|
|
|
|
NewlineState::Newli => match c {
|
|
|
|
'n' => Some(NewlineState::Newlin),
|
|
|
|
_ => None,
|
|
|
|
},
|
|
|
|
NewlineState::Newlin => match c {
|
|
|
|
'e' => Some(NewlineState::Newline),
|
|
|
|
_ => None,
|
|
|
|
},
|
|
|
|
_ => None,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl SpaceState {
|
|
|
|
fn next(&self, c: char) -> Option<SpaceState> {
|
|
|
|
match *self {
|
|
|
|
SpaceState::S => match c {
|
|
|
|
'p' => Some(SpaceState::Sp),
|
|
|
|
_ => None,
|
|
|
|
},
|
|
|
|
SpaceState::Sp => match c {
|
|
|
|
'a' => Some(SpaceState::Spa),
|
|
|
|
_ => None,
|
|
|
|
},
|
|
|
|
SpaceState::Spa => match c {
|
|
|
|
'c' => Some(SpaceState::Spac),
|
|
|
|
_ => None,
|
|
|
|
},
|
|
|
|
SpaceState::Spac => match c {
|
|
|
|
'e' => Some(SpaceState::Space),
|
|
|
|
_ => None,
|
|
|
|
},
|
|
|
|
_ => None,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2016-12-25 14:20:25 -07:00
|
|
|
//
|
2016-12-25 14:33:58 -07:00
|
|
|
// UNIT TESTING
|
2016-12-25 14:20:25 -07:00
|
|
|
//
|
|
|
|
|
2016-12-25 19:23:01 -07:00
|
|
|
#[cfg(test)]
mod tests {
    use std::iter::Iterator;
    use super::*;
    use super::number::*;
    use super::token::*;

    #[test]
    fn finds_parens() {
        check_single_token("(", Token::LeftParen(String::from("(")));
        check_single_token(")", Token::RightParen(String::from(")")));
        check_single_token("#(", Token::LeftVectorParen);
    }

    #[test]
    fn finds_characters() {
        check_single_token("#\\a", Token::Character('a'));
        check_single_token("#\\n", Token::Character('n'));
        check_single_token("#\\s", Token::Character('s'));
    }

    #[test]
    fn finds_characters_newline() {
        check_single_token("#\\newline", Token::Character('\n'));
    }

    #[test]
    fn finds_characters_space() {
        check_single_token("#\\space", Token::Character(' '));
    }

    #[test]
    fn finds_dots() {
        check_single_token(".", Token::Dot);

        let mut lexer = Lexer::new("abc . abc");
        assert_next_token(&mut lexer, &Token::Identifier(String::from("abc")));
        assert_next_token(&mut lexer, &Token::Dot);
        assert_next_token(&mut lexer, &Token::Identifier(String::from("abc")));
    }

    #[test]
    fn finds_identifiers() {
        let tok = |s: &str| { check_single_token(s, Token::Identifier(String::from(s))); };
        tok("abc");
        tok("number?");
        tok("+");
        tok("-");
    }

    #[test]
    fn finds_booleans() {
        check_single_token("#t", Token::Boolean(true));
        check_single_token("#f", Token::Boolean(false));
    }

    #[test]
    fn finds_comments() {
        let s = "; a comment";
        check_single_token(s, Token::Comment(String::from(s)));
    }

    #[test]
    fn finds_escaped_characters_in_strings() {
        check_single_token("\"\\\\\"", Token::String(String::from("\\")));
        check_single_token("\"\\\"\"", Token::String(String::from("\"")));
        check_single_token("\"\\n\"", Token::String(String::from("\n")));
    }

    #[test]
    fn finds_numbers() {
        check_single_token(".34", Token::Number(Number::new(0.34)));
        check_single_token("0.34", Token::Number(Number::new(0.34)));
    }

    #[test]
    fn finds_negative_numbers() {
        check_single_token("-3", Token::Number(Number::from_int(-3)));
        check_single_token("-0", Token::Number(Number::from_int(-0)));
        check_single_token("-0.56", Token::Number(Number::new(-0.56)));
        check_single_token("-3.14159", Token::Number(Number::new(-3.14159)));
    }

    #[test]
    fn finds_bin_numbers() {
        check_single_token("#b0", Token::Number(Number::from_int(0b0)));
        check_single_token("#b01011", Token::Number(Number::from_int(0b01011)));
    }

    #[test]
    fn finds_dec_numbers() {
        check_single_token("34", Token::Number(Number::new(34.0)));
        check_single_token("#d89", Token::Number(Number::from_int(89)));
    }

    #[test]
    fn finds_oct_numbers() {
        check_single_token("#o45", Token::Number(Number::from_int(0o45)));
    }

    #[test]
    fn finds_exact_numbers() {
        check_single_token("#e45", Token::Number(Number::from_int(45)));
        check_single_token("#e-45", Token::Number(Number::from_int(-45)));
    }

    #[test]
    fn finds_hex_numbers() {
        check_single_token("#h4A65", Token::Number(Number::from_int(0x4A65)));
    }

    #[test]
    fn finds_quote() {
        check_single_token("'", Token::Quote);
    }

    #[test]
    fn finds_strings() {
        check_single_token("\"\"", Token::String(String::from("")));
        check_single_token("\"abc\"", Token::String(String::from("abc")));
    }

    #[test]
    fn lexes_simple_expression() {
        check_tokens("(+ 3.4 6.8)", vec![
            Token::LeftParen(String::from("(")),
            Token::Identifier(String::from("+")),
            Token::Number(Number::new(3.4)),
            Token::Number(Number::new(6.8)),
            Token::RightParen(String::from(")"))]);
    }

    #[test]
    fn lexes_quoted_identifier() {
        check_tokens("'abc", vec![Token::Quote, Token::Identifier(String::from("abc"))]);
    }

    /// Asserts that the first token lexed from `input` equals `expected`. Any tokens after the
    /// first are not examined.
    fn check_single_token(input: &str, expected: Token) {
        let mut lexer = Lexer::new(input);
        assert_next_token(&mut lexer, &expected);
    }

    /// Asserts that lexing `input` yields exactly the tokens in `expected`, in order: no
    /// unexpected extra tokens and no expected tokens left over.
    fn check_tokens(input: &str, expected: Vec<Token>) {
        let lexer = Lexer::new(input);
        let mut expected_iter = expected.iter();
        for lex in lexer {
            match expected_iter.next() {
                Some(expected_token) => assert_eq!(lex.token, *expected_token),
                None => panic!("Found a token we didn't expect: {:?}", lex.token),
            }
        }
        // The lexer is exhausted; anything still expected was never produced.
        let missing: Vec<&Token> = expected_iter.collect();
        assert!(missing.is_empty(), "Expected tokens were never produced: {:?}", missing);
    }

    /// Pulls the next token from `lexer` and asserts that it equals `expected`. Panics if the
    /// lexer has no more tokens.
    fn assert_next_token(lexer: &mut Lexer, expected: &Token) {
        let lex = lexer.next().unwrap();
        assert_eq!(lex.token, *expected);
    }
}
|