2016-12-24 09:05:10 -07:00
|
|
|
/* lexer.rs
|
|
|
|
* Eryn Wells <eryn@erynwells.me>
|
|
|
|
*/
|
2016-12-19 22:23:27 -08:00
|
|
|
|
2016-12-24 08:47:02 -07:00
|
|
|
pub mod token;
|
2016-12-24 08:57:37 -07:00
|
|
|
mod char;
|
|
|
|
mod charset;
|
|
|
|
mod str;
|
2016-12-23 17:45:37 -07:00
|
|
|
|
2016-12-24 08:57:37 -07:00
|
|
|
use self::char::Lexable;
|
|
|
|
use self::str::CharAt;
|
|
|
|
use self::str::RelativeIndexable;
|
2016-12-25 12:24:04 -07:00
|
|
|
use self::token::Lex;
|
2016-12-24 08:57:37 -07:00
|
|
|
use self::token::Token;
|
2016-12-19 22:23:27 -08:00
|
|
|
|
2016-12-24 14:03:37 -07:00
|
|
|
/// The states of the lexer's state machine.
///
/// Every token starts in `Initial`; the first character of the token decides
/// whether the lexer can emit a token right away or has to move to one of the
/// other states to read more characters.
///
/// The enum carries no data, so it is cheap to copy and compare.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum State {
    /// No token is in progress; the next character selects what to lex.
    Initial,
    /// Reading the remaining characters of a multi-character identifier.
    Identifier,
    /// A hash character has been read; the next character selects the token.
    Hash,
    /// Reading a comment, which runs until a newline or the end of input.
    Comment,
    /// Reading a string literal, which runs until the closing string quote.
    String,
}
|
|
|
|
|
2016-12-20 17:52:29 -08:00
|
|
|
pub struct Lexer {
|
|
|
|
input: String,
|
2016-12-22 09:25:31 -08:00
|
|
|
begin: usize,
|
|
|
|
forward: usize,
|
2016-12-24 09:17:08 -07:00
|
|
|
line: u32,
|
2016-12-22 09:25:31 -08:00
|
|
|
state: State,
|
2016-12-20 17:38:44 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
impl Lexer {
|
2016-12-25 14:33:58 -07:00
|
|
|
pub fn new(input: &str) -> Lexer {
|
2016-12-22 09:25:31 -08:00
|
|
|
Lexer {
|
2016-12-25 14:33:58 -07:00
|
|
|
input: String::from(input),
|
2016-12-22 09:25:31 -08:00
|
|
|
begin: 0,
|
|
|
|
forward: 0,
|
2016-12-24 09:17:08 -07:00
|
|
|
line: 1,
|
2016-12-22 09:25:31 -08:00
|
|
|
state: State::Initial,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Lexer {
|
|
|
|
fn begin_lexing(&mut self) {
|
|
|
|
self.forward = self.begin;
|
|
|
|
self.state = State::Initial;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Advance the forward pointer to the next character.
|
|
|
|
fn advance(&mut self) {
|
2016-12-23 17:46:28 -07:00
|
|
|
self.forward = self.input.index_after(self.forward);
|
|
|
|
println!("> forward={}", self.forward);
|
2016-12-22 09:25:31 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Retract the forward pointer to the previous character.
|
|
|
|
fn retract(&mut self) {
|
2016-12-23 17:46:28 -07:00
|
|
|
self.forward = self.input.index_before(self.forward);
|
|
|
|
println!("< forward={}", self.forward);
|
|
|
|
}
|
|
|
|
|
2016-12-23 17:53:28 -07:00
|
|
|
/// Advance the begin pointer to prepare for the next iteration.
|
2016-12-23 17:46:28 -07:00
|
|
|
fn advance_begin(&mut self) {
|
|
|
|
self.begin = self.input.index_after(self.forward);
|
2016-12-24 09:59:35 -07:00
|
|
|
self.forward = self.begin;
|
|
|
|
println!("> begin={}, forward={}", self.begin, self.forward);
|
2016-12-23 17:46:28 -07:00
|
|
|
}
|
|
|
|
|
2016-12-25 13:50:34 -07:00
|
|
|
fn handle_newline(&mut self) {
|
|
|
|
self.line += 1;
|
|
|
|
}
|
|
|
|
|
2016-12-24 09:05:10 -07:00
|
|
|
/// Get the substring between the two input indexes. This is the value to give to a new Token instance.
|
2016-12-23 17:46:28 -07:00
|
|
|
fn value(&self) -> String {
|
|
|
|
self.input[self.begin .. self.forward].to_string()
|
2016-12-22 09:25:31 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Lexer {
|
2016-12-23 17:53:28 -07:00
|
|
|
/// Handle self.state == State::Initial
|
2016-12-25 14:20:16 -07:00
|
|
|
fn state_initial(&mut self, c: char, token: &mut Option<Token>) {
|
2016-12-23 17:46:28 -07:00
|
|
|
if c.is_left_paren() {
|
2016-12-25 14:20:16 -07:00
|
|
|
*token = Some(Token::LeftParen(c.to_string()));
|
2016-12-23 17:46:28 -07:00
|
|
|
}
|
|
|
|
else if c.is_right_paren() {
|
2016-12-25 14:20:16 -07:00
|
|
|
*token = Some(Token::RightParen(c.to_string()));
|
2016-12-23 17:46:28 -07:00
|
|
|
}
|
2016-12-25 15:03:18 -07:00
|
|
|
else if c.is_hash() {
|
|
|
|
self.state = State::Hash;
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else if c.is_string_quote() {
|
|
|
|
self.state = State::String;
|
|
|
|
self.advance();
|
|
|
|
}
|
2016-12-24 10:29:10 -07:00
|
|
|
|
2016-12-24 09:07:38 -07:00
|
|
|
else if c.is_identifier_single() {
|
2016-12-25 14:20:16 -07:00
|
|
|
*token = Some(Token::Identifier(c.to_string()));
|
2016-12-24 09:07:38 -07:00
|
|
|
}
|
2016-12-23 17:46:28 -07:00
|
|
|
else if c.is_identifier_initial() {
|
|
|
|
self.state = State::Identifier;
|
|
|
|
self.advance();
|
|
|
|
}
|
2016-12-24 10:29:10 -07:00
|
|
|
|
2016-12-24 09:17:08 -07:00
|
|
|
else if c.is_whitespace() {
|
|
|
|
if c.is_newline() {
|
2016-12-25 13:50:34 -07:00
|
|
|
self.handle_newline();
|
2016-12-24 09:17:08 -07:00
|
|
|
}
|
2016-12-24 09:59:35 -07:00
|
|
|
self.advance_begin();
|
2016-12-24 09:17:08 -07:00
|
|
|
}
|
2016-12-25 14:20:16 -07:00
|
|
|
|
|
|
|
else if c.is_comment_initial() {
|
|
|
|
self.state = State::Comment;
|
|
|
|
self.advance();
|
|
|
|
}
|
2016-12-22 09:25:31 -08:00
|
|
|
}
|
|
|
|
|
2016-12-23 17:53:28 -07:00
|
|
|
/// Handle self.state == State::Identifier
|
2016-12-25 14:20:16 -07:00
|
|
|
fn state_identifier(&mut self, c: char, token: &mut Option<Token>) {
|
2016-12-23 17:46:28 -07:00
|
|
|
if c.is_identifier_subsequent() {
|
|
|
|
// State in Identifier state.
|
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
else {
|
2016-12-25 14:20:16 -07:00
|
|
|
*token = Some(Token::Identifier(self.value()));
|
2016-12-23 17:46:28 -07:00
|
|
|
self.retract();
|
|
|
|
}
|
2016-12-20 17:52:29 -08:00
|
|
|
}
|
2016-12-24 10:29:10 -07:00
|
|
|
|
2016-12-25 14:20:16 -07:00
|
|
|
fn state_hash(&mut self, c: char, token: &mut Option<Token>) {
|
2016-12-25 12:24:04 -07:00
|
|
|
if c.is_boolean_true() || c.is_boolean_false() {
|
2016-12-24 10:29:10 -07:00
|
|
|
self.advance();
|
2016-12-25 14:20:16 -07:00
|
|
|
*token = Some(Token::Boolean(c.is_boolean_true()));
|
2016-12-24 10:29:10 -07:00
|
|
|
}
|
|
|
|
}
|
2016-12-25 13:50:34 -07:00
|
|
|
|
2016-12-25 15:03:18 -07:00
|
|
|
fn state_string(&mut self, c: char, token: &mut Option<Token>) {
|
|
|
|
self.advance();
|
|
|
|
if c.is_string_quote() {
|
|
|
|
*token = Some(Token::String(self.value()));
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-12-25 14:20:16 -07:00
|
|
|
fn state_comment(&mut self, c: char, token: &mut Option<Token>) {
|
2016-12-25 13:50:34 -07:00
|
|
|
if c.is_newline() {
|
|
|
|
self.handle_newline();
|
2016-12-25 14:20:16 -07:00
|
|
|
*token = Some(Token::Comment(self.value()));
|
2016-12-25 13:50:34 -07:00
|
|
|
}
|
2016-12-25 14:34:11 -07:00
|
|
|
else if c.is_eof() {
|
|
|
|
*token = Some(Token::Comment(self.value()));
|
|
|
|
}
|
|
|
|
self.advance();
|
2016-12-25 13:50:34 -07:00
|
|
|
}
|
2016-12-20 17:38:44 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
impl Iterator for Lexer {
|
2016-12-25 12:24:04 -07:00
|
|
|
type Item = Lex;
|
2016-12-20 17:38:44 -08:00
|
|
|
|
2016-12-25 12:24:04 -07:00
|
|
|
fn next(&mut self) -> Option<Lex> {
|
2016-12-22 09:25:31 -08:00
|
|
|
self.begin_lexing();
|
2016-12-23 17:46:28 -07:00
|
|
|
if self.begin == self.input.len() {
|
|
|
|
return None;
|
|
|
|
}
|
2016-12-25 14:20:16 -07:00
|
|
|
let mut token: Option<Token> = None;
|
2016-12-23 17:46:28 -07:00
|
|
|
println!("Lexing '{}'", &self.input[self.begin ..]);
|
2016-12-25 14:20:16 -07:00
|
|
|
while token.is_none() {
|
|
|
|
let c = match self.input.char_at(self.forward) {
|
|
|
|
Some(c) => c,
|
|
|
|
None => '\0',
|
|
|
|
};
|
|
|
|
println!("{:?}! c='{}'", self.state, c);
|
|
|
|
match self.state {
|
|
|
|
State::Initial => self.state_initial(c, &mut token),
|
|
|
|
State::Identifier => self.state_identifier(c, &mut token),
|
|
|
|
State::Hash => self.state_hash(c, &mut token),
|
2016-12-25 15:03:18 -07:00
|
|
|
State::String => self.state_string(c, &mut token),
|
2016-12-25 14:20:16 -07:00
|
|
|
State::Comment => self.state_comment(c, &mut token),
|
2016-12-23 17:46:28 -07:00
|
|
|
}
|
2016-12-22 09:25:31 -08:00
|
|
|
}
|
2016-12-23 17:46:28 -07:00
|
|
|
self.advance_begin();
|
2016-12-25 14:41:06 -07:00
|
|
|
match token {
|
|
|
|
Some(t) => Some(Lex::new(t)),
|
|
|
|
None => None,
|
2016-12-25 14:20:16 -07:00
|
|
|
}
|
2016-12-20 17:38:44 -08:00
|
|
|
}
|
|
|
|
}
|
2016-12-25 14:20:25 -07:00
|
|
|
|
|
|
|
//
|
2016-12-25 14:33:58 -07:00
|
|
|
// UNIT TESTING
|
2016-12-25 14:20:25 -07:00
|
|
|
//
|
|
|
|
|
|
|
|
/// "()" lexes as a left-paren token followed by a right-paren token.
#[test]
fn lexer_finds_parens() {
    let open = Token::LeftParen(String::from("("));
    let close = Token::RightParen(String::from(")"));

    let mut lexer = Lexer::new("()");
    assert_next_token(&mut lexer, &open);
    assert_next_token(&mut lexer, &close);
}
|
|
|
|
|
|
|
|
/// A run of identifier characters lexes as a single identifier token whose
/// value is the whole run.
#[test]
fn lexer_finds_identifiers() {
    let source = "abc";
    let expected = Token::Identifier(String::from(source));

    let mut lexer = Lexer::new(source);
    assert_next_token(&mut lexer, &expected);
}
|
|
|
|
|
|
|
|
/// "#t" and "#f" lex as the boolean tokens true and false, in input order.
#[test]
fn lexer_finds_booleans() {
    let mut lexer = Lexer::new("#t #f");
    for expected in &[true, false] {
        assert_next_token(&mut lexer, &Token::Boolean(*expected));
    }
}
|
|
|
|
|
|
|
|
/// A comment that runs to the end of the input lexes as one comment token
/// holding the full comment text.
#[test]
fn lexer_finds_comments() {
    let source = "; a comment";
    let expected = Token::Comment(String::from(source));

    let mut lexer = Lexer::new(source);
    assert_next_token(&mut lexer, &expected);
}
|
|
|
|
|
2016-12-25 15:03:18 -07:00
|
|
|
/// String literals, empty or not, lex as one string token whose value
/// includes the surrounding quotes.
#[test]
fn lexer_finds_strings() {
    for source in &["\"\"", "\"abc\""] {
        let mut lexer = Lexer::new(source);
        assert_next_token(&mut lexer, &Token::String(source.to_string()));
    }
}
|
|
|
|
|
2016-12-25 14:20:25 -07:00
|
|
|
/// Test helper: pull the next item from `lexer` and check that it carries the
/// `expected` token.
///
/// Panics (failing the calling test) when the lexer has nothing left to yield
/// or when the token differs from `expected`.
///
/// Only compiled for test builds: the helper is used by the unit tests alone
/// and would otherwise be dead code in a normal build.
#[cfg(test)]
fn assert_next_token(lexer: &mut Lexer, expected: &Token) {
    let lex = lexer
        .next()
        .expect("lexer ended before producing the expected token");
    assert_eq!(lex.token, *expected);
}
|