Move all the lexer stuff to a module directory

2016-12-24 08:57:37 -07:00 · 2016-12-24 08:57:37 -07:00 · d4dee92904
commit d4dee92904
parent d333819dee
5 changed files with 97 additions and 92 deletions
--- a/src/lexer/char.rs
+++ b/src/lexer/char.rs
@ -0,0 +1,31 @@
+/* char.rs
+ * Eryn Wells <eryn@erynwells.me>
+ */
+
+use lexer::charset;
+
+/// Classification predicates the lexer asks of a candidate character.
+pub trait Lexable {
+    /// Whether this value is an opening parenthesis (the `char` implementation compares against `'('`).
+    fn is_left_paren(&self) -> bool;
+    /// Whether this value is a closing parenthesis (the `char` implementation compares against `')'`).
+    fn is_right_paren(&self) -> bool;
+    /// Whether this value may begin an identifier.
+    fn is_identifier_initial(&self) -> bool;
+    /// Whether this value may appear in an identifier after its first character.
+    fn is_identifier_subsequent(&self) -> bool;
+}
+
+impl Lexable for char {
+    /// True only for the opening parenthesis `(`.
+    fn is_left_paren(&self) -> bool {
+        *self == '('
+    }
+
+    /// True only for the closing parenthesis `)`.
+    fn is_right_paren(&self) -> bool {
+        *self == ')'
+    }
+
+    /// True when the character is a member of `charset::identifier_initials()`.
+    fn is_identifier_initial(&self) -> bool {
+        // `self` is already a `&char`, which is exactly what `contains` takes; no extra borrow is needed.
+        charset::identifier_initials().contains(self)
+    }
+
+    /// True when the character is a member of `charset::identifier_subsequents()`.
+    fn is_identifier_subsequent(&self) -> bool {
+        charset::identifier_subsequents().contains(self)
+    }
+}
+
--- a/src/lexer/charset.rs
+++ b/src/lexer/charset.rs
@ -0,0 +1,43 @@
+//! Character Sets
+//!
+//! Sets of characters valid for making up tokens.
+
+use std::collections::HashSet;
+use std::iter::FromIterator;
+
+/// A hash set of `char`s; the type returned by every character-class function in this file.
+pub type CharSet = HashSet<char>;
+
+// TODO: Use std::sync::Once for these sets?
+// https://doc.rust-lang.org/beta/std/sync/struct.Once.html
+
+/// Builds the set of the 52 ASCII letters, lower case and upper case.
+fn ascii_letters() -> CharSet {
+    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+        .chars()
+        .collect()
+}
+
+/// Builds the set of the ten ASCII decimal digits.
+fn ascii_digits() -> CharSet {
+    "1234567890".chars().collect()
+}
+
+/// A set of all characters allowed to start Scheme identifiers.
+///
+/// The result holds the ASCII letters plus the punctuation characters `!$%&*/:<=>?~_^`. A fresh set is built on
+/// every call.
+pub fn identifier_initials() -> CharSet {
+    // Feed both sources through one iterator rather than materializing temporary sets only to copy them.
+    CharSet::from_iter(ascii_letters().into_iter().chain("!$%&*/:<=>?~_^".chars()))
+}
+
+/// A set of all characters allowed to follow an identifier initial.
+///
+/// The result holds everything in `identifier_initials()`, the ASCII digits, and the characters `.`, `+` and `-`.
+/// A fresh set is built on every call.
+pub fn identifier_subsequents() -> CharSet {
+    // Start from the initials set and grow it in place instead of copying three sets into a fourth.
+    let mut subsequents = identifier_initials();
+    subsequents.extend(ascii_digits());
+    subsequents.extend(".+-".chars());
+    subsequents
+}
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@ -1,11 +1,15 @@
 //! # Lexer

 pub mod token;
+mod char;
+mod charset;
+mod str;

-use characters;
-use characters::CharAt;
-use characters::Lexable;
-use characters::RelativeIndexable;
+use self::char::Lexable;
+use self::str::CharAt;
+use self::str::RelativeIndexable;
+use self::token::Token;
+use self::token::Kind;

 enum State {
    Initial,
--- a/src/lexer/str.rs
+++ b/src/lexer/str.rs
@ -0,0 +1,98 @@
+/* str.rs
+ * Eryn Wells <eryn@erynwells.me>
+ */
+
+/// Stepping between character boundaries relative to an arbitrary byte index.
+pub trait RelativeIndexable {
+    /// Get the index of the character boundary preceding the given index. The index does not need to be on a
+    /// character boundary.
+    // The parameter is named because anonymous trait-method parameters are rejected from the 2018 edition on.
+    fn index_before(&self, index: usize) -> usize;
+
+    /// Get the index of the character boundary following the given index. The index does not need to be on a
+    /// character boundary.
+    fn index_after(&self, index: usize) -> usize;
+}
+
+/// Lookup of a single character by byte index.
+pub trait CharAt {
+    /// Get the character at the given byte index. This index must be at a character boundary as defined by
+    /// `is_char_boundary()`.
+    // The parameter is named because anonymous trait-method parameters are rejected from the 2018 edition on.
+    fn char_at(&self, index: usize) -> Option<char>;
+}
+
+impl RelativeIndexable for str {
+    /// Returns the closest character boundary strictly before `index`.
+    ///
+    /// An `index` past the end of the string is treated as the string's length. Returns 0 when there is no
+    /// earlier boundary, which covers `index == 0` and the empty string.
+    fn index_before(&self, index: usize) -> usize {
+        // Clamp before searching: an out-of-range index on an empty string would otherwise be clamped to 0 and
+        // then decremented, underflowing the `usize`.
+        let start = if index > self.len() { self.len() } else { index };
+        (0..start)
+            .rev()
+            .find(|&candidate| self.is_char_boundary(candidate))
+            .unwrap_or(0)
+    }
+
+    /// Returns the closest character boundary strictly after `index`.
+    ///
+    /// An `index` at or past the end of the string yields the string's length.
+    fn index_after(&self, index: usize) -> usize {
+        let len = self.len();
+        if index >= len {
+            return len;
+        }
+        // The string's length is always a boundary, so it is the answer when no interior boundary follows.
+        (index + 1..len)
+            .find(|&candidate| self.is_char_boundary(candidate))
+            .unwrap_or(len)
+    }
+}
+
+impl CharAt for str {
+    /// Returns the character that starts at byte `index`, or `None` when `index` is not a character boundary or
+    /// no character starts there (for example at the end of the string).
+    fn char_at(&self, index: usize) -> Option<char> {
+        if self.is_char_boundary(index) {
+            // The first character of the remainder is the character that starts at `index`.
+            self[index ..].chars().next()
+        } else {
+            None
+        }
+    }
+}
+
+#[test]
+fn index_before_is_well_behaved_for_ascii() {
+    let text = "abc";
+
+    // Basic cases: the start of the string maps to itself, and an interior index steps back one byte.
+    assert_eq!(text.index_before(0), 0);
+    assert_eq!(text.index_before(2), 1);
+
+    // Asking from past the end yields the boundary at which the final character starts.
+    let before_end = text.index_before(4);
+    assert_eq!(before_end, 2);
+    assert!(text.is_char_boundary(before_end));
+
+    let tail = &text[before_end ..];
+    assert_eq!(tail.len(), 1);
+    assert_eq!(tail.chars().nth(0), Some('c'));
+}
+
+#[test]
+fn index_after_is_well_behaved_for_ascii() {
+    let text = "abc";
+
+    // Basic cases: each ASCII character is one byte wide, so the next boundary is one byte on.
+    assert_eq!(text.index_after(0), 1);
+    assert_eq!(text.index_after(2), 3);
+
+    // Asking from past the end yields the length of the string, which is itself a boundary.
+    let past_end = text.index_after(4);
+    assert_eq!(past_end, text.len());
+    assert!(text.is_char_boundary(past_end));
+}
--- a/src/lexer/token.rs
+++ b/src/lexer/token.rs
@ -14,7 +14,7 @@ pub struct Token {
 }

 impl Token {
-    fn new(kind: Kind, value: String) -> Token {
+    /// Builds a `Token` whose `kind` and `value` fields are the given arguments.
+    pub fn new(kind: Kind, value: String) -> Token {
         Token { kind: kind, value: value, }
     }
 }