From d4dee92904c753120b4ca8f34a1710f5ca1d2c0b Mon Sep 17 00:00:00 2001 From: Eryn Wells Date: Sat, 24 Dec 2016 08:57:37 -0700 Subject: [PATCH] Move all the lexer stuff to a module directory --- src/lexer/char.rs | 31 +++++++++ src/lexer/charset.rs | 43 ++++++++++++ src/lexer/mod.rs | 12 ++-- src/{characters.rs => lexer/str.rs} | 101 ++++------------------------ src/lexer/token.rs | 2 +- 5 files changed, 97 insertions(+), 92 deletions(-) create mode 100644 src/lexer/char.rs create mode 100644 src/lexer/charset.rs rename src/{characters.rs => lexer/str.rs} (56%) diff --git a/src/lexer/char.rs b/src/lexer/char.rs new file mode 100644 index 0000000..67534f3 --- /dev/null +++ b/src/lexer/char.rs @@ -0,0 +1,31 @@ +/* char.rs + * Eryn Wells + */ + +use lexer::charset; + +pub trait Lexable { + fn is_left_paren(&self) -> bool; + fn is_right_paren(&self) -> bool; + fn is_identifier_initial(&self) -> bool; + fn is_identifier_subsequent(&self) -> bool; +} + +impl Lexable for char { + fn is_left_paren(&self) -> bool { + self == &'(' + } + + fn is_right_paren(&self) -> bool { + self == &')' + } + + fn is_identifier_initial(&self) -> bool { + charset::identifier_initials().contains(&self) + } + + fn is_identifier_subsequent(&self) -> bool { + charset::identifier_subsequents().contains(&self) + } +} + diff --git a/src/lexer/charset.rs b/src/lexer/charset.rs new file mode 100644 index 0000000..0e8b1c1 --- /dev/null +++ b/src/lexer/charset.rs @@ -0,0 +1,43 @@ +/// Character Sets +/// +/// Sets of characters valid for making up tokens. + +use std::collections::HashSet; +use std::iter::FromIterator; + +pub type CharSet = HashSet<char>; + +// TODO: Use std::sync::Once for these sets? 
+// https://doc.rust-lang.org/beta/std/sync/struct.Once.html + +fn ascii_letters() -> CharSet { + let letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".chars(); + CharSet::from_iter(letters) +} + +fn ascii_digits() -> CharSet { + let digits = "1234567890".chars(); + CharSet::from_iter(digits) +} + +/// A set of all characters allowed to start Scheme identifiers. +pub fn identifier_initials() -> CharSet { + let letters = ascii_letters(); + let extras = CharSet::from_iter("!$%&*/:<=>?~_^".chars()); + let mut initials = CharSet::new(); + initials.extend(letters.iter()); + initials.extend(extras.iter()); + initials +} + +/// A set of all characters allowed to follow an identifier initial. +pub fn identifier_subsequents() -> CharSet { + let initials = identifier_initials(); + let digits = ascii_digits(); + let extras = CharSet::from_iter(".+-".chars()); + let mut subsequents = CharSet::new(); + subsequents.extend(initials.iter()); + subsequents.extend(digits.iter()); + subsequents.extend(extras.iter()); + subsequents +} diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index d447bb3..a440dce 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -1,11 +1,15 @@ //! # Lexer pub mod token; +mod char; +mod charset; +mod str; -use characters; -use characters::CharAt; -use characters::Lexable; -use characters::RelativeIndexable; +use self::char::Lexable; +use self::str::CharAt; +use self::str::RelativeIndexable; +use self::token::Token; +use self::token::Kind; enum State { Initial, diff --git a/src/characters.rs b/src/lexer/str.rs similarity index 56% rename from src/characters.rs rename to src/lexer/str.rs index 3ec2f9d..7a7a239 100644 --- a/src/characters.rs +++ b/src/lexer/str.rs @@ -1,79 +1,6 @@ -//! Characters -//! -//! Utilities for dealing with chars of various sorts. - -use std::collections::HashSet; -use std::iter::FromIterator; - -pub type CharSet = HashSet<char>; - -// TODO: Use std::sync::Once for these sets? 
-// https://doc.rust-lang.org/beta/std/sync/struct.Once.html - -fn ascii_letters() -> CharSet { - let letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".chars(); - CharSet::from_iter(letters) -} - -fn ascii_digits() -> CharSet { - let digits = "1234567890".chars(); - CharSet::from_iter(digits) -} - -/// A set of all characters allowed to start Scheme identifiers. -pub fn identifier_initials() -> CharSet { - let letters = ascii_letters(); - let extras = CharSet::from_iter("!$%&*/:<=>?~_^".chars()); - let mut initials = CharSet::new(); - initials.extend(letters.iter()); - initials.extend(extras.iter()); - initials -} - -/// A set of all characters allowed to follow an identifier initial. -pub fn identifier_subsequents() -> CharSet { - let initials = identifier_initials(); - let digits = ascii_digits(); - let extras = CharSet::from_iter(".+-".chars()); - let mut subsequents = CharSet::new(); - subsequents.extend(initials.iter()); - subsequents.extend(digits.iter()); - subsequents.extend(extras.iter()); - subsequents -} - -// -// char -// - -pub trait Lexable { - fn is_left_paren(&self) -> bool; - fn is_right_paren(&self) -> bool; - fn is_identifier_initial(&self) -> bool; - fn is_identifier_subsequent(&self) -> bool; -} - -impl Lexable for char { - fn is_left_paren(&self) -> bool { - self == &'(' - } - - fn is_right_paren(&self) -> bool { - self == &')' - } - - fn is_identifier_initial(&self) -> bool { - identifier_initials().contains(&self) - } - - fn is_identifier_subsequent(&self) -> bool { - identifier_subsequents().contains(&self) - } -} - -// -// str and String -// +/* str.rs + * Eryn Wells + */ pub trait RelativeIndexable { /// Get the index of the character boundary preceding the given index. 
The index does not need to be on a character @@ -124,6 +51,17 @@ impl RelativeIndexable for str { } } +impl CharAt for str { + fn char_at(&self, index: usize) -> Option<char> { + if !self.is_char_boundary(index) { + return None; + } + let end = self.index_after(index); + let char_str = &self[index .. end]; + char_str.chars().nth(0) + } +} + #[test] fn index_before_is_well_behaved_for_ascii() { let s = "abc"; @@ -158,14 +96,3 @@ fn index_after_is_well_behaved_for_ascii() { assert!(s.is_char_boundary(idx)); } } - -impl CharAt for str { - fn char_at(&self, index: usize) -> Option<char> { - if !self.is_char_boundary(index) { - return None; - } - let end = self.index_after(index); - let char_str = &self[index .. end]; - char_str.chars().nth(0) - } -} diff --git a/src/lexer/token.rs b/src/lexer/token.rs index 5e88b9b..240a02e 100644 --- a/src/lexer/token.rs +++ b/src/lexer/token.rs @@ -14,7 +14,7 @@ pub struct Token { } impl Token { - fn new(kind: Kind, value: String) -> Token { + pub fn new(kind: Kind, value: String) -> Token { Token { kind: kind, value: value, } } }