2.0.8 -- different generation algorithm

2013-11-14 07:44:05 -08:00 · 2013-11-14 07:44:05 -08:00 · 00f0228dd4
commit 00f0228dd4
parent e87dc5862b
24 changed files with 127 additions and 27 deletions
--- a/lib/twitter_ebooks/nlp.rb
+++ b/lib/twitter_ebooks/nlp.rb
@@ -61,7 +61,7 @@ module Ebooks
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    def self.tokenize(sentence)
-      regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
+      regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
      sentence.split(regex)
    end

@@ -150,5 +150,12 @@ module Ebooks

      false
    end
+
+    # Determine if a2 is a contiguous subsequence of a1 (returns the first matching start index in a1, or nil if none)
+    def self.subseq?(a1, a2)
+      a1.each_index.find do |i|
+        a1[i...i+a2.length] == a2
+      end
+    end
  end
 end