2.0.8 -- different generation algorithm

This commit is contained in:
Mispy 2013-11-14 07:44:05 -08:00
parent e87dc5862b
commit 00f0228dd4
24 changed files with 127 additions and 27 deletions

9
lib/twitter_ebooks/nlp.rb Normal file → Executable file
View file

@ -61,7 +61,7 @@ module Ebooks
# As above, this is ad hoc because tokenization libraries
# do not behave well wrt. things like emoticons and timestamps
#
# Splits a sentence into tokens by calling String#split with a regex.
# PUNCTUATION is interpolated into the character classes; it is defined
# outside this view (presumably a string of punctuation characters — confirm).
#
# @param sentence [String] text to split (must respond to #split)
# @return [Array<String>] the pieces produced by the split
def self.tokenize(sentence)
# NOTE(review): this assignment is overwritten by the next line before it is
# ever used, so it has no effect on the result. It looks like the pre-change
# side of the diff hunk shown next to its replacement — confirm.
regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
# Effective pattern. It splits on any of:
#   1. a run of whitespace;
#   2. the zero-width position that follows "punctuation char + whitespace char"
#      and precedes an ASCII letter;
#   3. the zero-width position that follows an ASCII letter and precedes
#      "one or more punctuation chars + whitespace char".
# Compared with the line above, the added \s means punctuation that is not
# adjacent to whitespace no longer causes a split.
regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
sentence.split(regex)
end
@ -150,5 +150,12 @@ module Ebooks
false
end
# Determine if a2 occurs in a1 as a contiguous run of elements
# (e.g. [2, 3] is found in [1, 2, 3, 4], but [1, 3] is not).
#
# An empty a2 is considered to be contained in every array, including
# an empty one.
#
# @param a1 [Array] the array to search in
# @param a2 [Array] the run of elements to look for
# @return [Boolean] true if a2 appears contiguously inside a1
def self.subseq?(a1, a2)
  # each_cons rejects a window size of 0, and the empty run trivially matches.
  return true if a2.empty?

  # Slide a window of a2's length across a1; no windows are produced when
  # a2 is longer than a1, which yields false.
  a1.each_cons(a2.length).any? { |window| window == a2 }
end
end
end