2.0.8 -- different generation algorithm
This commit is contained in:
parent
e87dc5862b
commit
00f0228dd4
24 changed files with 127 additions and 27 deletions
9
lib/twitter_ebooks/nlp.rb
Normal file → Executable file
9
lib/twitter_ebooks/nlp.rb
Normal file → Executable file
|
@ -61,7 +61,7 @@ module Ebooks
|
|||
# As above, this is ad hoc because tokenization libraries
|
||||
# do not behave well wrt. things like emoticons and timestamps
|
||||
# Splits +sentence+ into an array of tokens by calling +split+ with a
# hand-written pattern.
#
# NOTE(review): two assignments to +regex+ appear below because this span is
# rendered from a diff view; the first looks like the pre-commit line and the
# second its replacement -- confirm against the real file. Read as plain Ruby,
# the second assignment overwrites the first, so only the second pattern
# reaches +split+.
#
# PUNCTUATION is interpolated into character classes; it is defined outside
# this view (presumably a string of punctuation characters -- verify).
def self.tokenize(sentence)
|
||||
# Earlier pattern: split on whitespace runs, and at the zero-width boundary
# between a PUNCTUATION character and an ASCII letter (in either order).
regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
|
||||
# Later pattern: same shape, but the zero-width boundaries additionally
# require whitespace -- after the punctuation character in the lookbehind,
# and after the punctuation run in the lookahead.
regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
|
||||
sentence.split(regex)
|
||||
end
|
||||
|
||||
|
@ -150,5 +150,12 @@ module Ebooks
|
|||
|
||||
false
|
||||
end
|
||||
|
||||
# Determine if a2 appears in a1 as a contiguous run of elements
|
||||
# Checks whether +a2+ occurs inside +a1+ as a contiguous run of elements
# (elements compared with ==, in order, with no gaps).
#
# @param a1 [Array] the sequence to search in
# @param a2 [Array] the run to look for
# @return [Boolean] true when +a2+ is found in +a1+, false otherwise
def self.subseq?(a1, a2)
  # An empty run is contained in every sequence, including an empty one;
  # this also avoids each_cons(0), which raises ArgumentError.
  return true if a2.empty?

  # Slide a window of a2's length across a1; any? yields a real boolean
  # instead of leaking the matching index (or nil) to callers.
  a1.each_cons(a2.length).any? { |window| window == a2 }
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue