Switch to using token indexes instead of strings

2014-10-24 09:55:49 -07:00 · 2014-10-24 09:55:49 -07:00 · 3b1d6f856d
commit 3b1d6f856d
parent 6ae1dd5dac
4 changed files with 79 additions and 49 deletions
--- a/lib/twitter_ebooks/nlp.rb
+++ b/lib/twitter_ebooks/nlp.rb
@@ -69,9 +69,9 @@ module Ebooks
      Stemmer::stem_word(word.downcase)
    end

-    def self.keywords(sentences)
+    def self.keywords(text)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
-      text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')
+      text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')

      text = Highscore::Content.new(text)

@@ -91,11 +91,12 @@ module Ebooks
    end

    # Takes a list of tokens and builds a nice-looking sentence
-    def self.reconstruct(tokens)
+    def self.reconstruct(tikis, tokens)
      text = ""
      last_token = nil
-      tokens.each do |token|
-        next if token == INTERIM
+      tikis.each do |tiki|
+        next if tiki == INTERIM
+        token = tokens[tiki]
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token