Switch to using token indexes instead of strings

This commit is contained in:
Jaiden Mispy 2014-10-24 09:55:49 -07:00
parent 6ae1dd5dac
commit 3b1d6f856d
4 changed files with 79 additions and 49 deletions

View file

@@ -18,18 +18,31 @@ module Ebooks
Marshal.load(File.open(path, 'rb') { |f| f.read }) Marshal.load(File.open(path, 'rb') { |f| f.read })
end end
def mass_tokenize(text) def initialize
sentences = NLP.sentences(text) # This is the only source of actual strings in the model. It is
tokens = [] # an array of unique tokens. Manipulation of a token is mostly done
# using its index in this array, which we call a "tiki"
@tokens = []
sentences.each do |s| # Reverse lookup tiki by token, for faster generation
tokens << NLP.tokenize(s).reject do |t| @tikis = {}
end
def tikify(token)
@tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
end
def mass_tikify(text)
sentences = NLP.sentences(text)
sentences.map do |s|
tokens = NLP.tokenize(s).reject do |t|
# Don't include usernames/urls as tokens # Don't include usernames/urls as tokens
t.include?('@') || t.include?('http') t.include?('@') || t.include?('http')
end end
end
tokens tokens.map { |t| tikify(t) }
end
end end
def consume(path) def consume(path)
@@ -76,11 +89,11 @@ module Ebooks
log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions" log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
@sentences = mass_tokenize(text) @sentences = mass_tikify(text)
@mentions = mass_tokenize(mention_text) @mentions = mass_tikify(mention_text)
log "Ranking keywords" log "Ranking keywords"
@keywords = NLP.keywords(@sentences) @keywords = NLP.keywords(text)
self self
end end
@@ -106,8 +119,8 @@ module Ebooks
NLP.htmlentities.decode tweet NLP.htmlentities.decode tweet
end end
def valid_tweet?(tokens, limit) def valid_tweet?(tikis, limit)
tweet = NLP.reconstruct(tokens) tweet = NLP.reconstruct(tikis, @tokens)
tweet.length <= limit && !NLP.unmatched_enclosers?(tweet) tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
end end
@@ -118,24 +131,24 @@ module Ebooks
retries = 0 retries = 0
tweet = "" tweet = ""
while (tokens = generator.generate(3, :bigrams)) do while (tikis = generator.generate(3, :bigrams)) do
next if tokens.length <= 3 && !responding next if tikis.length <= 3 && !responding
break if valid_tweet?(tokens, limit) break if valid_tweet?(tikis, limit)
retries += 1 retries += 1
break if retries >= retry_limit break if retries >= retry_limit
end end
if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
while (tokens = generator.generate(3, :unigrams)) do while (tikis = generator.generate(3, :unigrams)) do
break if valid_tweet?(tokens, limit) && !verbatim?(tokens) break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
retries += 1 retries += 1
break if retries >= retry_limit break if retries >= retry_limit
end end
end end
tweet = NLP.reconstruct(tokens) tweet = NLP.reconstruct(tikis, @tokens)
if retries >= retry_limit if retries >= retry_limit
log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\"" log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
@@ -159,7 +172,7 @@ module Ebooks
sentences.each do |sent| sentences.each do |sent|
tokenized.each do |token| tokenized.each do |token|
if sent.map(&:downcase).include?(token) if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
relevant << sent unless NLP.stopword?(token) relevant << sent unless NLP.stopword?(token)
slightly_relevant << sent slightly_relevant << sent
end end

View file

@@ -69,9 +69,9 @@ module Ebooks
Stemmer::stem_word(word.downcase) Stemmer::stem_word(word.downcase)
end end
def self.keywords(sentences) def self.keywords(text)
# Preprocess to remove stopwords (highscore's blacklist is v. slow) # Preprocess to remove stopwords (highscore's blacklist is v. slow)
text = sentences.flatten.reject { |t| stopword?(t) }.join(' ') text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
text = Highscore::Content.new(text) text = Highscore::Content.new(text)
@@ -91,11 +91,12 @@ module Ebooks
end end
# Takes a list of tokens and builds a nice-looking sentence # Takes a list of tokens and builds a nice-looking sentence
def self.reconstruct(tokens) def self.reconstruct(tikis, tokens)
text = "" text = ""
last_token = nil last_token = nil
tokens.each do |token| tikis.each do |tiki|
next if token == INTERIM next if tiki == INTERIM
token = tokens[tiki]
text += ' ' if last_token && space_between?(last_token, token) text += ' ' if last_token && space_between?(last_token, token)
text += token text += token
last_token = token last_token = token

View file

@@ -15,24 +15,24 @@ module Ebooks
@unigrams = {} @unigrams = {}
@bigrams = {} @bigrams = {}
@sentences.each_with_index do |tokens, i| @sentences.each_with_index do |tikis, i|
last_token = INTERIM last_tiki = INTERIM
tokens.each_with_index do |token, j| tikis.each_with_index do |tiki, j|
@unigrams[last_token] ||= [] @unigrams[last_tiki] ||= []
@unigrams[last_token] << [i, j] @unigrams[last_tiki] << [i, j]
@bigrams[last_token] ||= {} @bigrams[last_tiki] ||= {}
@bigrams[last_token][token] ||= [] @bigrams[last_tiki][tiki] ||= []
if j == tokens.length-1 # Mark sentence endings if j == tikis.length-1 # Mark sentence endings
@unigrams[token] ||= [] @unigrams[tiki] ||= []
@unigrams[token] << [i, INTERIM] @unigrams[tiki] << [i, INTERIM]
@bigrams[last_token][token] << [i, INTERIM] @bigrams[last_tiki][tiki] << [i, INTERIM]
else else
@bigrams[last_token][token] << [i, j+1] @bigrams[last_tiki][tiki] << [i, j+1]
end end
last_token = token last_tiki = tiki
end end
end end
@@ -41,19 +41,18 @@ module Ebooks
def generate(passes=5, n=:unigrams) def generate(passes=5, n=:unigrams)
index = rand(@sentences.length) index = rand(@sentences.length)
tokens = @sentences[index] tikis = @sentences[index]
used = [index] # Sentences we've already used used = [index] # Sentences we've already used
verbatim = [tokens] # Verbatim sentences to avoid reproducing verbatim = [tikis] # Verbatim sentences to avoid reproducing
0.upto(passes-1) do 0.upto(passes-1) do
log NLP.reconstruct(tokens) if $debug varsites = {} # Map bigram start site => next tiki alternatives
varsites = {} # Map bigram start site => next token alternatives
tokens.each_with_index do |token, i| tikis.each_with_index do |tiki, i|
next_token = tokens[i+1] next_tiki = tikis[i+1]
break if next_token.nil? break if next_tiki.nil?
alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token] alternatives = (n == :unigrams) ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]
# Filter out suffixes from previous sentences # Filter out suffixes from previous sentences
alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) } alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
varsites[i] = alternatives unless alternatives.empty? varsites[i] = alternatives unless alternatives.empty?
@@ -67,7 +66,7 @@ module Ebooks
start, alt = site[0], site[1].sample start, alt = site[0], site[1].sample
verbatim << @sentences[alt[0]] verbatim << @sentences[alt[0]]
suffix = @sentences[alt[0]][alt[1]..-1] suffix = @sentences[alt[0]][alt[1]..-1]
potential = tokens[0..start+1] + suffix potential = tikis[0..start+1] + suffix
# Ensure we're not just rebuilding some segment of another sentence # Ensure we're not just rebuilding some segment of another sentence
unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) } unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
@@ -80,10 +79,10 @@ module Ebooks
break if variant break if variant
end end
tokens = variant if variant tikis = variant if variant
end end
tokens tikis
end end
end end
end end

View file

@@ -5,6 +5,23 @@ require 'tempfile'
def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end
describe Ebooks::Model do describe Ebooks::Model do
describe 'making tweets' do
before(:all) { @model = Ebooks::Model.consume(path("data/0xabad1dea.json")) }
it "generates a tweet" do
s = @model.make_statement
expect(s.length).to be <= 140
puts s
end
it "generates an appropriate response" do
s = @model.make_response("hi")
expect(s.length).to be <= 140
expect(s.downcase).to include("hi")
puts s
end
end
it "does not use a ridiculous amount of memory" do it "does not use a ridiculous amount of memory" do
report = MemoryUsage.report do report = MemoryUsage.report do
model = Ebooks::Model.consume(path("data/0xabad1dea.json")) model = Ebooks::Model.consume(path("data/0xabad1dea.json"))