From 3b1d6f856d092412dfe724c2dcc9d5036d0c1cc6 Mon Sep 17 00:00:00 2001 From: Jaiden Mispy <^_^@mispy.me> Date: Fri, 24 Oct 2014 09:55:49 -0700 Subject: [PATCH] Switch to using token indexes instead of strings --- lib/twitter_ebooks/model.rb | 53 ++++++++++++++++++++++-------------- lib/twitter_ebooks/nlp.rb | 11 ++++---- lib/twitter_ebooks/suffix.rb | 47 ++++++++++++++++---------------- spec/model_spec.rb | 17 ++++++++++++ 4 files changed, 79 insertions(+), 49 deletions(-) diff --git a/lib/twitter_ebooks/model.rb b/lib/twitter_ebooks/model.rb index 03f4fe5..33af43d 100644 --- a/lib/twitter_ebooks/model.rb +++ b/lib/twitter_ebooks/model.rb @@ -18,18 +18,31 @@ module Ebooks Marshal.load(File.open(path, 'rb') { |f| f.read }) end - def mass_tokenize(text) - sentences = NLP.sentences(text) - tokens = [] + def initialize + # This is the only source of actual strings in the model. It is + # an array of unique tokens. Manipulation of a token is mostly done + # using its index in this array, which we call a "tiki" + @tokens = [] - sentences.each do |s| - tokens << NLP.tokenize(s).reject do |t| + # Reverse lookup tiki by token, for faster generation + @tikis = {} + end + + def tikify(token) + @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1) + end + + def mass_tikify(text) + sentences = NLP.sentences(text) + + sentences.map do |s| + tokens = NLP.tokenize(s).reject do |t| # Don't include usernames/urls as tokens t.include?('@') || t.include?('http') end - end - tokens + tokens.map { |t| tikify(t) } + end end def consume(path) @@ -76,11 +89,11 @@ module Ebooks log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions" - @sentences = mass_tokenize(text) - @mentions = mass_tokenize(mention_text) + @sentences = mass_tikify(text) + @mentions = mass_tikify(mention_text) log "Ranking keywords" - @keywords = NLP.keywords(@sentences) + @keywords = NLP.keywords(text) self end @@ -106,8 +119,8 @@ module Ebooks NLP.htmlentities.decode tweet end - def valid_tweet?(tokens, limit) - tweet = NLP.reconstruct(tokens) + def valid_tweet?(tikis, limit) + tweet = NLP.reconstruct(tikis, @tokens) tweet.length <= limit && !NLP.unmatched_enclosers?(tweet) end @@ -118,24 +131,24 @@ module Ebooks retries = 0 tweet = "" - while (tokens = generator.generate(3, :bigrams)) do - next if tokens.length <= 3 && !responding - break if valid_tweet?(tokens, limit) + while (tikis = generator.generate(3, :bigrams)) do + next if tikis.length <= 3 && !responding + break if valid_tweet?(tikis, limit) retries += 1 break if retries >= retry_limit end - if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident - while (tokens = generator.generate(3, :unigrams)) do - break if valid_tweet?(tokens, limit) && !verbatim?(tokens) + if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident + while (tikis = generator.generate(3, :unigrams)) do + break if valid_tweet?(tikis, limit) && !verbatim?(tikis) retries += 1 break if retries >= retry_limit end end - tweet = NLP.reconstruct(tokens) + tweet = NLP.reconstruct(tikis, @tokens) if retries >= retry_limit log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\"" @@ -159,7 +172,7 @@ module Ebooks sentences.each do |sent| tokenized.each do |token| - if sent.map(&:downcase).include?(token) + if sent.map { |tiki| @tokens[tiki].downcase }.include?(token) relevant << sent unless NLP.stopword?(token) slightly_relevant << sent end diff --git a/lib/twitter_ebooks/nlp.rb b/lib/twitter_ebooks/nlp.rb index 11ad8a5..819ee69 100644 --- a/lib/twitter_ebooks/nlp.rb +++ b/lib/twitter_ebooks/nlp.rb @@ -69,9 +69,9 @@ module Ebooks Stemmer::stem_word(word.downcase) end - def self.keywords(sentences) + def self.keywords(text) # Preprocess to remove stopwords (highscore's blacklist is v. slow) - text = sentences.flatten.reject { |t| stopword?(t) }.join(' ') + text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ') text = Highscore::Content.new(text) @@ -91,11 +91,12 @@ module Ebooks end # Takes a list of tokens and builds a nice-looking sentence - def self.reconstruct(tokens) + def self.reconstruct(tikis, tokens) text = "" last_token = nil - tokens.each do |token| - next if token == INTERIM + tikis.each do |tiki| + next if tiki == INTERIM + token = tokens[tiki] text += ' ' if last_token && space_between?(last_token, token) text += token last_token = token diff --git a/lib/twitter_ebooks/suffix.rb b/lib/twitter_ebooks/suffix.rb index 0b10f64..6e09d7e 100644 --- a/lib/twitter_ebooks/suffix.rb +++ b/lib/twitter_ebooks/suffix.rb @@ -15,24 +15,24 @@ module Ebooks @unigrams = {} @bigrams = {} - @sentences.each_with_index do |tokens, i| - last_token = INTERIM - tokens.each_with_index do |token, j| - @unigrams[last_token] ||= [] - @unigrams[last_token] << [i, j] + @sentences.each_with_index do |tikis, i| + last_tiki = INTERIM + tikis.each_with_index do |tiki, j| + @unigrams[last_tiki] ||= [] + @unigrams[last_tiki] << [i, j] - @bigrams[last_token] ||= {} - @bigrams[last_token][token] ||= [] + @bigrams[last_tiki] ||= {} + @bigrams[last_tiki][tiki] ||= [] - if j == tokens.length-1 # Mark sentence endings - @unigrams[token] ||= [] - @unigrams[token] << [i, INTERIM] - @bigrams[last_token][token] << [i, INTERIM] + if j == tikis.length-1 # Mark sentence endings + @unigrams[tiki] ||= [] + @unigrams[tiki] << [i, INTERIM] + @bigrams[last_tiki][tiki] << [i, INTERIM] else - @bigrams[last_token][token] << [i, j+1] + @bigrams[last_tiki][tiki] << [i, j+1] end - last_token = token + last_tiki = tiki end end @@ -41,19 +41,18 @@ module Ebooks def generate(passes=5, n=:unigrams) index = rand(@sentences.length) - tokens = @sentences[index] + tikis = @sentences[index] used = [index] # Sentences we've already used - verbatim = [tokens] # Verbatim sentences to avoid reproducing + verbatim = [tikis] # Verbatim sentences to avoid reproducing 0.upto(passes-1) do - log NLP.reconstruct(tokens) if $debug - varsites = {} # Map bigram start site => next token alternatives + varsites = {} # Map bigram start site => next tiki alternatives - tokens.each_with_index do |token, i| - next_token = tokens[i+1] - break if next_token.nil? + tikis.each_with_index do |tiki, i| + next_tiki = tikis[i+1] + break if next_tiki.nil? - alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token] + alternatives = (n == :unigrams) ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki] # Filter out suffixes from previous sentences alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) } varsites[i] = alternatives unless alternatives.empty? @@ -67,7 +66,7 @@ module Ebooks start, alt = site[0], site[1].sample verbatim << @sentences[alt[0]] suffix = @sentences[alt[0]][alt[1]..-1] - potential = tokens[0..start+1] + suffix + potential = tikis[0..start+1] + suffix # Ensure we're not just rebuilding some segment of another sentence unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) } @@ -80,10 +79,10 @@ module Ebooks break if variant end - tokens = variant if variant + tikis = variant if variant end - tokens + tikis end end end diff --git a/spec/model_spec.rb b/spec/model_spec.rb index 4837735..c20b80d 100644 --- a/spec/model_spec.rb +++ b/spec/model_spec.rb @@ -5,6 +5,23 @@ require 'tempfile' def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end describe Ebooks::Model do + describe 'making tweets' do + before(:all) { @model = Ebooks::Model.consume(path("data/0xabad1dea.json")) } + + it "generates a tweet" do + s = @model.make_statement + expect(s.length).to be <= 140 + puts s + end + + it "generates an appropriate response" do + s = @model.make_response("hi") + expect(s.length).to be <= 140 + expect(s.downcase).to include("hi") + puts s + end + end + it "does not use a ridiculous amount of memory" do report = MemoryUsage.report do model = Ebooks::Model.consume(path("data/0xabad1dea.json"))