Switch to using token indexes instead of strings

This commit is contained in:
Jaiden Mispy 2014-10-24 09:55:49 -07:00
parent 6ae1dd5dac
commit 3b1d6f856d
4 changed files with 79 additions and 49 deletions

View file

@@ -18,18 +18,31 @@ module Ebooks
Marshal.load(File.open(path, 'rb') { |f| f.read }) Marshal.load(File.open(path, 'rb') { |f| f.read })
end end
def mass_tokenize(text) def initialize
sentences = NLP.sentences(text) # This is the only source of actual strings in the model. It is
tokens = [] # an array of unique tokens. Manipulation of a token is mostly done
# using its index in this array, which we call a "tiki"
@tokens = []
sentences.each do |s| # Reverse lookup tiki by token, for faster generation
tokens << NLP.tokenize(s).reject do |t| @tikis = {}
end
def tikify(token)
@tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
end
def mass_tikify(text)
sentences = NLP.sentences(text)
sentences.map do |s|
tokens = NLP.tokenize(s).reject do |t|
# Don't include usernames/urls as tokens # Don't include usernames/urls as tokens
t.include?('@') || t.include?('http') t.include?('@') || t.include?('http')
end end
end
tokens tokens.map { |t| tikify(t) }
end
end end
def consume(path) def consume(path)
@@ -76,11 +89,11 @@ module Ebooks
log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions" log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
@sentences = mass_tokenize(text) @sentences = mass_tikify(text)
@mentions = mass_tokenize(mention_text) @mentions = mass_tikify(mention_text)
log "Ranking keywords" log "Ranking keywords"
@keywords = NLP.keywords(@sentences) @keywords = NLP.keywords(text)
self self
end end
@@ -106,8 +119,8 @@ module Ebooks
NLP.htmlentities.decode tweet NLP.htmlentities.decode tweet
end end
def valid_tweet?(tokens, limit) def valid_tweet?(tikis, limit)
tweet = NLP.reconstruct(tokens) tweet = NLP.reconstruct(tikis, @tokens)
tweet.length <= limit && !NLP.unmatched_enclosers?(tweet) tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
end end
@@ -118,24 +131,24 @@ module Ebooks
retries = 0 retries = 0
tweet = "" tweet = ""
while (tokens = generator.generate(3, :bigrams)) do while (tikis = generator.generate(3, :bigrams)) do
next if tokens.length <= 3 && !responding next if tikis.length <= 3 && !responding
break if valid_tweet?(tokens, limit) break if valid_tweet?(tikis, limit)
retries += 1 retries += 1
break if retries >= retry_limit break if retries >= retry_limit
end end
if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
while (tokens = generator.generate(3, :unigrams)) do while (tikis = generator.generate(3, :unigrams)) do
break if valid_tweet?(tokens, limit) && !verbatim?(tokens) break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
retries += 1 retries += 1
break if retries >= retry_limit break if retries >= retry_limit
end end
end end
tweet = NLP.reconstruct(tokens) tweet = NLP.reconstruct(tikis, @tokens)
if retries >= retry_limit if retries >= retry_limit
log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\"" log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
@@ -159,7 +172,7 @@ module Ebooks
sentences.each do |sent| sentences.each do |sent|
tokenized.each do |token| tokenized.each do |token|
if sent.map(&:downcase).include?(token) if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
relevant << sent unless NLP.stopword?(token) relevant << sent unless NLP.stopword?(token)
slightly_relevant << sent slightly_relevant << sent
end end

View file

@@ -69,9 +69,9 @@ module Ebooks
Stemmer::stem_word(word.downcase) Stemmer::stem_word(word.downcase)
end end
def self.keywords(sentences) def self.keywords(text)
# Preprocess to remove stopwords (highscore's blacklist is v. slow) # Preprocess to remove stopwords (highscore's blacklist is v. slow)
text = sentences.flatten.reject { |t| stopword?(t) }.join(' ') text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
text = Highscore::Content.new(text) text = Highscore::Content.new(text)
@@ -91,11 +91,12 @@ module Ebooks
end end
# Takes a list of tokens and builds a nice-looking sentence # Takes a list of tokens and builds a nice-looking sentence
def self.reconstruct(tokens) def self.reconstruct(tikis, tokens)
text = "" text = ""
last_token = nil last_token = nil
tokens.each do |token| tikis.each do |tiki|
next if token == INTERIM next if tiki == INTERIM
token = tokens[tiki]
text += ' ' if last_token && space_between?(last_token, token) text += ' ' if last_token && space_between?(last_token, token)
text += token text += token
last_token = token last_token = token

View file

@@ -15,24 +15,24 @@ module Ebooks
@unigrams = {} @unigrams = {}
@bigrams = {} @bigrams = {}
@sentences.each_with_index do |tokens, i| @sentences.each_with_index do |tikis, i|
last_token = INTERIM last_tiki = INTERIM
tokens.each_with_index do |token, j| tikis.each_with_index do |tiki, j|
@unigrams[last_token] ||= [] @unigrams[last_tiki] ||= []
@unigrams[last_token] << [i, j] @unigrams[last_tiki] << [i, j]
@bigrams[last_token] ||= {} @bigrams[last_tiki] ||= {}
@bigrams[last_token][token] ||= [] @bigrams[last_tiki][tiki] ||= []
if j == tokens.length-1 # Mark sentence endings if j == tikis.length-1 # Mark sentence endings
@unigrams[token] ||= [] @unigrams[tiki] ||= []
@unigrams[token] << [i, INTERIM] @unigrams[tiki] << [i, INTERIM]
@bigrams[last_token][token] << [i, INTERIM] @bigrams[last_tiki][tiki] << [i, INTERIM]
else else
@bigrams[last_token][token] << [i, j+1] @bigrams[last_tiki][tiki] << [i, j+1]
end end
last_token = token last_tiki = tiki
end end
end end
@@ -41,19 +41,18 @@ module Ebooks
def generate(passes=5, n=:unigrams) def generate(passes=5, n=:unigrams)
index = rand(@sentences.length) index = rand(@sentences.length)
tokens = @sentences[index] tikis = @sentences[index]
used = [index] # Sentences we've already used used = [index] # Sentences we've already used
verbatim = [tokens] # Verbatim sentences to avoid reproducing verbatim = [tikis] # Verbatim sentences to avoid reproducing
0.upto(passes-1) do 0.upto(passes-1) do
log NLP.reconstruct(tokens) if $debug varsites = {} # Map bigram start site => next tiki alternatives
varsites = {} # Map bigram start site => next token alternatives
tokens.each_with_index do |token, i| tikis.each_with_index do |tiki, i|
next_token = tokens[i+1] next_tiki = tikis[i+1]
break if next_token.nil? break if next_tiki.nil?
alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token] alternatives = (n == :unigrams) ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]
# Filter out suffixes from previous sentences # Filter out suffixes from previous sentences
alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) } alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
varsites[i] = alternatives unless alternatives.empty? varsites[i] = alternatives unless alternatives.empty?
@@ -67,7 +66,7 @@ module Ebooks
start, alt = site[0], site[1].sample start, alt = site[0], site[1].sample
verbatim << @sentences[alt[0]] verbatim << @sentences[alt[0]]
suffix = @sentences[alt[0]][alt[1]..-1] suffix = @sentences[alt[0]][alt[1]..-1]
potential = tokens[0..start+1] + suffix potential = tikis[0..start+1] + suffix
# Ensure we're not just rebuilding some segment of another sentence # Ensure we're not just rebuilding some segment of another sentence
unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) } unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
@@ -80,10 +79,10 @@ module Ebooks
break if variant break if variant
end end
tokens = variant if variant tikis = variant if variant
end end
tokens tikis
end end
end end
end end

View file

@@ -5,6 +5,23 @@ require 'tempfile'
def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end
describe Ebooks::Model do describe Ebooks::Model do
describe 'making tweets' do
before(:all) { @model = Ebooks::Model.consume(path("data/0xabad1dea.json")) }
it "generates a tweet" do
s = @model.make_statement
expect(s.length).to be <= 140
puts s
end
it "generates an appropriate response" do
s = @model.make_response("hi")
expect(s.length).to be <= 140
expect(s.downcase).to include("hi")
puts s
end
end
it "does not use a ridiculous amount of memory" do it "does not use a ridiculous amount of memory" do
report = MemoryUsage.report do report = MemoryUsage.report do
model = Ebooks::Model.consume(path("data/0xabad1dea.json")) model = Ebooks::Model.consume(path("data/0xabad1dea.json"))