Switch to using token indexes instead of strings
This commit is contained in:
parent
6ae1dd5dac
commit
3b1d6f856d
4 changed files with 79 additions and 49 deletions
|
@ -18,18 +18,31 @@ module Ebooks
|
||||||
Marshal.load(File.open(path, 'rb') { |f| f.read })
|
Marshal.load(File.open(path, 'rb') { |f| f.read })
|
||||||
end
|
end
|
||||||
|
|
||||||
def mass_tokenize(text)
|
def initialize
|
||||||
sentences = NLP.sentences(text)
|
# This is the only source of actual strings in the model. It is
|
||||||
tokens = []
|
# an array of unique tokens. Manipulation of a token is mostly done
|
||||||
|
# using its index in this array, which we call a "tiki"
|
||||||
|
@tokens = []
|
||||||
|
|
||||||
sentences.each do |s|
|
# Reverse lookup tiki by token, for faster generation
|
||||||
tokens << NLP.tokenize(s).reject do |t|
|
@tikis = {}
|
||||||
|
end
|
||||||
|
|
||||||
|
def tikify(token)
|
||||||
|
@tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
|
||||||
|
end
|
||||||
|
|
||||||
|
def mass_tikify(text)
|
||||||
|
sentences = NLP.sentences(text)
|
||||||
|
|
||||||
|
sentences.map do |s|
|
||||||
|
tokens = NLP.tokenize(s).reject do |t|
|
||||||
# Don't include usernames/urls as tokens
|
# Don't include usernames/urls as tokens
|
||||||
t.include?('@') || t.include?('http')
|
t.include?('@') || t.include?('http')
|
||||||
end
|
end
|
||||||
end
|
|
||||||
|
|
||||||
tokens
|
tokens.map { |t| tikify(t) }
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def consume(path)
|
def consume(path)
|
||||||
|
@ -76,11 +89,11 @@ module Ebooks
|
||||||
|
|
||||||
log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
|
log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
|
||||||
|
|
||||||
@sentences = mass_tokenize(text)
|
@sentences = mass_tikify(text)
|
||||||
@mentions = mass_tokenize(mention_text)
|
@mentions = mass_tikify(mention_text)
|
||||||
|
|
||||||
log "Ranking keywords"
|
log "Ranking keywords"
|
||||||
@keywords = NLP.keywords(@sentences)
|
@keywords = NLP.keywords(text)
|
||||||
|
|
||||||
self
|
self
|
||||||
end
|
end
|
||||||
|
@ -106,8 +119,8 @@ module Ebooks
|
||||||
NLP.htmlentities.decode tweet
|
NLP.htmlentities.decode tweet
|
||||||
end
|
end
|
||||||
|
|
||||||
def valid_tweet?(tokens, limit)
|
def valid_tweet?(tikis, limit)
|
||||||
tweet = NLP.reconstruct(tokens)
|
tweet = NLP.reconstruct(tikis, @tokens)
|
||||||
tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
|
tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -118,24 +131,24 @@ module Ebooks
|
||||||
retries = 0
|
retries = 0
|
||||||
tweet = ""
|
tweet = ""
|
||||||
|
|
||||||
while (tokens = generator.generate(3, :bigrams)) do
|
while (tikis = generator.generate(3, :bigrams)) do
|
||||||
next if tokens.length <= 3 && !responding
|
next if tikis.length <= 3 && !responding
|
||||||
break if valid_tweet?(tokens, limit)
|
break if valid_tweet?(tikis, limit)
|
||||||
|
|
||||||
retries += 1
|
retries += 1
|
||||||
break if retries >= retry_limit
|
break if retries >= retry_limit
|
||||||
end
|
end
|
||||||
|
|
||||||
if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
|
if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
|
||||||
while (tokens = generator.generate(3, :unigrams)) do
|
while (tikis = generator.generate(3, :unigrams)) do
|
||||||
break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
|
break if valid_tweet?(tikis, limit) && !verbatim?(tikis)
|
||||||
|
|
||||||
retries += 1
|
retries += 1
|
||||||
break if retries >= retry_limit
|
break if retries >= retry_limit
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
tweet = NLP.reconstruct(tokens)
|
tweet = NLP.reconstruct(tikis, @tokens)
|
||||||
|
|
||||||
if retries >= retry_limit
|
if retries >= retry_limit
|
||||||
log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
|
log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
|
||||||
|
@ -159,7 +172,7 @@ module Ebooks
|
||||||
|
|
||||||
sentences.each do |sent|
|
sentences.each do |sent|
|
||||||
tokenized.each do |token|
|
tokenized.each do |token|
|
||||||
if sent.map(&:downcase).include?(token)
|
if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
|
||||||
relevant << sent unless NLP.stopword?(token)
|
relevant << sent unless NLP.stopword?(token)
|
||||||
slightly_relevant << sent
|
slightly_relevant << sent
|
||||||
end
|
end
|
||||||
|
|
|
@ -69,9 +69,9 @@ module Ebooks
|
||||||
Stemmer::stem_word(word.downcase)
|
Stemmer::stem_word(word.downcase)
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.keywords(sentences)
|
def self.keywords(text)
|
||||||
# Preprocess to remove stopwords (highscore's blacklist is v. slow)
|
# Preprocess to remove stopwords (highscore's blacklist is v. slow)
|
||||||
text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')
|
text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
|
||||||
|
|
||||||
text = Highscore::Content.new(text)
|
text = Highscore::Content.new(text)
|
||||||
|
|
||||||
|
@ -91,11 +91,12 @@ module Ebooks
|
||||||
end
|
end
|
||||||
|
|
||||||
# Takes a list of tokens and builds a nice-looking sentence
|
# Takes a list of tikis (indexes into the tokens array) and builds a nice-looking sentence
|
||||||
def self.reconstruct(tokens)
|
def self.reconstruct(tikis, tokens)
|
||||||
text = ""
|
text = ""
|
||||||
last_token = nil
|
last_token = nil
|
||||||
tokens.each do |token|
|
tikis.each do |tiki|
|
||||||
next if token == INTERIM
|
next if tiki == INTERIM
|
||||||
|
token = tokens[tiki]
|
||||||
text += ' ' if last_token && space_between?(last_token, token)
|
text += ' ' if last_token && space_between?(last_token, token)
|
||||||
text += token
|
text += token
|
||||||
last_token = token
|
last_token = token
|
||||||
|
|
|
@ -15,24 +15,24 @@ module Ebooks
|
||||||
@unigrams = {}
|
@unigrams = {}
|
||||||
@bigrams = {}
|
@bigrams = {}
|
||||||
|
|
||||||
@sentences.each_with_index do |tokens, i|
|
@sentences.each_with_index do |tikis, i|
|
||||||
last_token = INTERIM
|
last_tiki = INTERIM
|
||||||
tokens.each_with_index do |token, j|
|
tikis.each_with_index do |tiki, j|
|
||||||
@unigrams[last_token] ||= []
|
@unigrams[last_tiki] ||= []
|
||||||
@unigrams[last_token] << [i, j]
|
@unigrams[last_tiki] << [i, j]
|
||||||
|
|
||||||
@bigrams[last_token] ||= {}
|
@bigrams[last_tiki] ||= {}
|
||||||
@bigrams[last_token][token] ||= []
|
@bigrams[last_tiki][tiki] ||= []
|
||||||
|
|
||||||
if j == tokens.length-1 # Mark sentence endings
|
if j == tikis.length-1 # Mark sentence endings
|
||||||
@unigrams[token] ||= []
|
@unigrams[tiki] ||= []
|
||||||
@unigrams[token] << [i, INTERIM]
|
@unigrams[tiki] << [i, INTERIM]
|
||||||
@bigrams[last_token][token] << [i, INTERIM]
|
@bigrams[last_tiki][tiki] << [i, INTERIM]
|
||||||
else
|
else
|
||||||
@bigrams[last_token][token] << [i, j+1]
|
@bigrams[last_tiki][tiki] << [i, j+1]
|
||||||
end
|
end
|
||||||
|
|
||||||
last_token = token
|
last_tiki = tiki
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -41,19 +41,18 @@ module Ebooks
|
||||||
|
|
||||||
def generate(passes=5, n=:unigrams)
|
def generate(passes=5, n=:unigrams)
|
||||||
index = rand(@sentences.length)
|
index = rand(@sentences.length)
|
||||||
tokens = @sentences[index]
|
tikis = @sentences[index]
|
||||||
used = [index] # Sentences we've already used
|
used = [index] # Sentences we've already used
|
||||||
verbatim = [tokens] # Verbatim sentences to avoid reproducing
|
verbatim = [tikis] # Verbatim sentences to avoid reproducing
|
||||||
|
|
||||||
0.upto(passes-1) do
|
0.upto(passes-1) do
|
||||||
log NLP.reconstruct(tokens) if $debug
|
varsites = {} # Map bigram start site => next tiki alternatives
|
||||||
varsites = {} # Map bigram start site => next token alternatives
|
|
||||||
|
|
||||||
tokens.each_with_index do |token, i|
|
tikis.each_with_index do |tiki, i|
|
||||||
next_token = tokens[i+1]
|
next_tiki = tikis[i+1]
|
||||||
break if next_token.nil?
|
break if next_tiki.nil?
|
||||||
|
|
||||||
alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token]
|
alternatives = (n == :unigrams) ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]
|
||||||
# Filter out suffixes from previous sentences
|
# Filter out suffixes from previous sentences
|
||||||
alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
|
alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
|
||||||
varsites[i] = alternatives unless alternatives.empty?
|
varsites[i] = alternatives unless alternatives.empty?
|
||||||
|
@ -67,7 +66,7 @@ module Ebooks
|
||||||
start, alt = site[0], site[1].sample
|
start, alt = site[0], site[1].sample
|
||||||
verbatim << @sentences[alt[0]]
|
verbatim << @sentences[alt[0]]
|
||||||
suffix = @sentences[alt[0]][alt[1]..-1]
|
suffix = @sentences[alt[0]][alt[1]..-1]
|
||||||
potential = tokens[0..start+1] + suffix
|
potential = tikis[0..start+1] + suffix
|
||||||
|
|
||||||
# Ensure we're not just rebuilding some segment of another sentence
|
# Ensure we're not just rebuilding some segment of another sentence
|
||||||
unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
|
unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
|
||||||
|
@ -80,10 +79,10 @@ module Ebooks
|
||||||
break if variant
|
break if variant
|
||||||
end
|
end
|
||||||
|
|
||||||
tokens = variant if variant
|
tikis = variant if variant
|
||||||
end
|
end
|
||||||
|
|
||||||
tokens
|
tikis
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -5,6 +5,23 @@ require 'tempfile'
|
||||||
def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end
|
def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end
|
||||||
|
|
||||||
describe Ebooks::Model do
|
describe Ebooks::Model do
|
||||||
|
describe 'making tweets' do
|
||||||
|
before(:all) { @model = Ebooks::Model.consume(path("data/0xabad1dea.json")) }
|
||||||
|
|
||||||
|
it "generates a tweet" do
|
||||||
|
s = @model.make_statement
|
||||||
|
expect(s.length).to be <= 140
|
||||||
|
puts s
|
||||||
|
end
|
||||||
|
|
||||||
|
it "generates an appropriate response" do
|
||||||
|
s = @model.make_response("hi")
|
||||||
|
expect(s.length).to be <= 140
|
||||||
|
expect(s.downcase).to include("hi")
|
||||||
|
puts s
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
it "does not use a ridiculous amount of memory" do
|
it "does not use a ridiculous amount of memory" do
|
||||||
report = MemoryUsage.report do
|
report = MemoryUsage.report do
|
||||||
model = Ebooks::Model.consume(path("data/0xabad1dea.json"))
|
model = Ebooks::Model.consume(path("data/0xabad1dea.json"))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue