Retry limit and mention separation
parent c1d91d1693
commit 61c5caee4d
5 changed files with 65 additions and 21 deletions
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitter_ebooks (2.0.7)
+    twitter_ebooks (2.1.0)
       engtagger
       fast-stemmer
       gingerice
@@ -2,6 +2,8 @@
 require 'twitter_ebooks'

+$debug = true
+
 module Ebooks
   APP_PATH = Dir.pwd # XXX do some recursive thing instead

@@ -1,5 +1,7 @@
 gem 'minitest'

+$debug = false
+
 def log(*args)
   STDERR.puts args.map(&:to_s).join(' ')
   STDERR.flush
@@ -7,7 +7,7 @@ require 'digest/md5'

 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :generator, :keywords
+    attr_accessor :hash, :sentences, :mentions, :keywords

     def self.consume(txtpath)
       Model.new.consume(txtpath)
@@ -22,23 +22,44 @@
       @hash = Digest::MD5.hexdigest(File.read(txtpath))

       text = File.read(txtpath)
-      log "Removing commented lines and mention tokens"
+      log "Removing commented lines and sorting mentions"

       lines = text.split("\n")
       keeping = []
+      mentions = []
       lines.each do |l|
-        next if l.start_with?('#') || l.include?('RT')
-        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
-        keeping << processed.join(' ')
+        next if l.start_with?('#') # Remove commented lines
+        next if l.include?('RT') || l.include?('MT') # Remove soft retweets
+
+        if l.include?('@')
+          mentions << l
+        else
+          keeping << l
+        end
       end
-      text = NLP.normalize(keeping.join("\n"))
+      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
+      mention_text = NLP.normalize(mentions.join("\n"))

       log "Segmenting text into sentences"

-      sentences = NLP.sentences(text)
+      statements = NLP.sentences(text)
+      mentions = NLP.sentences(mention_text)

-      log "Tokenizing #{sentences.length} sentences"
-      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
+      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
+      @sentences = []
+      @mentions = []
+
+      statements.each do |s|
+        @sentences << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
+
+      mentions.each do |s|
+        @mentions << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end

       log "Ranking keywords"
       @keywords = NLP.keywords(@sentences)
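Note: the hunk above splits the corpus in two during consume: lines containing an @-mention feed the new @mentions corpus used for replies, while the rest becomes the ordinary statement corpus. A minimal standalone sketch of that partitioning step (plain Ruby, no twitter_ebooks required; the sample lines are invented for illustration):

# Sketch of the partitioning behaviour added to Model#consume:
# drop comments and soft retweets, route @-mention lines to a
# separate mentions corpus, keep everything else as statements.
lines = [
  "# this is a commented line",
  "RT @someone: quoted tweet",
  "@friend thanks for the link!",
  "just a normal tweet about lunch"
]

keeping  = []
mentions = []

lines.each do |l|
  next if l.start_with?('#')                   # Remove commented lines
  next if l.include?('RT') || l.include?('MT') # Remove soft retweets

  if l.include?('@')
    mentions << l
  else
    keeping << l
  end
end

p mentions # => ["@friend thanks for the link!"]
p keeping  # => ["just a normal tweet about lunch"]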
@@ -72,38 +93,55 @@ module Ebooks
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end

-    def make_statement(limit=140, generator=nil)
+    def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
+
+      retries = 0
       tweet = ""
+
       while (tokens = generator.generate(3, :bigrams)) do
         next if tokens.length <= 3 && !responding
         break if valid_tweet?(tokens, limit)
+
+        retries += 1
+        break if retries >= retry_limit
       end

-      if @sentences.include?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
+      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
         while (tokens = generator.generate(3, :unigrams)) do
-          break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens)
+          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
+
+          retries += 1
+          break if retries >= retry_limit
         end
       end

       tweet = NLP.reconstruct(tokens)
+
+      if retries >= retry_limit
+        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
+      end
+
       fix tweet
     end

+    # Test if a sentence has been copied verbatim from original
+    def verbatim?(tokens)
+      @sentences.include?(tokens) || @mentions.include?(tokens)
+    end
+
     # Finds all relevant tokenized sentences to given input by
     # comparing non-stopword token overlaps
-    def relevant_sentences(input)
+    def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []

-      tokenized = NLP.tokenize(input)
+      tokenized = NLP.tokenize(input).map(&:downcase)

-      @sentences.each do |sent|
+      sentences.each do |sent|
         tokenized.each do |token|
-          if sent.include?(token)
+          if sent.map(&:downcase).include?(token)
             relevant << sent unless NLP.stopword?(token)
             slightly_relevant << sent
           end
         end
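Note: the retry counter added above bounds both generation loops, so a degenerate corpus cannot spin forever looking for a valid, non-verbatim tweet. A standalone sketch of that bounded-retry pattern (the generator and validity check below are stand-ins for illustration, not twitter_ebooks code):

# Sketch of the bounded-retry pattern introduced in make_statement:
# ask a generator for candidates, stop on the first acceptable one,
# and give up after retry_limit attempts instead of looping forever.
def pick_candidate(limit = 15, retry_limit = 10)
  retries = 0
  candidate = nil

  loop do
    # Stand-in for generator.generate: a random lowercase "tweet"
    candidate = Array.new(rand(10..25)) { ('a'..'z').to_a.sample }.join
    break if candidate.length <= limit # stand-in for valid_tweet?
    retries += 1
    break if retries >= retry_limit
  end

  warn "Unable to produce a valid candidate; using #{candidate.inspect}" if retries >= retry_limit
  candidate
end

puts pick_candidate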
@@ -115,9 +153,9 @@ module Ebooks

     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
-    def make_response(input, limit=140)
-      # First try
-      relevant, slightly_relevant = relevant_sentences(input)
+    def make_response(input, limit=140, sentences=@mentions)
+      # Prefer mentions
+      relevant, slightly_relevant = find_relevant(sentences, input)

       if relevant.length >= 3
         generator = SuffixGenerator.build(relevant)
@@ -125,6 +163,8 @@
       elsif slightly_relevant.length >= 5
         generator = SuffixGenerator.build(slightly_relevant)
         make_statement(limit, generator)
+      elsif sentences.equal?(@mentions)
+        make_response(input, limit, @sentences)
       else
         make_statement(limit)
       end
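Note: with this change make_response searches @mentions first, retries against @sentences when too few relevant mention sentences are found, and only then falls back to an unrelated make_statement. A usage sketch (the corpus path is illustrative, and it assumes, as Model.consume implies, that consume returns the model):

require 'twitter_ebooks'

# Build a model from an archived corpus file (illustrative path)
model = Ebooks::Model.consume("corpus/example.txt")

# Replies now draw on the mentions corpus before the statement corpus
reply = model.make_response("what do you think about ruby?", 140)

# Standalone tweets are unchanged apart from the new retry_limit argument
tweet = model.make_statement(140)

puts reply
puts tweet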
@@ -44,7 +44,7 @@ module Ebooks
       verbatim = [tokens] # Verbatim sentences to avoid reproducing

       0.upto(passes-1) do
-        puts NLP.reconstruct(tokens)
+        log NLP.reconstruct(tokens) if $debug
         varsites = {} # Map bigram start site => next token alternatives

         tokens.each_with_index do |token, i|