Retry limit and mention separation

Mispy 2013-11-18 02:59:15 -08:00
parent c1d91d1693
commit 61c5caee4d
5 changed files with 65 additions and 21 deletions

View file

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitter_ebooks (2.0.7)
+    twitter_ebooks (2.1.0)
       engtagger
       fast-stemmer
       gingerice

View file

@@ -2,6 +2,8 @@
 require 'twitter_ebooks'
 
+$debug = true
+
 module Ebooks
   APP_PATH = Dir.pwd # XXX do some recursive thing instead

View file

@ -1,5 +1,7 @@
gem 'minitest' gem 'minitest'
$debug = false
def log(*args) def log(*args)
STDERR.puts args.map(&:to_s).join(' ') STDERR.puts args.map(&:to_s).join(' ')
STDERR.flush STDERR.flush
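
The two hunks above wire in a global $debug flag: the library defaults it to false, while the CLI entry point flips it to true, so generator tracing (see the suffix generator hunk at the end) only fires during interactive runs. A minimal standalone sketch of the pattern; the trace caller here is hypothetical, standing in for the generator's logging:

    $debug = false

    def log(*args)
      STDERR.puts args.map(&:to_s).join(' ')
      STDERR.flush
    end

    # Hypothetical caller, mirroring `log NLP.reconstruct(tokens) if $debug`
    # in the suffix generator: trace only when the flag is on.
    def trace(tokens)
      log 'pass output:', tokens.join(' ') if $debug
    end

    trace(%w[quiet by default]) # no output
    $debug = true
    trace(%w[now it logs])      # prints "pass output: now it logs" to stderr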

View file

@@ -7,7 +7,7 @@ require 'digest/md5'
 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :generator, :keywords
+    attr_accessor :hash, :sentences, :mentions, :keywords
 
     def self.consume(txtpath)
       Model.new.consume(txtpath)
@@ -22,23 +22,44 @@ module Ebooks
       @hash = Digest::MD5.hexdigest(File.read(txtpath))
       text = File.read(txtpath)
 
-      log "Removing commented lines and mention tokens"
+      log "Removing commented lines and sorting mentions"
 
       lines = text.split("\n")
       keeping = []
+      mentions = []
       lines.each do |l|
-        next if l.start_with?('#') || l.include?('RT')
-        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
-        keeping << processed.join(' ')
+        next if l.start_with?('#') # Remove commented lines
+        next if l.include?('RT') || l.include?('MT') # Remove soft retweets
+
+        if l.include?('@')
+          mentions << l
+        else
+          keeping << l
+        end
       end
 
-      text = NLP.normalize(keeping.join("\n"))
+      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
+      mention_text = NLP.normalize(mentions.join("\n"))
 
       log "Segmenting text into sentences"
-      sentences = NLP.sentences(text)
+      statements = NLP.sentences(text)
+      mentions = NLP.sentences(mention_text)
 
-      log "Tokenizing #{sentences.length} sentences"
-      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
+      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
+      @sentences = []
+      @mentions = []
+      statements.each do |s|
+        @sentences << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
+
+      mentions.each do |s|
+        @mentions << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
 
       log "Ranking keywords"
       @keywords = NLP.keywords(@sentences)
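
Net effect of the consume changes: lines that mention someone are no longer mangled in place but diverted into their own pool, and @/http tokens are dropped per sentence after segmentation. A rough standalone illustration of the partitioning, using plain whitespace splitting as a stand-in for NLP.tokenize:

    lines = [
      "# commented, skipped",
      "RT @someone: soft retweet, skipped",
      "@friend good morning http://example.com",
      "just a normal tweet"
    ]

    keeping, mentions = [], []
    lines.each do |l|
      next if l.start_with?('#')                   # commented lines
      next if l.include?('RT') || l.include?('MT') # soft retweets
      (l.include?('@') ? mentions : keeping) << l
    end

    # Stand-in for per-sentence tokenizing: drop handles and URLs
    # from the token stream, as the new reject blocks do.
    strip = ->(l) { l.split.reject { |t| t.start_with?('@', 'http') } }
    p mentions.map(&strip) # => [["good", "morning"]]
    p keeping.map(&strip)  # => [["just", "a", "normal", "tweet"]]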
@@ -72,38 +93,55 @@ module Ebooks
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end
 
-    def make_statement(limit=140, generator=nil)
+    def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
 
+      retries = 0
       tweet = ""
 
       while (tokens = generator.generate(3, :bigrams)) do
         next if tokens.length <= 3 && !responding
         break if valid_tweet?(tokens, limit)
+
+        retries += 1
+        break if retries >= retry_limit
       end
 
-      if @sentences.include?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
+      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
         while (tokens = generator.generate(3, :unigrams)) do
-          break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens)
+          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
+
+          retries += 1
+          break if retries >= retry_limit
         end
       end
 
       tweet = NLP.reconstruct(tokens)
 
+      if retries >= retry_limit
+        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
+      end
+
       fix tweet
     end
 
+    # Test if a sentence has been copied verbatim from original
+    def verbatim?(tokens)
+      @sentences.include?(tokens) || @mentions.include?(tokens)
+    end
+
     # Finds all relevant tokenized sentences to given input by
     # comparing non-stopword token overlaps
-    def relevant_sentences(input)
+    def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []
 
-      tokenized = NLP.tokenize(input)
+      tokenized = NLP.tokenize(input).map(&:downcase)
 
-      @sentences.each do |sent|
+      sentences.each do |sent|
         tokenized.each do |token|
-          if sent.include?(token)
+          if sent.map(&:downcase).include?(token)
             relevant << sent unless NLP.stopword?(token)
             slightly_relevant << sent
           end
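
The retry logic above bounds both sampling loops with a shared retries counter, so a tiny corpus that can only reproduce itself verbatim no longer spins forever. A simplified, self-contained sketch of that loop shape, with a stubbed sampler in place of generator.generate:

    RETRY_LIMIT = 10
    corpus = [%w[an original sentence here], %w[another original sentence here]]

    # Stub sampler: sometimes returns a corpus sentence verbatim,
    # sometimes a varied copy, roughly like a Markov pass might.
    sample = -> { rand < 0.5 ? corpus.sample : corpus.sample + %w[today] }

    retries = 0
    tokens = nil
    while (tokens = sample.call)
      break if tokens.length > 3 && !corpus.include?(tokens) # valid and non-verbatim
      retries += 1
      break if retries >= RETRY_LIMIT
    end

    if retries >= RETRY_LIMIT
      puts "Unable to produce valid non-verbatim tweet; using \"#{tokens.join(' ')}\""
    end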
@@ -115,9 +153,9 @@ module Ebooks
     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
-    def make_response(input, limit=140)
-      # First try
-      relevant, slightly_relevant = relevant_sentences(input)
+    def make_response(input, limit=140, sentences=@mentions)
+      # Prefer mentions
+      relevant, slightly_relevant = find_relevant(sentences, input)
 
       if relevant.length >= 3
         generator = SuffixGenerator.build(relevant)
@@ -125,6 +163,8 @@ module Ebooks
       elsif slightly_relevant.length >= 5
         generator = SuffixGenerator.build(slightly_relevant)
         make_statement(limit, generator)
+      elsif sentences.equal?(@mentions)
+        make_response(input, limit, @sentences)
       else
         make_statement(limit)
       end

View file

@@ -44,7 +44,7 @@ module Ebooks
       verbatim = [tokens] # Verbatim sentences to avoid reproducing
 
       0.upto(passes-1) do
-        puts NLP.reconstruct(tokens)
+        log NLP.reconstruct(tokens) if $debug
 
         varsites = {} # Map bigram start site => next token alternatives
         tokens.each_with_index do |token, i|
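
For orientation on the hunk above: varsites records, for each position in the current token sequence, which alternative next tokens the bigram model would permit there, and later steps splice those alternatives in. A toy illustration of collecting variant sites; the bigrams table is a hypothetical stand-in for SuffixGenerator's internal state:

    # Hypothetical bigram table: token => tokens that may follow it.
    bigrams = {
      "the" => %w[cat dog end],
      "cat" => %w[sat ran]
    }

    tokens = %w[the cat sat]
    varsites = {} # Map bigram start site => next token alternatives
    tokens.each_with_index do |token, i|
      alternatives = (bigrams[token] || []) - [tokens[i + 1]]
      varsites[i] = alternatives unless alternatives.empty?
    end
    p varsites # => {0=>["dog", "end"], 1=>["ran"]}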