From 61c5caee4dc05078d5d47b19e61b30c104d673c7 Mon Sep 17 00:00:00 2001
From: Mispy <^_^@mispy.me>
Date: Mon, 18 Nov 2013 02:59:15 -0800
Subject: [PATCH] Retry limit and mention separation

---
 Gemfile.lock                 |  2 +-
 bin/ebooks                   |  2 +
 lib/twitter_ebooks.rb        |  2 +
 lib/twitter_ebooks/model.rb  | 78 +++++++++++++++++++++++++++---------
 lib/twitter_ebooks/suffix.rb |  2 +-
 5 files changed, 65 insertions(+), 21 deletions(-)

diff --git a/Gemfile.lock b/Gemfile.lock
index 3c52f77..a041c94 100755
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitter_ebooks (2.0.7)
+    twitter_ebooks (2.1.0)
       engtagger
       fast-stemmer
       gingerice
diff --git a/bin/ebooks b/bin/ebooks
index e6f845e..3baee71 100755
--- a/bin/ebooks
+++ b/bin/ebooks
@@ -2,6 +2,8 @@
 
 require 'twitter_ebooks'
 
+$debug = true
+
 module Ebooks
   APP_PATH = Dir.pwd # XXX do some recursive thing instead
 
diff --git a/lib/twitter_ebooks.rb b/lib/twitter_ebooks.rb
index a41c4eb..7a47976 100755
--- a/lib/twitter_ebooks.rb
+++ b/lib/twitter_ebooks.rb
@@ -1,5 +1,7 @@
 gem 'minitest'
 
+$debug = false
+
 def log(*args)
   STDERR.puts args.map(&:to_s).join(' ')
   STDERR.flush
diff --git a/lib/twitter_ebooks/model.rb b/lib/twitter_ebooks/model.rb
index 65f41a3..0742197 100755
--- a/lib/twitter_ebooks/model.rb
+++ b/lib/twitter_ebooks/model.rb
@@ -7,7 +7,7 @@ require 'digest/md5'
 
 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :generator, :keywords
+    attr_accessor :hash, :sentences, :mentions, :keywords
 
     def self.consume(txtpath)
       Model.new.consume(txtpath)
@@ -22,23 +22,44 @@ module Ebooks
       @hash = Digest::MD5.hexdigest(File.read(txtpath))
 
       text = File.read(txtpath)
-      log "Removing commented lines and mention tokens"
+      log "Removing commented lines and sorting mentions"
 
       lines = text.split("\n")
       keeping = []
+      mentions = []
       lines.each do |l|
-        next if l.start_with?('#') || l.include?('RT')
-        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
-        keeping << processed.join(' ')
+        next if l.start_with?('#') # Remove commented lines
+        next if l.include?('RT') || l.include?('MT') # Remove soft retweets
+
+        if l.include?('@')
+          mentions << l
+        else
+          keeping << l
+        end
       end
-      text = NLP.normalize(keeping.join("\n"))
+      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
+      mention_text = NLP.normalize(mentions.join("\n"))
 
       log "Segmenting text into sentences"
-      sentences = NLP.sentences(text)
+      statements = NLP.sentences(text)
+      mentions = NLP.sentences(mention_text)
 
-      log "Tokenizing #{sentences.length} sentences"
-      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
+      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
+      @sentences = []
+      @mentions = []
+
+      statements.each do |s|
+        @sentences << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
+
+      mentions.each do |s|
+        @mentions << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
 
       log "Ranking keywords"
       @keywords = NLP.keywords(@sentences)
 
@@ -72,38 +93,55 @@ module Ebooks
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end
 
-    def make_statement(limit=140, generator=nil)
+    def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
+
+      retries = 0
       tweet = ""
 
      while (tokens = generator.generate(3, :bigrams)) do
        next if tokens.length <= 3 && !responding
        break if valid_tweet?(tokens, limit)
+
+        retries += 1
+        break if retries >= retry_limit
      end
 
-      if @sentences.include?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
+      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
        while (tokens = generator.generate(3, :unigrams)) do
-          break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens)
+          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
+
+          retries += 1
+          break if retries >= retry_limit
        end
      end
 
       tweet = NLP.reconstruct(tokens)
 
+      if retries >= retry_limit
+        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
+      end
+
       fix tweet
     end
 
+    # Test if a sentence has been copied verbatim from original
+    def verbatim?(tokens)
+      @sentences.include?(tokens) || @mentions.include?(tokens)
+    end
+
     # Finds all relevant tokenized sentences to given input by
     # comparing non-stopword token overlaps
-    def relevant_sentences(input)
+    def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []
 
-      tokenized = NLP.tokenize(input)
+      tokenized = NLP.tokenize(input).map(&:downcase)
 
-      @sentences.each do |sent|
+      sentences.each do |sent|
         tokenized.each do |token|
-          if sent.include?(token)
+          if sent.map(&:downcase).include?(token)
             relevant << sent unless NLP.stopword?(token)
             slightly_relevant << sent
           end
@@ -115,9 +153,9 @@ module Ebooks
 
     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
-    def make_response(input, limit=140)
-      # First try
-      relevant, slightly_relevant = relevant_sentences(input)
+    def make_response(input, limit=140, sentences=@mentions)
+      # Prefer mentions
+      relevant, slightly_relevant = find_relevant(sentences, input)
 
       if relevant.length >= 3
         generator = SuffixGenerator.build(relevant)
@@ -125,6 +163,8 @@ module Ebooks
       elsif slightly_relevant.length >= 5
         generator = SuffixGenerator.build(slightly_relevant)
         make_statement(limit, generator)
+      elsif sentences.equal?(@mentions)
+        make_response(input, limit, @sentences)
       else
         make_statement(limit)
       end
diff --git a/lib/twitter_ebooks/suffix.rb b/lib/twitter_ebooks/suffix.rb
index df89e0d..9a4bd04 100755
--- a/lib/twitter_ebooks/suffix.rb
+++ b/lib/twitter_ebooks/suffix.rb
@@ -44,7 +44,7 @@ module Ebooks
       verbatim = [tokens] # Verbatim sentences to avoid reproducing
 
       0.upto(passes-1) do
-        puts NLP.reconstruct(tokens)
+        log NLP.reconstruct(tokens) if $debug
         varsites = {} # Map bigram start site => next token alternatives
 
         tokens.each_with_index do |token, i|
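
A minimal sketch of how the changed API fits together, assuming a plain-text corpus with one tweet per line; the filename and input string below are illustrative, not part of the commit:

require 'twitter_ebooks'

# Lines containing '@' now feed a separate @mentions corpus;
# everything else becomes @sentences.
model = Ebooks::Model.consume('corpus.txt')

# make_statement now gives up after retry_limit (default 10) failed
# attempts at a valid, non-verbatim tweet instead of looping forever,
# logging the candidate it settles for.
puts model.make_statement(140)

# make_response searches @mentions first; if it can't find at least 3
# relevant (or 5 slightly relevant) sentences there, it recurses once
# over @sentences, and finally falls back to a plain statement.
puts model.make_response('what do you think of ruby?', 140)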