Retry limit and mention separation

Mispy 2013-11-18 02:59:15 -08:00
parent c1d91d1693
commit 61c5caee4d
5 changed files with 65 additions and 21 deletions

View file

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    twitter_ebooks (2.0.7)
+    twitter_ebooks (2.1.0)
       engtagger
       fast-stemmer
       gingerice

View file

@@ -2,6 +2,8 @@
 require 'twitter_ebooks'
 
+$debug = true
+
 module Ebooks
   APP_PATH = Dir.pwd # XXX do some recursive thing instead

View file

@ -1,5 +1,7 @@
gem 'minitest' gem 'minitest'
$debug = false
def log(*args) def log(*args)
STDERR.puts args.map(&:to_s).join(' ') STDERR.puts args.map(&:to_s).join(' ')
STDERR.flush STDERR.flush
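
The two hunks above wire in a global $debug flag: the library defaults it to false, while the CLI entry point flips it to true, so generator tracing (see the suffix generator hunk at the end) only fires during interactive runs. A minimal standalone sketch of the pattern; the trace caller here is hypothetical, standing in for the generator's logging:

    $debug = false

    def log(*args)
      STDERR.puts args.map(&:to_s).join(' ')
      STDERR.flush
    end

    # Hypothetical caller, mirroring `log NLP.reconstruct(tokens) if $debug`
    # in the suffix generator: trace only when the flag is on.
    def trace(tokens)
      log 'pass output:', tokens.join(' ') if $debug
    end

    trace(%w[quiet by default]) # no output
    $debug = true
    trace(%w[now it logs])      # prints "pass output: now it logs" to stderr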

View file

@@ -7,7 +7,7 @@ require 'digest/md5'
 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :generator, :keywords
+    attr_accessor :hash, :sentences, :mentions, :keywords
 
     def self.consume(txtpath)
       Model.new.consume(txtpath)
@@ -22,23 +22,44 @@ module Ebooks
       @hash = Digest::MD5.hexdigest(File.read(txtpath))
       text = File.read(txtpath)
 
-      log "Removing commented lines and mention tokens"
+      log "Removing commented lines and sorting mentions"
 
       lines = text.split("\n")
       keeping = []
+      mentions = []
       lines.each do |l|
-        next if l.start_with?('#') || l.include?('RT')
-        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
-        keeping << processed.join(' ')
+        next if l.start_with?('#') # Remove commented lines
+        next if l.include?('RT') || l.include?('MT') # Remove soft retweets
+
+        if l.include?('@')
+          mentions << l
+        else
+          keeping << l
+        end
       end
 
-      text = NLP.normalize(keeping.join("\n"))
+      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
+      mention_text = NLP.normalize(mentions.join("\n"))
 
       log "Segmenting text into sentences"
-      sentences = NLP.sentences(text)
+      statements = NLP.sentences(text)
+      mentions = NLP.sentences(mention_text)
 
-      log "Tokenizing #{sentences.length} sentences"
-      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
+      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
+      @sentences = []
+      @mentions = []
+      statements.each do |s|
+        @sentences << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
+
+      mentions.each do |s|
+        @mentions << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
 
       log "Ranking keywords"
       @keywords = NLP.keywords(@sentences)
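
Net effect of the consume changes: lines that mention someone are no longer mangled in place but diverted into their own pool, and @/http tokens are dropped per sentence after segmentation. A rough standalone illustration of the partitioning, using plain whitespace splitting as a stand-in for NLP.tokenize:

    lines = [
      "# commented, skipped",
      "RT @someone: soft retweet, skipped",
      "@friend good morning http://example.com",
      "just a normal tweet"
    ]

    keeping, mentions = [], []
    lines.each do |l|
      next if l.start_with?('#')                   # commented lines
      next if l.include?('RT') || l.include?('MT') # soft retweets
      (l.include?('@') ? mentions : keeping) << l
    end

    # Stand-in for per-sentence tokenizing: drop handles and URLs
    # from the token stream, as the new reject blocks do.
    strip = ->(l) { l.split.reject { |t| t.start_with?('@', 'http') } }
    p mentions.map(&strip) # => [["good", "morning"]]
    p keeping.map(&strip)  # => [["just", "a", "normal", "tweet"]]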
@@ -72,38 +93,55 @@ module Ebooks
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end
 
-    def make_statement(limit=140, generator=nil)
+    def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
 
+      retries = 0
       tweet = ""
 
       while (tokens = generator.generate(3, :bigrams)) do
         next if tokens.length <= 3 && !responding
         break if valid_tweet?(tokens, limit)
+
+        retries += 1
+        break if retries >= retry_limit
       end
 
-      if @sentences.include?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
+      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
         while (tokens = generator.generate(3, :unigrams)) do
-          break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens)
+          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
+
+          retries += 1
+          break if retries >= retry_limit
         end
       end
 
       tweet = NLP.reconstruct(tokens)
 
+      if retries >= retry_limit
+        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
+      end
+
       fix tweet
     end
 
+    # Test if a sentence has been copied verbatim from original
+    def verbatim?(tokens)
+      @sentences.include?(tokens) || @mentions.include?(tokens)
+    end
+
     # Finds all relevant tokenized sentences to given input by
     # comparing non-stopword token overlaps
-    def relevant_sentences(input)
+    def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []
 
-      tokenized = NLP.tokenize(input)
+      tokenized = NLP.tokenize(input).map(&:downcase)
 
-      @sentences.each do |sent|
+      sentences.each do |sent|
         tokenized.each do |token|
-          if sent.include?(token)
+          if sent.map(&:downcase).include?(token)
             relevant << sent unless NLP.stopword?(token)
             slightly_relevant << sent
           end
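
The retry logic above bounds both sampling loops with a shared retries counter, so a tiny corpus that can only reproduce itself verbatim no longer spins forever. A simplified, self-contained sketch of that loop shape, with a stubbed sampler in place of generator.generate:

    RETRY_LIMIT = 10
    corpus = [%w[an original sentence here], %w[another original sentence here]]

    # Stub sampler: sometimes returns a corpus sentence verbatim,
    # sometimes a varied copy, roughly like a Markov pass might.
    sample = -> { rand < 0.5 ? corpus.sample : corpus.sample + %w[today] }

    retries = 0
    tokens = nil
    while (tokens = sample.call)
      break if tokens.length > 3 && !corpus.include?(tokens) # valid and non-verbatim
      retries += 1
      break if retries >= RETRY_LIMIT
    end

    if retries >= RETRY_LIMIT
      puts "Unable to produce valid non-verbatim tweet; using \"#{tokens.join(' ')}\""
    end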
@@ -115,9 +153,9 @@ module Ebooks
     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
-    def make_response(input, limit=140)
-      # First try
-      relevant, slightly_relevant = relevant_sentences(input)
+    def make_response(input, limit=140, sentences=@mentions)
+      # Prefer mentions
+      relevant, slightly_relevant = find_relevant(sentences, input)
 
       if relevant.length >= 3
         generator = SuffixGenerator.build(relevant)
@@ -125,6 +163,8 @@ module Ebooks
       elsif slightly_relevant.length >= 5
         generator = SuffixGenerator.build(slightly_relevant)
         make_statement(limit, generator)
+      elsif sentences.equal?(@mentions)
+        make_response(input, limit, @sentences)
       else
         make_statement(limit)
       end

View file

@@ -44,7 +44,7 @@ module Ebooks
       verbatim = [tokens] # Verbatim sentences to avoid reproducing
 
       0.upto(passes-1) do
-        puts NLP.reconstruct(tokens)
+        log NLP.reconstruct(tokens) if $debug
 
         varsites = {} # Map bigram start site => next token alternatives
         tokens.each_with_index do |token, i|
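
For orientation on the hunk above: varsites records, for each position in the current token sequence, which alternative next tokens the bigram model would permit there, and later steps splice those alternatives in. A toy illustration of collecting variant sites; the bigrams table is a hypothetical stand-in for SuffixGenerator's internal state:

    # Hypothetical bigram table: token => tokens that may follow it.
    bigrams = {
      "the" => %w[cat dog end],
      "cat" => %w[sat ran]
    }

    tokens = %w[the cat sat]
    varsites = {} # Map bigram start site => next token alternatives
    tokens.each_with_index do |token, i|
      alternatives = (bigrams[token] || []) - [tokens[i + 1]]
      varsites[i] = alternatives unless alternatives.empty?
    end
    p varsites # => {0=>["dog", "end"], 1=>["ran"]}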