2.0.8 -- different generation algorithm
This commit is contained in:
parent
e87dc5862b
commit
00f0228dd4
24 changed files with 127 additions and 27 deletions
82
lib/twitter_ebooks/suffix.rb
Executable file
82
lib/twitter_ebooks/suffix.rb
Executable file
|
@ -0,0 +1,82 @@
|
|||
module Ebooks
  # Builds a Markov-style suffix index over tokenized sentences and
  # generates novel token sequences by splicing suffixes from one corpus
  # sentence into another.
  class SuffixGenerator
    # Convenience constructor.
    # @param sentences [Array<Array>] tokenized sentences (arrays of tokens)
    # @return [SuffixGenerator] generator indexed over +sentences+
    def self.build(sentences)
      SuffixGenerator.new(sentences)
    end

    # Indexes every token position by its preceding token (unigram index)
    # and by the (previous, current) token pair (bigram index).
    # Sentences shorter than 2 tokens carry no transitions and are dropped.
    # @param sentences [Array<Array>] tokenized sentences
    def initialize(sentences)
      @sentences = sentences.reject { |s| s.length < 2 }
      @unigrams = {} # prev_token => [[sentence_idx, token_idx], ...]
      @bigrams = {}  # prev_token => { token => [[sentence_idx, next_idx], ...] }

      @sentences.each_with_index do |tokens, i|
        # INTERIM marks a sentence boundary (constant defined elsewhere in Ebooks)
        last_token = INTERIM
        tokens.each_with_index do |token, j|
          @unigrams[last_token] ||= []
          @unigrams[last_token] << [i, j]

          @bigrams[last_token] ||= {}
          @bigrams[last_token][token] ||= []

          if j == tokens.length - 1 # Mark sentence endings
            @unigrams[token] ||= []
            @unigrams[token] << [i, INTERIM]
            @bigrams[last_token][token] << [i, INTERIM]
          else
            @bigrams[last_token][token] << [i, j + 1]
          end

          last_token = token
        end
      end
      # (removed dead trailing `self` — `new` ignores initialize's return value)
    end

    # Generates a token sequence: starts from a random corpus sentence and
    # performs up to +passes+ mutation passes, each splicing in a suffix from
    # a different sentence at a randomly chosen variation site, while
    # avoiding results that reproduce a corpus sentence verbatim.
    # @param passes [Integer] number of mutation passes to attempt
    # @param n [Symbol] :unigrams or :bigrams — which index drives splicing
    # @return [Array] the resulting token sequence ([] if the corpus is empty)
    def generate(passes=5, n=:unigrams)
      # Guard: rand(0) == 0 and @sentences[0] would be nil, crashing below.
      return [] if @sentences.empty?

      index = rand(@sentences.length)
      tokens = @sentences[index]
      used = [index] # Sentences we've already used
      verbatim = [tokens] # Verbatim sentences to avoid reproducing

      passes.times do
        puts NLP.reconstruct(tokens) # NOTE(review): debug output to stdout; consider removing
        varsites = {} # Map bigram start site => next token alternatives

        tokens.each_with_index do |token, i|
          next_token = tokens[i + 1]
          break if next_token.nil?

          # Guard @bigrams[token] with || {}: a token that only occurs
          # sentence-finally in the corpus has no bigram entry, and spliced
          # variants can place such a token mid-sentence.
          found = (n == :unigrams) ? @unigrams[next_token] : (@bigrams[token] || {})[next_token]
          # BUGFIX: use non-destructive reject. The original reject! mutated
          # the arrays stored inside the index itself, permanently corrupting
          # the model for later passes and later calls to generate.
          alternatives = (found || []).reject { |a| a[1] == INTERIM || used.include?(a[0]) }
          varsites[i] = alternatives unless alternatives.empty?
        end

        variant = nil
        varsites.to_a.shuffle.each do |start, alts|
          # BUGFIX: iterate the shuffled alternatives directly. The original
          # discarded the loop variable and re-sampled site[1] on every
          # iteration, so it could retry the same alternative repeatedly and
          # never try others.
          alts.shuffle.each do |alt|
            verbatim << @sentences[alt[0]]
            suffix = @sentences[alt[0]][alt[1]..-1]
            potential = tokens[0..start + 1] + suffix

            # Reject splices that reproduce (or are contained in) a known sentence
            unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
              used << alt[0]
              variant = potential
              break
            end
          end

          break if variant
        end

        tokens = variant if variant
      end

      tokens
    end
  end
end
|
Loading…
Add table
Add a link
Reference in a new issue