twitter-ebooks/lib/twitter_ebooks/suffix.rb
2014-12-05 21:12:39 +11:00

96 lines
2.9 KiB
Ruby

# encoding: utf-8
module Ebooks
# This generator uses data identical to a markov model, but
# instead of making a chain by looking up bigrams it uses the
# positions to randomly replace suffixes in one sentence with
# matching suffixes in another
class SuffixGenerator
# Build a generator from a corpus of tikified sentences
# @param sentences [Array<Array<Integer>>]
# @return [SuffixGenerator]
def self.build(sentences)
SuffixGenerator.new(sentences)
end
def initialize(sentences)
@sentences = sentences.reject { |s| s.length < 2 }
@unigrams = {}
@bigrams = {}
@sentences.each_with_index do |tikis, i|
last_tiki = INTERIM
tikis.each_with_index do |tiki, j|
@unigrams[last_tiki] ||= []
@unigrams[last_tiki] << [i, j]
@bigrams[last_tiki] ||= {}
@bigrams[last_tiki][tiki] ||= []
if j == tikis.length-1 # Mark sentence endings
@unigrams[tiki] ||= []
@unigrams[tiki] << [i, INTERIM]
@bigrams[last_tiki][tiki] << [i, INTERIM]
else
@bigrams[last_tiki][tiki] << [i, j+1]
end
last_tiki = tiki
end
end
self
end
# Generate a recombined sequence of tikis
# @param passes [Integer] number of times to recombine
# @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
# @return [Array<Integer>]
def generate(passes=5, n=:unigrams)
index = rand(@sentences.length)
tikis = @sentences[index]
used = [index] # Sentences we've already used
verbatim = [tikis] # Verbatim sentences to avoid reproducing
0.upto(passes-1) do
varsites = {} # Map bigram start site => next tiki alternatives
tikis.each_with_index do |tiki, i|
next_tiki = tikis[i+1]
break if next_tiki.nil?
alternatives = (n == :unigrams) ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]
# Filter out suffixes from previous sentences
alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
varsites[i] = alternatives unless alternatives.empty?
end
variant = nil
varsites.to_a.shuffle.each do |site|
start = site[0]
site[1].shuffle.each do |alt|
start, alt = site[0], site[1].sample
verbatim << @sentences[alt[0]]
suffix = @sentences[alt[0]][alt[1]..-1]
potential = tikis[0..start+1] + suffix
# Ensure we're not just rebuilding some segment of another sentence
unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
used << alt[0]
variant = potential
break
end
end
break if variant
end
tikis = variant if variant
end
tikis
end
end
end