# encoding: utf-8
module Ebooks
  # This generator uses data similar to a Markov model, but
  # instead of making a chain by looking up bigrams it uses the
  # positions to randomly replace token array suffixes in one sentence
  # with matching suffixes in another
  class SuffixGenerator
    # Build a generator from a corpus of tikified sentences
    # "tikis" are token indexes-- a way of representing words
    # and punctuation as their integer position in a big array
    # of such tokens
    # @param sentences [Array<Array<Integer>>]
    # @return [SuffixGenerator]
    def self.build(sentences)
      SuffixGenerator.new(sentences)
    end

    # Index every non-empty sentence into two lookup tables:
    #
    # * @unigrams[tiki]       => [[sentence_index, position], ...]
    # * @bigrams[prev][tiki]  => [[sentence_index, position], ...]
    #
    # where position is the offset of the token that follows the key in
    # that sentence, or INTERIM when the key ends the sentence. The first
    # token of each sentence is indexed as following INTERIM.
    # @param sentences [Array<Array<Integer>>]
    def initialize(sentences)
      @sentences = sentences.reject(&:empty?)
      @unigrams = {}
      @bigrams = {}

      @sentences.each_with_index do |tikis, i|
        log("Building: sentence #{i} of #{sentences.length}") if i % 10000 == 0

        last_tiki = INTERIM
        tikis.each_with_index do |tiki, j|
          (@unigrams[last_tiki] ||= []) << [i, j]
          followers = ((@bigrams[last_tiki] ||= {})[tiki] ||= [])

          if j == tikis.length - 1 # Mark sentence endings
            (@unigrams[tiki] ||= []) << [i, INTERIM]
            followers << [i, INTERIM]
          else
            followers << [i, j + 1]
          end

          last_tiki = tiki
        end
      end
    end

    # Generate a recombined sequence of tikis
    #
    # Starts from a random corpus sentence and, up to +passes+ times,
    # replaces its tail with the tail of another, not yet used sentence.
    # The lookup tables are only read here, so repeated calls draw from
    # the same model. Returns an empty array when the corpus has no
    # non-empty sentences.
    # @param passes [Integer] number of times to recombine
    # @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
    # @return [Array<Integer>]
    def generate(passes=5, n=:unigrams)
      # rand(0) yields a Float, so an empty corpus needs an explicit exit
      return [] if @sentences.empty?

      index = rand(@sentences.length)
      tikis = @sentences[index]
      used = [index] # Sentences we've already used
      verbatim = [tikis] # Verbatim sentences to avoid reproducing

      passes.times do
        variant = recombine(tikis, used, verbatim, n)

        # If we failed to produce a variation from any alternative, there
        # is no use running additional passes-- they'll have the same result.
        break if variant.nil?

        tikis = variant
      end

      tikis
    end

    private

    # Map each position i in +tikis+ to the suffix positions that could
    # replace everything after tikis[i + 1].
    # @param tikis [Array<Integer>]
    # @param used [Array<Integer>] indexes of sentences already drawn from
    # @param n [Symbol] :unigrams selects @unigrams, anything else @bigrams
    # @return [Hash{Integer => Array<Array>}]
    def variation_sites(tikis, used, n)
      sites = {}

      tikis.each_cons(2).with_index do |(tiki, next_tiki), i|
        candidates = n == :unigrams ? @unigrams[next_tiki] : @bigrams[tiki][next_tiki]

        # Filter out sentence endings and suffixes from previous sentences.
        # This must not be reject!: the candidate arrays belong to the
        # model and have to survive this call untouched.
        usable = candidates.reject { |pos| pos[1] == INTERIM || used.include?(pos[0]) }
        sites[i] = usable unless usable.empty?
      end

      sites
    end

    # Try the variation sites in random order and return the first
    # recombination that is not a segment of (or a superset of) any
    # sentence seen so far. Appends to +used+ and +verbatim+.
    # @param tikis [Array<Integer>] sentence being varied
    # @param used [Array<Integer>] indexes of sentences already drawn from
    # @param verbatim [Array<Array<Integer>>] sentences that must not be reproduced
    # @param n [Symbol] :unigrams or :bigrams
    # @return [Array<Integer>, nil] the variant, or nil if none was acceptable
    def recombine(tikis, used, verbatim, n)
      variation_sites(tikis, used, n).to_a.shuffle.each do |start, alternatives|
        alternatives.shuffle.each do |sentence_index, offset|
          source = @sentences[sentence_index]
          verbatim << source
          potential = tikis[0..start + 1] + source[offset..-1]

          # Ensure we're not just rebuilding some segment of another sentence
          next if verbatim.any? { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }

          used << sentence_index
          return potential
        end
      end

      nil
    end
  end
end
|