This commit is contained in:
Mispy 2013-11-14 08:30:45 -08:00
parent 00f0228dd4
commit e4209f79e4

View file

@ -1,4 +1,8 @@
module Ebooks module Ebooks
# This generator uses the same data as the Markov model, but
# instead of building a chain by looking up bigrams, it uses the
# positions to randomly replace the suffix of one sentence with
# a matching suffix from another.
class SuffixGenerator class SuffixGenerator
def self.build(sentences) def self.build(sentences)
SuffixGenerator.new(sentences) SuffixGenerator.new(sentences)
@ -48,6 +52,7 @@ module Ebooks
break if next_token.nil? break if next_token.nil?
alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token] alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token]
# Filter out suffixes from previous sentences
alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) } alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
varsites[i] = alternatives unless alternatives.empty? varsites[i] = alternatives unless alternatives.empty?
end end
@ -62,6 +67,7 @@ module Ebooks
suffix = @sentences[alt[0]][alt[1]..-1] suffix = @sentences[alt[0]][alt[1]..-1]
potential = tokens[0..start+1] + suffix potential = tokens[0..start+1] + suffix
# Ensure we're not just rebuilding some segment of another sentence
unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) } unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
used << alt[0] used << alt[0]
variant = potential variant = potential
@ -75,7 +81,6 @@ module Ebooks
tokens = variant if variant tokens = variant if variant
end end
tokens tokens
end end
end end