Github time!
This commit is contained in:
commit
e87dc5862b
27 changed files with 20178 additions and 0 deletions
81
lib/twitter_ebooks/markov.rb
Normal file
81
lib/twitter_ebooks/markov.rb
Normal file
|
@ -0,0 +1,81 @@
|
|||
module Ebooks
|
||||
# Special INTERIM token represents sentence boundaries
|
||||
# This is so we can include start and end of statements in model
|
||||
# Due to the way the sentence tokenizer works, can correspond
|
||||
# to multiple actual parts of text (such as ^, $, \n and .?!)
|
||||
INTERIM = :interim
|
||||
|
||||
# This is an ngram-based Markov model optimized to build from a
|
||||
# tokenized sentence list without requiring too much transformation
|
||||
class MarkovModel
|
||||
def self.build(sentences)
|
||||
MarkovModel.new.consume(sentences)
|
||||
end
|
||||
|
||||
def consume(sentences)
|
||||
# These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
|
||||
# We map by both bigrams and unigrams so we can fall back to the latter in
|
||||
# cases where an input bigram is unavailable, such as starting a sentence
|
||||
@sentences = sentences
|
||||
@unigrams = {}
|
||||
@bigrams = {}
|
||||
|
||||
sentences.each_with_index do |tokens, i|
|
||||
last_token = INTERIM
|
||||
tokens.each_with_index do |token, j|
|
||||
@unigrams[last_token] ||= []
|
||||
@unigrams[last_token] << [i, j]
|
||||
|
||||
@bigrams[last_token] ||= {}
|
||||
@bigrams[last_token][token] ||= []
|
||||
|
||||
if j == tokens.length-1 # Mark sentence endings
|
||||
@unigrams[token] ||= []
|
||||
@unigrams[token] << INTERIM
|
||||
@bigrams[last_token][token] << INTERIM
|
||||
else
|
||||
@bigrams[last_token][token] << [i, j+1]
|
||||
end
|
||||
|
||||
last_token = token
|
||||
end
|
||||
end
|
||||
|
||||
self
|
||||
end
|
||||
|
||||
def find_token(index)
|
||||
if index == INTERIM
|
||||
INTERIM
|
||||
else
|
||||
@sentences[index[0]][index[1]]
|
||||
end
|
||||
end
|
||||
|
||||
def chain(tokens)
|
||||
if tokens.length == 1
|
||||
matches = @unigrams[tokens[0]]
|
||||
else
|
||||
matches = @bigrams[tokens[-2]][tokens[-1]]
|
||||
end
|
||||
|
||||
if matches.empty?
|
||||
# This should never happen unless a strange token is
|
||||
# supplied from outside the dataset
|
||||
raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
|
||||
end
|
||||
|
||||
next_token = find_token(matches.sample)
|
||||
|
||||
if next_token == INTERIM # We chose to end the sentence
|
||||
return tokens
|
||||
else
|
||||
return chain(tokens + [next_token])
|
||||
end
|
||||
end
|
||||
|
||||
def generate
|
||||
NLP.reconstruct(chain([INTERIM]))
|
||||
end
|
||||
end
|
||||
end
|
Loading…
Add table
Add a link
Reference in a new issue