# twitter-ebooks/lib/twitter_ebooks/markov.rb

module Ebooks
  # The special INTERIM token represents sentence boundaries.
  # This is so we can include the starts and ends of statements in the model.
  # Due to the way the sentence tokenizer works, it can correspond
  # to multiple actual parts of the text (such as ^, $, \n and .?!)
  INTERIM = :interim

  # This is an ngram-based Markov model optimized to build from a
  # tokenized sentence list without requiring too much transformation
  class MarkovModel
    # Convenience constructor: build a model directly from tokenized sentences
    def self.build(sentences)
      MarkovModel.new.consume(sentences)
    end
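
    # A minimal usage sketch (the corpus is hypothetical; real callers pass
    # sentences already tokenized elsewhere in the gem):
    #
    #   model = MarkovModel.build([%w[the cat sat], %w[the dog sat]])
    #   model.chain([INTERIM]) # => e.g. [:interim, "the", "dog", "sat"]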

    def consume(sentences)
      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
      # We map by both bigrams and unigrams so we can fall back to the latter in
      # cases where an input bigram is unavailable, such as starting a sentence
      @sentences = sentences
      @unigrams = {}
      @bigrams = {}

      sentences.each_with_index do |tokens, i|
        last_token = INTERIM
        tokens.each_with_index do |token, j|
          # Record the successor of last_token by its [sentence, position] index
          @unigrams[last_token] ||= []
          @unigrams[last_token] << [i, j]

          @bigrams[last_token] ||= {}
          @bigrams[last_token][token] ||= []

          if j == tokens.length-1 # Mark sentence endings
            @unigrams[token] ||= []
            @unigrams[token] << INTERIM
            @bigrams[last_token][token] << INTERIM
          else
            @bigrams[last_token][token] << [i, j+1]
          end

          last_token = token
        end
      end

      self
    end
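
    # As a worked example of the indexes above, consuming the single
    # tokenized sentence [%w[a b]] would leave:
    #
    #   @unigrams == { INTERIM => [[0, 0]], "a" => [[0, 1]], "b" => [INTERIM] }
    #   @bigrams  == { INTERIM => { "a" => [[0, 1]] }, "a" => { "b" => [INTERIM] } }
    #
    # Each entry points at the [sentence_pos, token_pos] of a possible next
    # token, or at INTERIM where the sentence can end.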

    # Resolve a stored [sentence_pos, token_pos] index back into its token;
    # INTERIM passes through unchanged
    def find_token(index)
      if index == INTERIM
        INTERIM
      else
        @sentences[index[0]][index[1]]
      end
    end
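
    # For instance, with the tiny [%w[a b]] corpus above, find_token([0, 1])
    # yields "b" and find_token(INTERIM) yields INTERIM.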

    def chain(tokens)
      if tokens.length == 1
        matches = @unigrams[tokens[-1]]
      else
        matches = @bigrams[tokens[-2]] && @bigrams[tokens[-2]][tokens[-1]]
        # Fall back to unigrams when the bigram is unseen or offers fewer than
        # two continuations, so short contexts don't force verbatim replay
        matches = @unigrams[tokens[-1]] if matches.nil? || matches.length < 2
      end

      if matches.nil? || matches.empty?
        # This should never happen unless a strange token is
        # supplied from outside the dataset
        raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
      end

      next_token = find_token(matches.sample)

      if next_token == INTERIM # We chose to end the sentence
        return tokens
      else
        return chain(tokens + [next_token])
      end
    end
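
    # Hand-tracing the walk over the [%w[a b]] corpus:
    #
    #   chain([INTERIM])           # @unigrams[INTERIM] => samples "a"
    #   chain([INTERIM, "a"])      # bigram has one entry; unigram fallback => "b"
    #   chain([INTERIM, "a", "b"]) # samples INTERIM, so the sentence ends
    #   # => [:interim, "a", "b"]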

    # NLP.reconstruct (defined elsewhere in the gem) joins the token
    # chain back into a plain-text sentence
    def generate
      NLP.reconstruct(chain([INTERIM]))
    end
  end
end
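
# A hedged end-to-end sketch. NLP.reconstruct is the only NLP call the model
# itself makes; the tokenization below assumes helpers along the lines of
# NLP.sentences and NLP.tokenize from elsewhere in this gem:
#
#   text = File.read("corpus.txt")
#   sentences = NLP.sentences(text).map { |s| NLP.tokenize(s) }
#   puts Ebooks::MarkovModel.build(sentences).generate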