Github time!

2013-11-08 06:02:05 +11:00 · 2013-11-08 06:02:05 +11:00 · e87dc5862b
commit e87dc5862b
27 changed files with 20178 additions and 0 deletions
--- a/lib/twitter_ebooks/markov.rb
+++ b/lib/twitter_ebooks/markov.rb
@ -0,0 +1,81 @@
+module Ebooks
+  # Special INTERIM token represents sentence boundaries
+  # This is so we can include start and end of statements in model
+  # Due to the way the sentence tokenizer works, can correspond
+  # to multiple actual parts of text (such as ^, $, \n and .?!)
+  INTERIM = :interim
+
+  # This is an ngram-based Markov model optimized to build from a
+  # tokenized sentence list without requiring too much transformation
+  class MarkovModel
+    def self.build(sentences)
+      MarkovModel.new.consume(sentences)
+    end
+
+    def consume(sentences)
+      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
+      # We map by both bigrams and unigrams so we can fall back to the latter in
+      # cases where an input bigram is unavailable, such as starting a sentence
+      @sentences = sentences
+      @unigrams = {}
+      @bigrams = {}
+
+      sentences.each_with_index do |tokens, i|
+        last_token = INTERIM
+        tokens.each_with_index do |token, j|
+          @unigrams[last_token] ||= []
+          @unigrams[last_token] << [i, j]
+
+          @bigrams[last_token] ||= {}
+          @bigrams[last_token][token] ||= []
+
+          if j == tokens.length-1 # Mark sentence endings
+            @unigrams[token] ||= []
+            @unigrams[token] << INTERIM
+            @bigrams[last_token][token] << INTERIM
+          else
+            @bigrams[last_token][token] << [i, j+1]
+          end
+
+          last_token = token
+        end
+      end
+
+      self
+    end
+
+    def find_token(index)
+      if index == INTERIM
+        INTERIM
+      else
+        @sentences[index[0]][index[1]]
+      end
+    end
+
+    def chain(tokens)
+      if tokens.length == 1
+        matches = @unigrams[tokens[0]]
+      else
+        matches = @bigrams[tokens[-2]][tokens[-1]]
+      end
+
+      if matches.empty?
+        # This should never happen unless a strange token is
+        # supplied from outside the dataset
+        raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
+      end
+
+      next_token = find_token(matches.sample)
+
+      if next_token == INTERIM # We chose to end the sentence
+        return tokens
+      else
+        return chain(tokens + [next_token])
+      end
+    end
+
+    def generate
+      NLP.reconstruct(chain([INTERIM]))
+    end
+  end
+end