Lots of documentation and cleanup

Jaiden Mispy 2014-12-05 21:12:39 +11:00
parent efde0fd16f
commit 1977445b1c
11 changed files with 237 additions and 178 deletions

.gitignore vendored
View file

@ -1,3 +1,5 @@
.*.swp
Gemfile.lock
pkg
.yardoc
doc

View file

@ -4,8 +4,6 @@
require 'twitter_ebooks'
require 'ostruct'
$debug = true
module Ebooks::CLI
APP_PATH = Dir.pwd # XXX do some recursive thing instead
HELP = OpenStruct.new

View file

@ -15,7 +15,6 @@ end
require 'twitter_ebooks/nlp'
require 'twitter_ebooks/archive'
require 'twitter_ebooks/markov'
require 'twitter_ebooks/suffix'
require 'twitter_ebooks/model'
require 'twitter_ebooks/bot'

View file

@ -6,10 +6,11 @@ module Ebooks
class ConfigurationError < Exception
end
# Information about a particular Twitter user we know
class UserInfo
attr_reader :username
# number of times we've interacted with a timeline tweet, unprompted
# @return [Integer] how many times we can pester this user unprompted
attr_accessor :pesters_left
def initialize(username)
@ -17,6 +18,7 @@ module Ebooks
@pesters_left = 1
end
# @return [Boolean] true if we're allowed to pester this user
def can_pester?
@pesters_left > 0
end
@ -32,6 +34,7 @@ module Ebooks
@last_update = Time.now
end
# @param tweet [Twitter::Tweet] tweet to add
def add(tweet)
@tweets << tweet
@last_update = Time.now
@ -61,14 +64,24 @@ module Ebooks
# Meta information about a tweet that we calculate for ourselves
class TweetMeta
attr_accessor :mentions # array: usernames mentioned in tweet
attr_accessor :mentionless # string: text of tweet with mentions removed
attr_accessor :reply_mentions # array: usernames to include in a reply
attr_accessor :reply_prefix # string: processed string to start reply with
attr_accessor :limit # integer: available room to calculate reply
# @return [Array<String>] usernames mentioned in tweet
attr_accessor :mentions
# @return [String] text of the tweet with mentions removed
attr_accessor :mentionless
# @return [Array<String>] usernames to include in a reply
attr_accessor :reply_mentions
# @return [String] mentions to start reply with
attr_accessor :reply_prefix
# @return [Integer] available chars for reply
attr_accessor :limit
attr_accessor :bot, :tweet
# @return [Ebooks::Bot] associated bot
attr_accessor :bot
# @return [Twitter::Tweet] associated tweet
attr_accessor :tweet
# Check whether this tweet mentions our bot
# @return [Boolean]
def mentions_bot?
# To check if this is someone talking to us, ensure:
# - The tweet mentions list contains our username
@ -110,47 +123,65 @@ module Ebooks
end
class Bot
attr_accessor :consumer_key, :consumer_secret,
:access_token, :access_token_secret
attr_reader :twitter, :stream, :thread
# Configuration
attr_accessor :username, :delay_range, :blacklist
# @return [String] OAuth consumer key for a Twitter app
attr_accessor :consumer_key
# @return [String] OAuth consumer secret for a Twitter app
attr_accessor :consumer_secret
# @return [String] OAuth access token from `ebooks auth`
attr_accessor :access_token
# @return [String] OAuth access secret from `ebooks auth`
attr_accessor :access_token_secret
# @return [String] Twitter username of bot
attr_accessor :username
# @return [Array<String>] list of usernames to block on contact
attr_accessor :blacklist
# @return [Hash{String => Ebooks::Conversation}] maps tweet ids to their conversation contexts
attr_accessor :conversations
# @return [Range, Integer] default range of seconds to sleep in the delay method
attr_accessor :delay_range
@@all = [] # List of all defined bots
def self.all; @@all; end
# @return [Array] list of all defined bots
def self.all; @@all ||= []; end
def self.get(name)
all.find { |bot| bot.username == name }
# Fetches a bot by username
# @param username [String]
# @return [Ebooks::Bot]
def self.get(username)
all.find { |bot| bot.username == username }
end
# Logs info to stdout in the context of this bot
def log(*args)
STDOUT.print "@#{@username}: " + args.map(&:to_s).join(' ') + "\n"
STDOUT.flush
end
def initialize(*args, &b)
@username ||= nil
# Initializes and configures bot
# @param username [String] Twitter username of the bot
# @param b Block to call with new bot
def initialize(username, &b)
@blacklist ||= []
@delay_range ||= 0
@users ||= {}
@userinfo ||= {}
@conversations ||= {}
configure(*args, &b)
# Tweet ids we've already observed, to avoid duplication
@seen_tweets ||= {}
@username = username
configure(&b)
Bot.all << self
end
# Find information we've collected about a user
# @param username [String]
# @return [Ebooks::UserInfo]
def userinfo(username)
@users[username] ||= UserInfo.new(username)
@userinfo[username] ||= UserInfo.new(username)
end
# Grab or create the conversation context for this tweet
# Find or create the conversation context for this tweet
# @param tweet [Twitter::Tweet]
# @return [Ebooks::Conversation]
def conversation(tweet)
conv = if tweet.in_reply_to_status_id?
@conversations[tweet.in_reply_to_status_id]
@ -175,6 +206,7 @@ module Ebooks
conv
end
# @return [Twitter::REST::Client] underlying REST client from twitter gem
def twitter
@twitter ||= Twitter::REST::Client.new do |config|
config.consumer_key = @consumer_key
@ -184,6 +216,7 @@ module Ebooks
end
end
# @return [Twitter::Streaming::Client] underlying streaming client from twitter gem
def stream
@stream ||= Twitter::Streaming::Client.new do |config|
config.consumer_key = @consumer_key
@ -194,11 +227,14 @@ module Ebooks
end
# Calculate some meta information about a tweet relevant for replying
# @param ev [Twitter::Tweet]
# @return [Ebooks::TweetMeta]
def calc_meta(ev)
TweetMeta.new(self, ev)
end
# Receive an event from the twitter stream
# @param ev [Object] Twitter streaming event
def receive_event(ev)
if ev.is_a? Array # Initial array sent on first connection
log "Online!"
@ -250,14 +286,7 @@ module Ebooks
end
end
def start_stream
log "starting tweet stream"
stream.user do |ev|
receive_event ev
end
end
# Configures client and fires startup event
def prepare
# Sanity check
if @username.nil?
@ -268,12 +297,18 @@ module Ebooks
fire(:startup)
end
# Connects to tweetstream and opens event handlers for this bot
# Start running user event stream
def start
start_stream
log "starting tweet stream"
stream.user do |ev|
receive_event ev
end
end
# Fire an event
# @param event [Symbol] event to fire
# @param args arguments for event handler
def fire(event, *args)
handler = "on_#{event}".to_sym
if respond_to? handler
@ -281,11 +316,17 @@ module Ebooks
end
end
def delay(&b)
time = @delay.to_a.sample unless @delay.is_a? Integer
# Delay an action for a variable period of time
# @param range [Range, Integer] range of seconds to choose for delay
def delay(range=@delay_range, &b)
time = range.is_a?(Integer) ? range : range.to_a.sample
sleep time
b.call
end
# Check if a username is blacklisted
# @param username [String]
# @return [Boolean]
def blacklisted?(username)
if @blacklist.include?(username)
true
@ -295,6 +336,9 @@ module Ebooks
end
# Reply to a tweet or a DM.
# @param ev [Twitter::Tweet, Twitter::DirectMessage]
# @param text [String] contents of reply excluding reply_prefix
# @param opts [Hash] additional params to pass to twitter gem
def reply(ev, text, opts={})
opts = opts.clone
@ -306,26 +350,28 @@ module Ebooks
if conversation(ev).is_bot?(ev.user.screen_name)
log "Not replying to suspected bot @#{ev.user.screen_name}"
return
return false
end
if !meta.mentions_bot?
if !userinfo(ev.user.screen_name).can_pester?
log "Not replying: leaving @#{ev.user.screen_name} alone"
return
return false
end
end
log "Replying to @#{ev.user.screen_name} with: #{meta.reply_prefix + text}"
tweet = twitter.update(meta.reply_prefix + text, in_reply_to_status_id: ev.id)
conversation(tweet).add(tweet)
tweet
else
raise Exception.new("Don't know how to reply to a #{ev.class}")
end
end
# Favorite a tweet
# @param tweet [Twitter::Tweet]
def favorite(tweet)
return if blacklisted?(tweet.user.screen_name)
log "Favoriting @#{tweet.user.screen_name}: #{tweet.text}"
begin
@ -335,6 +381,8 @@ module Ebooks
end
end
# Retweet a tweet
# @param tweet [Twitter::Tweet]
def retweet(tweet)
log "Retweeting @#{tweet.user.screen_name}: #{tweet.text}"
@ -345,26 +393,36 @@ module Ebooks
end
end
def follow(*args)
log "Following #{args}"
twitter.follow(*args)
# Follow a user
# @param user [String] username or user id
def follow(user, *args)
log "Following #{user}"
twitter.follow(user, *args)
end
def unfollow(*args)
log "Unfollowing #{args}"
twiter.unfollow(*args)
# Unfollow a user
# @param user [String] username or user id
def unfollow(user, *args)
log "Unfollowing #{user}"
twitter.unfollow(user, *args)
end
def tweet(*args)
log "Tweeting #{args.inspect}"
twitter.update(*args)
# Tweet something
# @param text [String]
def tweet(text, *args)
log "Tweeting '#{text}'"
twitter.update(text, *args)
end
# Get a scheduler for this bot
# @return [Rufus::Scheduler]
def scheduler
@scheduler ||= Rufus::Scheduler.new
end
# could easily just be *args however the separation keeps it clean.
# Tweet some text with an image
# @param txt [String]
# @param pic [String] filename
def pictweet(txt, pic, *args)
log "Tweeting #{txt.inspect} - #{pic} #{args}"
twitter.update_with_media(txt, File.new(pic), *args)
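
As a usage note (not part of this commit), the Bot API documented above composes roughly like the sketch below. The class name, handle, environment variable names and the :mention handler are illustrative assumptions, not taken from this diff.

class MyBot < Ebooks::Bot
  def configure
    # OAuth credentials from `ebooks auth`; the env var names here are hypothetical
    self.consumer_key = ENV['CONSUMER_KEY']
    self.consumer_secret = ENV['CONSUMER_SECRET']
    self.access_token = ENV['ACCESS_TOKEN']
    self.access_token_secret = ENV['ACCESS_TOKEN_SECRET']
    self.blacklist = ['some_spammer']   # usernames to block on contact
    self.delay_range = 1..6             # seconds to sleep before delayed actions
  end

  def on_startup
    log "started"
  end

  # Assumes the stream fires a :mention event for tweets mentioning the bot,
  # as in the released gem; fire(:mention, tweet) would dispatch here
  def on_mention(tweet)
    delay { reply(tweet, "hello!") }
  end
end

MyBot.new('my_ebooks_handle')           # registers itself in Bot.all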

View file

@ -1,82 +0,0 @@
module Ebooks
# Special INTERIM token represents sentence boundaries
# This is so we can include start and end of statements in model
# Due to the way the sentence tokenizer works, can correspond
# to multiple actual parts of text (such as ^, $, \n and .?!)
INTERIM = :interim
# This is an ngram-based Markov model optimized to build from a
# tokenized sentence list without requiring too much transformation
class MarkovModel
def self.build(sentences)
MarkovModel.new.consume(sentences)
end
def consume(sentences)
# These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
# We map by both bigrams and unigrams so we can fall back to the latter in
# cases where an input bigram is unavailable, such as starting a sentence
@sentences = sentences
@unigrams = {}
@bigrams = {}
sentences.each_with_index do |tokens, i|
last_token = INTERIM
tokens.each_with_index do |token, j|
@unigrams[last_token] ||= []
@unigrams[last_token] << [i, j]
@bigrams[last_token] ||= {}
@bigrams[last_token][token] ||= []
if j == tokens.length-1 # Mark sentence endings
@unigrams[token] ||= []
@unigrams[token] << INTERIM
@bigrams[last_token][token] << INTERIM
else
@bigrams[last_token][token] << [i, j+1]
end
last_token = token
end
end
self
end
def find_token(index)
if index == INTERIM
INTERIM
else
@sentences[index[0]][index[1]]
end
end
def chain(tokens)
if tokens.length == 1
matches = @unigrams[tokens[-1]]
else
matches = @bigrams[tokens[-2]][tokens[-1]]
matches = @unigrams[tokens[-1]] if matches.length < 2
end
if matches.empty?
# This should never happen unless a strange token is
# supplied from outside the dataset
raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
end
next_token = find_token(matches.sample)
if next_token == INTERIM # We chose to end the sentence
return tokens
else
return chain(tokens + [next_token])
end
end
def generate
NLP.reconstruct(chain([INTERIM]))
end
end
end

View file

@ -8,16 +8,41 @@ require 'csv'
module Ebooks
class Model
attr_accessor :hash, :tokens, :sentences, :mentions, :keywords
# @return [Array<String>]
# An array of unique tokens. This is the main source of actual strings
# in the model. Manipulation of a token is done using its index
# in this array, which we call a "tiki"
attr_accessor :tokens
def self.consume(txtpath)
Model.new.consume(txtpath)
# @return [Array<Array<Integer>>]
# Sentences represented by arrays of tikis
attr_accessor :sentences
# @return [Array<Array<Integer>>]
# Sentences derived from Twitter mentions
attr_accessor :mentions
# @return [Array<String>]
# The top 200 most important keywords, in descending order
attr_accessor :keywords
# Generate a new model from a corpus file
# @param path [String]
# @return [Ebooks::Model]
def self.consume(path)
Model.new.consume(path)
end
# Generate a new model from multiple corpus files
# @param paths [Array<String>]
# @return [Ebooks::Model]
def self.consume_all(paths)
Model.new.consume_all(paths)
end
# Load a saved model
# @param path [String]
# @return [Ebooks::Model]
def self.load(path)
model = Model.new
model.instance_eval do
@ -30,6 +55,8 @@ module Ebooks
model
end
# Save model to a file
# @param path [String]
def save(path)
File.open(path, 'wb') do |f|
f.write(Marshal.dump({
@ -43,19 +70,22 @@ module Ebooks
end
def initialize
# This is the only source of actual strings in the model. It is
# an array of unique tokens. Manipulation of a token is mostly done
# using its index in this array, which we call a "tiki"
@tokens = []
# Reverse lookup tiki by token, for faster generation
@tikis = {}
end
# Reverse lookup a token index from a token
# @param token [String]
# @return [Integer]
def tikify(token)
@tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
end
# Convert a body of text into arrays of tikis
# @param text [String]
# @return [Array<Array<Integer>>]
def mass_tikify(text)
sentences = NLP.sentences(text)
@ -69,9 +99,10 @@ module Ebooks
end
end
# Consume a corpus into this model
# @param path [String]
def consume(path)
content = File.read(path, :encoding => 'utf-8')
@hash = Digest::MD5.hexdigest(content)
if path.split('.')[-1] == "json"
log "Reading json corpus from #{path}"
@ -94,6 +125,8 @@ module Ebooks
consume_lines(lines)
end
# Consume a sequence of lines
# @param lines [Array<String>]
def consume_lines(lines)
log "Removing commented lines and sorting mentions"
@ -126,11 +159,12 @@ module Ebooks
self
end
# Consume multiple corpuses into this model
# @param paths [Array<String>]
def consume_all(paths)
lines = []
paths.each do |path|
content = File.read(path, :encoding => 'utf-8')
@hash = Digest::MD5.hexdigest(content)
if path.split('.')[-1] == "json"
log "Reading json corpus from #{path}"
@ -156,25 +190,26 @@ module Ebooks
consume_lines(lines)
end
def fix(tweet)
# This seems to require an external api call
#begin
# fixer = NLP.gingerice.parse(tweet)
# log fixer if fixer['corrections']
# tweet = fixer['result']
#rescue Exception => e
# log e.message
# log e.backtrace
#end
NLP.htmlentities.decode tweet
# Correct encoding issues in generated text
# @param text [String]
# @return [String]
def fix(text)
NLP.htmlentities.decode text
end
# Check if an array of tikis comprises a valid tweet
# @param tikis [Array<Integer>]
# @param limit [Integer] how many chars we have left
def valid_tweet?(tikis, limit)
tweet = NLP.reconstruct(tikis, @tokens)
tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
end
# Generate some text
# @param limit [Integer] available characters
# @param generator [SuffixGenerator, nil]
# @param retry_limit [Integer] how many times to retry on duplicates
# @return [String]
def make_statement(limit=140, generator=nil, retry_limit=10)
responding = !generator.nil?
generator ||= SuffixGenerator.build(@sentences)
@ -209,12 +244,17 @@ module Ebooks
end
# Test if a sentence has been copied verbatim from original
def verbatim?(tokens)
@sentences.include?(tokens) || @mentions.include?(tokens)
# @param tikis [Array<Integer>]
# @return [Boolean]
def verbatim?(tikis)
@sentences.include?(tikis) || @mentions.include?(tikis)
end
# Finds all relevant tokenized sentences to given input by
# Finds relevant and slightly relevant tokenized sentences to input by
# comparing non-stopword token overlaps
# @param sentences [Array<Array<Integer>>]
# @param input [String]
# @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
def find_relevant(sentences, input)
relevant = []
slightly_relevant = []
@ -235,6 +275,10 @@ module Ebooks
# Generates a response by looking for related sentences
# in the corpus and building a smaller generator from these
# @param input [String]
# @param limit [Integer] characters available for response
# @param sentences [Array<Array<Integer>>]
# @return [String]
def make_response(input, limit=140, sentences=@mentions)
# Prefer mentions
relevant, slightly_relevant = find_relevant(sentences, input)
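
As a usage note (not part of this commit), the Model methods documented above chain together roughly as follows; the corpus and model file paths are hypothetical:

model = Ebooks::Model.consume('corpus/my_account.json')   # build from a corpus file
model.save('model/my_account.model')                       # marshal to disk

model = Ebooks::Model.load('model/my_account.model')
puts model.make_statement(140)                       # free-form tweet up to 140 chars
puts model.make_response('how are the cats?', 120)   # reply seeded from mention-derived sentences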

View file

@ -12,31 +12,35 @@ module Ebooks
# Some of this stuff is pretty heavy and we don't necessarily need
# to be using it all of the time
# Lazily loads an array of stopwords
# Stopwords are common English words that should often be ignored
# @return [Array<String>]
def self.stopwords
@stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
end
# Lazily loads an array of known English nouns
# @return [Array<String>]
def self.nouns
@nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
end
# Lazily loads an array of known English adjectives
# @return [Array<String>]
def self.adjectives
@adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
end
# POS tagger
# Lazily load part-of-speech tagging library
# This can determine whether a word is being used as a noun/adjective/verb
# @return [EngTagger]
def self.tagger
require 'engtagger'
@tagger ||= EngTagger.new
end
# Gingerice text correction service
def self.gingerice
require 'gingerice'
Gingerice::Parser.new # No caching for this one
end
# For decoding html entities
# Lazily load HTML entity decoder
# @return [HTMLEntities]
def self.htmlentities
require 'htmlentities'
@htmlentities ||= HTMLEntities.new
@ -44,7 +48,9 @@ module Ebooks
### Utility functions
# We don't really want to deal with all this weird unicode punctuation
# Normalize some strange unicode punctuation variants
# @param text [String]
# @return [String]
def self.normalize(text)
htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
end
@ -53,6 +59,8 @@ module Ebooks
# We use ad hoc approach because fancy libraries do not deal
# especially well with tweet formatting, and we can fake solving
# the quote problem during generation
# @param text [String]
# @return [Array<String>]
def self.sentences(text)
text.split(/\n+|(?<=[.?!])\s+/)
end
@ -60,15 +68,23 @@ module Ebooks
# Split a sentence into word-level tokens
# As above, this is ad hoc because tokenization libraries
# do not behave well wrt. things like emoticons and timestamps
# @param sentence [String]
# @return [Array<String>]
def self.tokenize(sentence)
regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
sentence.split(regex)
end
# Get the 'stem' form of a word e.g. 'cats' -> 'cat'
# @param word [String]
# @return [String]
def self.stem(word)
Stemmer::stem_word(word.downcase)
end
# Use highscore gem to find interesting keywords in a corpus
# @param text [String]
# @return [Highscore::Keywords]
def self.keywords(text)
# Preprocess to remove stopwords (highscore's blacklist is v. slow)
text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
@ -90,7 +106,10 @@ module Ebooks
text.keywords
end
# Takes a list of tokens and builds a nice-looking sentence
# Builds a proper sentence from a list of tikis
# @param tikis [Array<Integer>]
# @param tokens [Array<String>]
# @return [String]
def self.reconstruct(tikis, tokens)
text = ""
last_token = nil
@ -105,6 +124,9 @@ module Ebooks
end
# Determine if we need to insert a space between two tokens
# @param token1 [String]
# @param token2 [String]
# @return [Boolean]
def self.space_between?(token1, token2)
p1 = self.punctuation?(token1)
p2 = self.punctuation?(token2)
@ -119,10 +141,16 @@ module Ebooks
end
end
# Is this token comprised of punctuation?
# @param token [String]
# @return [Boolean]
def self.punctuation?(token)
(token.chars.to_set - PUNCTUATION.chars.to_set).empty?
end
# Is this token a stopword?
# @param token [String]
# @return [Boolean]
def self.stopword?(token)
@stopword_set ||= stopwords.map(&:downcase).to_set
@stopword_set.include?(token.downcase)
@ -130,7 +158,9 @@ module Ebooks
# Determine if a sample of text contains unmatched brackets or quotes
# This is one of the more frequent and noticeable failure modes for
# the markov generator; we can just tell it to retry
# the generator; we can just tell it to retry
# @param text [String]
# @return [Boolean]
def self.unmatched_enclosers?(text)
enclosers = ['**', '""', '()', '[]', '``', "''"]
enclosers.each do |pair|
@ -153,10 +183,13 @@ module Ebooks
end
# Determine if a2 is a subsequence of a1
# @param a1 [Array]
# @param a2 [Array]
# @return [Boolean]
def self.subseq?(a1, a2)
a1.each_index.find do |i|
!a1.each_index.find do |i|
a1[i...i+a2.length] == a2
end
end.nil?
end
end
end
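
A small sketch (not part of this commit) exercising the NLP helpers documented above; the sample strings are made up:

text   = Ebooks::NLP.normalize('“Cats are great… aren’t they?”')  # straighten curly quotes and ellipses
sents  = Ebooks::NLP.sentences(text)                               # split on newlines and .?! boundaries
tokens = Ebooks::NLP.tokenize(sents.first)                         # ad hoc word-level tokens
tokens.reject { |t| Ebooks::NLP.stopword?(t) }                     # drop common stopwords
Ebooks::NLP.stem('cats')                                           # => "cat"
Ebooks::NLP.subseq?([1, 2, 3, 4], [2, 3])                          # => true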

View file

@ -1,11 +1,14 @@
# encoding: utf-8
module Ebooks
# This generator uses data identical to the markov model, but
# This generator uses data identical to a markov model, but
# instead of making a chain by looking up bigrams it uses the
# positions to randomly replace suffixes in one sentence with
# matching suffixes in another
class SuffixGenerator
# Build a generator from a corpus of tikified sentences
# @param sentences [Array<Array<Integer>>]
# @return [SuffixGenerator]
def self.build(sentences)
SuffixGenerator.new(sentences)
end
@ -39,6 +42,11 @@ module Ebooks
self
end
# Generate a recombined sequence of tikis
# @param passes [Integer] number of times to recombine
# @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
# @return [Array<Integer>]
def generate(passes=5, n=:unigrams)
index = rand(@sentences.length)
tikis = @sentences[index]
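
A sketch (not part of this commit) of how the generator pairs with a saved model; the model path is hypothetical:

model = Ebooks::Model.load('model/my_account.model')
generator = Ebooks::SuffixGenerator.build(model.sentences)
tikis = generator.generate(5, :bigrams)             # :bigrams is more conservative than :unigrams
puts Ebooks::NLP.reconstruct(tikis, model.tokens)   # turn tikis back into a sentence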

View file

@ -1,3 +1,3 @@
module Ebooks
VERSION = "2.3.2"
VERSION = "3.0.0"
end

View file

@ -3,8 +3,6 @@ require 'memory_profiler'
require 'tempfile'
require 'timecop'
def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end
class TestBot < Ebooks::Bot
attr_accessor :twitter

View file

@ -20,6 +20,7 @@ Gem::Specification.new do |gem|
gem.add_development_dependency 'memory_profiler'
gem.add_development_dependency 'timecop'
gem.add_development_dependency 'pry-byebug'
gem.add_development_dependency 'yard'
gem.add_runtime_dependency 'twitter', '~> 5.0'
gem.add_runtime_dependency 'simple_oauth'