diff --git a/.gitignore b/.gitignore index b0436d8..f2b5f47 100755 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ .*.swp Gemfile.lock pkg +.yardoc +doc diff --git a/bin/ebooks b/bin/ebooks index 7a144dc..a41681f 100755 --- a/bin/ebooks +++ b/bin/ebooks @@ -4,8 +4,6 @@ require 'twitter_ebooks' require 'ostruct' -$debug = true - module Ebooks::CLI APP_PATH = Dir.pwd # XXX do some recursive thing instead HELP = OpenStruct.new diff --git a/lib/twitter_ebooks.rb b/lib/twitter_ebooks.rb index ea857f4..b104501 100644 --- a/lib/twitter_ebooks.rb +++ b/lib/twitter_ebooks.rb @@ -15,7 +15,6 @@ end require 'twitter_ebooks/nlp' require 'twitter_ebooks/archive' -require 'twitter_ebooks/markov' require 'twitter_ebooks/suffix' require 'twitter_ebooks/model' require 'twitter_ebooks/bot' diff --git a/lib/twitter_ebooks/bot.rb b/lib/twitter_ebooks/bot.rb index 06054b1..585c96e 100755 --- a/lib/twitter_ebooks/bot.rb +++ b/lib/twitter_ebooks/bot.rb @@ -6,10 +6,11 @@ module Ebooks class ConfigurationError < Exception end + # Information about a particular Twitter user we know class UserInfo attr_reader :username - # number of times we've interacted with a timeline tweet, unprompted + # @return [Integer] how many times we can pester this user unprompted attr_accessor :pesters_left def initialize(username) @@ -17,6 +18,7 @@ module Ebooks @pesters_left = 1 end + # @return [Boolean] true if we're allowed to pester this user def can_pester? 
@pesters_left > 0 end @@ -32,6 +34,7 @@ module Ebooks @last_update = Time.now end + # @param tweet [Twitter::Tweet] tweet to add def add(tweet) @tweets << tweet @last_update = Time.now @@ -61,14 +64,24 @@ module Ebooks # Meta information about a tweet that we calculate for ourselves class TweetMeta - attr_accessor :mentions # array: usernames mentioned in tweet - attr_accessor :mentionless # string: text of tweet with mentions removed - attr_accessor :reply_mentions # array: usernames to include in a reply - attr_accessor :reply_prefix # string: processed string to start reply with - attr_accessor :limit # integer: available room to calculate reply + # @return [Array] usernames mentioned in tweet + attr_accessor :mentions + # @return [String] text of tweets with mentions removed + attr_accessor :mentionless + # @return [Array] usernames to include in a reply + attr_accessor :reply_mentions + # @return [String] mentions to start reply with + attr_accessor :reply_prefix + # @return [Integer] available chars for reply + attr_accessor :limit - attr_accessor :bot, :tweet + # @return [Ebooks::Bot] associated bot + attr_accessor :bot + # @return [Twitter::Tweet] associated tweet + attr_accessor :tweet + # Check whether this tweet mentions our bot + # @return [Boolean] def mentions_bot? 
# To check if this is someone talking to us, ensure: # - The tweet mentions list contains our username @@ -110,47 +123,65 @@ module Ebooks end class Bot - attr_accessor :consumer_key, :consumer_secret, - :access_token, :access_token_secret - - attr_reader :twitter, :stream, :thread - - # Configuration - attr_accessor :username, :delay_range, :blacklist - + # @return [String] OAuth consumer key for a Twitter app + attr_accessor :consumer_key + # @return [String] OAuth consumer secret for a Twitter app + attr_accessor :consumer_secret + # @return [String] OAuth access token from `ebooks auth` + attr_accessor :access_token + # @return [String] OAuth access secret from `ebooks auth` + attr_accessor :access_token_secret + # @return [String] Twitter username of bot + attr_accessor :username + # @return [Array] list of usernames to block on contact + attr_accessor :blacklist + # @return [Hash{String => Ebooks::Conversation}] maps tweet ids to their conversation contexts attr_accessor :conversations + # @return [Range, Integer] range of seconds to delay in delay method + attr_accessor :delay_range - @@all = [] # List of all defined bots - def self.all; @@all; end + # @return [Array] list of all defined bots + def self.all; @@all ||= []; end - def self.get(name) - all.find { |bot| bot.username == name } + # Fetches a bot by username + # @param username [String] + # @return [Ebooks::Bot] + def self.get(username) + all.find { |bot| bot.username == username } end + # Logs info to stdout in the context of this bot def log(*args) STDOUT.print "@#{@username}: " + args.map(&:to_s).join(' ') + "\n" STDOUT.flush end - def initialize(*args, &b) - @username ||= nil + # Initializes and configures bot + # @param username [String] Twitter username of the bot + # @param b Block to call with new bot + def initialize(username, &b) @blacklist ||= [] - @delay_range ||= 0 - - @users ||= {} @userinfo ||= {} @conversations ||= {} - configure(*args, &b) - # Tweet ids we've already observed, to avoid
duplication @seen_tweets ||= {} + + @username = username + configure; b.call(self) unless b.nil? + Bot.all << self end + # Find information we've collected about a user + # @param username [String] + # @return [Ebooks::UserInfo] def userinfo(username) - @users[username] ||= UserInfo.new(username) + @userinfo[username] ||= UserInfo.new(username) end - # Grab or create the conversation context for this tweet + # Find or create the conversation context for this tweet + # @param tweet [Twitter::Tweet] + # @return [Ebooks::Conversation] def conversation(tweet) conv = if tweet.in_reply_to_status_id? @conversations[tweet.in_reply_to_status_id] @@ -175,6 +206,7 @@ module Ebooks conv end + # @return [Twitter::REST::Client] underlying REST client from twitter gem def twitter @twitter ||= Twitter::REST::Client.new do |config| config.consumer_key = @consumer_key @@ -184,6 +216,7 @@ module Ebooks end end + # @return [Twitter::Streaming::Client] underlying streaming client from twitter gem def stream @stream ||= Twitter::Streaming::Client.new do |config| config.consumer_key = @consumer_key @@ -194,11 +227,14 @@ module Ebooks end # Calculate some meta information about a tweet relevant for replying + # @param ev [Twitter::Tweet] + # @return [Ebooks::TweetMeta] def calc_meta(ev) TweetMeta.new(self, ev) end # Receive an event from the twitter stream + # @param ev [Object] Twitter streaming event def receive_event(ev) if ev.is_a? Array # Initial array sent on first connection log "Online!" @@ -250,14 +286,7 @@ module Ebooks end end - def start_stream - log "starting tweet stream" - - stream.user do |ev| - receive_event ev - end - end - + # Configures client and fires startup event def prepare # Sanity check if @username.nil?
@@ -268,12 +297,18 @@ module Ebooks fire(:startup) end - # Connects to tweetstream and opens event handlers for this bot + # Start running user event stream def start - start_stream + log "starting tweet stream" + + stream.user do |ev| + receive_event ev + end end # Fire an event + # @param event [Symbol] event to fire + # @param args arguments for event handler def fire(event, *args) handler = "on_#{event}".to_sym if respond_to? handler @@ -281,11 +316,17 @@ module Ebooks end end - def delay(&b) - time = @delay.to_a.sample unless @delay.is_a? Integer + # Delay an action for a variable period of time + # @param range [Range, Integer] range of seconds to choose for delay + def delay(range=@delay_range, &b) + time = range.is_a?(Integer) ? range : range.to_a.sample sleep time + b.call end + # Check if a username is blacklisted + # @param username [String] + # @return [Boolean] def blacklisted?(username) if @blacklist.include?(username) true @@ -295,6 +336,9 @@ module Ebooks end # Reply to a tweet or a DM. + # @param ev [Twitter::Tweet, Twitter::DirectMessage] + # @param text [String] contents of reply excluding reply_prefix + # @param opts [Hash] additional params to pass to twitter gem def reply(ev, text, opts={}) opts = opts.clone @@ -306,26 +350,28 @@ module Ebooks if conversation(ev).is_bot?(ev.user.screen_name) log "Not replying to suspected bot @#{ev.user.screen_name}" - return + return false end if !meta.mentions_bot? if !userinfo(ev.user.screen_name).can_pester?
log "Not replying: leaving @#{ev.user.screen_name} alone" - return + return false end end log "Replying to @#{ev.user.screen_name} with: #{meta.reply_prefix + text}" tweet = twitter.update(meta.reply_prefix + text, in_reply_to_status_id: ev.id) conversation(tweet).add(tweet) + tweet else raise Exception("Don't know how to reply to a #{ev.class}") end end + # Favorite a tweet + # @param tweet [Twitter::Tweet] def favorite(tweet) - return if blacklisted?(tweet.user.screen_name) log "Favoriting @#{tweet.user.screen_name}: #{tweet.text}" begin @@ -335,6 +381,8 @@ module Ebooks end end + # Retweet a tweet + # @param tweet [Twitter::Tweet] def retweet(tweet) log "Retweeting @#{tweet.user.screen_name}: #{tweet.text}" @@ -345,26 +393,36 @@ module Ebooks end end - def follow(*args) - log "Following #{args}" - twitter.follow(*args) + # Follow a user + # @param user [String] username or user id + def follow(user, *args) + log "Following #{user}" + twitter.follow(user, *args) end - def unfollow(*args) - log "Unfollowing #{args}" - twiter.unfollow(*args) + # Unfollow a user + # @param user [String] username or user id + def unfollow(user, *args) + log "Unfollowing #{user}" + twitter.unfollow(user, *args) end - def tweet(*args) - log "Tweeting #{args.inspect}" - twitter.update(*args) + # Tweet something + # @param text [String] + def tweet(text, *args) + log "Tweeting '#{text}'" + twitter.update(text, *args) end + # Get a scheduler for this bot + # @return [Rufus::Scheduler] def scheduler @scheduler ||= Rufus::Scheduler.new end - # could easily just be *args however the separation keeps it clean.
+ # Tweet some text with an image + # @param txt [String] + # @param pic [String] filename def pictweet(txt, pic, *args) log "Tweeting #{txt.inspect} - #{pic} #{args}" twitter.update_with_media(txt, File.new(pic), *args) diff --git a/lib/twitter_ebooks/markov.rb b/lib/twitter_ebooks/markov.rb deleted file mode 100644 index ed66fad..0000000 --- a/lib/twitter_ebooks/markov.rb +++ /dev/null @@ -1,82 +0,0 @@ -module Ebooks - # Special INTERIM token represents sentence boundaries - # This is so we can include start and end of statements in model - # Due to the way the sentence tokenizer works, can correspond - # to multiple actual parts of text (such as ^, $, \n and .?!) - INTERIM = :interim - - # This is an ngram-based Markov model optimized to build from a - # tokenized sentence list without requiring too much transformation - class MarkovModel - def self.build(sentences) - MarkovModel.new.consume(sentences) - end - - def consume(sentences) - # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...] 
- # We map by both bigrams and unigrams so we can fall back to the latter in - # cases where an input bigram is unavailable, such as starting a sentence - @sentences = sentences - @unigrams = {} - @bigrams = {} - - sentences.each_with_index do |tokens, i| - last_token = INTERIM - tokens.each_with_index do |token, j| - @unigrams[last_token] ||= [] - @unigrams[last_token] << [i, j] - - @bigrams[last_token] ||= {} - @bigrams[last_token][token] ||= [] - - if j == tokens.length-1 # Mark sentence endings - @unigrams[token] ||= [] - @unigrams[token] << INTERIM - @bigrams[last_token][token] << INTERIM - else - @bigrams[last_token][token] << [i, j+1] - end - - last_token = token - end - end - - self - end - - def find_token(index) - if index == INTERIM - INTERIM - else - @sentences[index[0]][index[1]] - end - end - - def chain(tokens) - if tokens.length == 1 - matches = @unigrams[tokens[-1]] - else - matches = @bigrams[tokens[-2]][tokens[-1]] - matches = @unigrams[tokens[-1]] if matches.length < 2 - end - - if matches.empty? - # This should never happen unless a strange token is - # supplied from outside the dataset - raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}" - end - - next_token = find_token(matches.sample) - - if next_token == INTERIM # We chose to end the sentence - return tokens - else - return chain(tokens + [next_token]) - end - end - - def generate - NLP.reconstruct(chain([INTERIM])) - end - end -end diff --git a/lib/twitter_ebooks/model.rb b/lib/twitter_ebooks/model.rb index 0f1bbad..666f78c 100644 --- a/lib/twitter_ebooks/model.rb +++ b/lib/twitter_ebooks/model.rb @@ -8,16 +8,41 @@ require 'csv' module Ebooks class Model - attr_accessor :hash, :tokens, :sentences, :mentions, :keywords + # @return [Array] + # An array of unique tokens. This is the main source of actual strings + # in the model. 
Manipulation of a token is done using its index + # in this array, which we call a "tiki" + attr_accessor :tokens - def self.consume(txtpath) - Model.new.consume(txtpath) + # @return [Array<Array<Integer>>] + # Sentences represented by arrays of tikis + attr_accessor :sentences + + # @return [Array<Array<Integer>>] + # Sentences derived from Twitter mentions + attr_accessor :mentions + + # @return [Array<String>] + # The top 200 most important keywords, in descending order + attr_accessor :keywords + + # Generate a new model from a corpus file + # @param path [String] + # @return [Ebooks::Model] + def self.consume(path) + Model.new.consume(path) end + # Generate a new model from multiple corpus files + # @param paths [Array<String>] + # @return [Ebooks::Model] def self.consume_all(paths) Model.new.consume_all(paths) end + # Load a saved model + # @param path [String] + # @return [Ebooks::Model] def self.load(path) model = Model.new model.instance_eval do @@ -30,6 +55,8 @@ module Ebooks model end + # Save model to a file + # @param path [String] def save(path) File.open(path, 'wb') do |f| f.write(Marshal.dump({ @@ -43,19 +70,22 @@ module Ebooks end def initialize - # This is the only source of actual strings in the model. It is - # an array of unique tokens. 
Manipulation of a token is mostly done - # using its index in this array, which we call a "tiki" @tokens = [] # Reverse lookup tiki by token, for faster generation @tikis = {} end + # Reverse lookup a token index from a token + # @param token [String] + # @return [Integer] def tikify(token) @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1) end + # Convert a body of text into arrays of tikis + # @param text [String] + # @return [Array>] def mass_tikify(text) sentences = NLP.sentences(text) @@ -69,9 +99,10 @@ module Ebooks end end + # Consume a corpus into this model + # @param path [String] def consume(path) content = File.read(path, :encoding => 'utf-8') - @hash = Digest::MD5.hexdigest(content) if path.split('.')[-1] == "json" log "Reading json corpus from #{path}" @@ -94,6 +125,8 @@ module Ebooks consume_lines(lines) end + # Consume a sequence of lines + # @param lines [Array] def consume_lines(lines) log "Removing commented lines and sorting mentions" @@ -126,11 +159,12 @@ module Ebooks self end + # Consume multiple corpuses into this model + # @param paths [Array] def consume_all(paths) lines = [] paths.each do |path| content = File.read(path, :encoding => 'utf-8') - @hash = Digest::MD5.hexdigest(content) if path.split('.')[-1] == "json" log "Reading json corpus from #{path}" @@ -156,25 +190,26 @@ module Ebooks consume_lines(lines) end - def fix(tweet) - # This seems to require an external api call - #begin - # fixer = NLP.gingerice.parse(tweet) - # log fixer if fixer['corrections'] - # tweet = fixer['result'] - #rescue Exception => e - # log e.message - # log e.backtrace - #end - - NLP.htmlentities.decode tweet + # Correct encoding issues in generated text + # @param text [String] + # @return [String] + def fix(text) + NLP.htmlentities.decode text end + # Check if an array of tikis comprises a valid tweet + # @param tikis [Array] + # @param limit Integer how many chars we have left def valid_tweet?(tikis, limit) tweet = 
NLP.reconstruct(tikis, @tokens) tweet.length <= limit && !NLP.unmatched_enclosers?(tweet) end + # Generate some text + # @param limit [Integer] available characters + # @param generator [SuffixGenerator, nil] + # @param retry_limit [Integer] how many times to retry on duplicates + # @return [String] def make_statement(limit=140, generator=nil, retry_limit=10) responding = !generator.nil? generator ||= SuffixGenerator.build(@sentences) @@ -209,12 +244,17 @@ module Ebooks end # Test if a sentence has been copied verbatim from original - def verbatim?(tokens) - @sentences.include?(tokens) || @mentions.include?(tokens) + # @param tikis [Array] + # @return [Boolean] + def verbatim?(tikis) + @sentences.include?(tikis) || @mentions.include?(tikis) end - # Finds all relevant tokenized sentences to given input by + # Finds relevant and slightly relevant tokenized sentences to input # comparing non-stopword token overlaps + # @param sentences [Array>] + # @param input [String] + # @return [Array>, Array>>] def find_relevant(sentences, input) relevant = [] slightly_relevant = [] @@ -235,6 +275,10 @@ module Ebooks # Generates a response by looking for related sentences # in the corpus and building a smaller generator from these + # @param input [String] + # @param limit [Integer] characters available for response + # @param sentences [Array>] + # @return [String] def make_response(input, limit=140, sentences=@mentions) # Prefer mentions relevant, slightly_relevant = find_relevant(sentences, input) diff --git a/lib/twitter_ebooks/nlp.rb b/lib/twitter_ebooks/nlp.rb index 819ee69..541720b 100644 --- a/lib/twitter_ebooks/nlp.rb +++ b/lib/twitter_ebooks/nlp.rb @@ -12,31 +12,35 @@ module Ebooks # Some of this stuff is pretty heavy and we don't necessarily need # to be using it all of the time + # Lazily loads an array of stopwords + # Stopwords are common English words that should often be ignored + # @return [Array] def self.stopwords @stopwords ||= File.read(File.join(DATA_PATH, 
'stopwords.txt')).split end + # Lazily loads an array of known English nouns + # @return [Array] def self.nouns @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split end + # Lazily loads an array of known English adjectives + # @return [Array] def self.adjectives @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split end - # POS tagger + # Lazily load part-of-speech tagging library + # This can determine whether a word is being used as a noun/adjective/verb + # @return [EngTagger] def self.tagger require 'engtagger' @tagger ||= EngTagger.new end - # Gingerice text correction service - def self.gingerice - require 'gingerice' - Gingerice::Parser.new # No caching for this one - end - - # For decoding html entities + # Lazily load HTML entity decoder + # @return [HTMLEntities] def self.htmlentities require 'htmlentities' @htmlentities ||= HTMLEntities.new @@ -44,7 +48,9 @@ module Ebooks ### Utility functions - # We don't really want to deal with all this weird unicode punctuation + # Normalize some strange unicode punctuation variants + # @param text [String] + # @return [String] def self.normalize(text) htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...') end @@ -53,6 +59,8 @@ module Ebooks # We use ad hoc approach because fancy libraries do not deal # especially well with tweet formatting, and we can fake solving # the quote problem during generation + # @param text [String] + # @return [Array] def self.sentences(text) text.split(/\n+|(?<=[.?!])\s+/) end @@ -60,15 +68,23 @@ module Ebooks # Split a sentence into word-level tokens # As above, this is ad hoc because tokenization libraries # do not behave well wrt. things like emoticons and timestamps + # @param sentence [String] + # @return [Array] def self.tokenize(sentence) regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/ sentence.split(regex) end + # Get the 'stem' form of a word e.g. 
'cats' -> 'cat' + # @param word [String] + # @return [String] def self.stem(word) Stemmer::stem_word(word.downcase) end + # Use highscore gem to find interesting keywords in a corpus + # @param text [String] + # @return [Highscore::Keywords] def self.keywords(text) # Preprocess to remove stopwords (highscore's blacklist is v. slow) text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ') @@ -90,7 +106,10 @@ module Ebooks text.keywords end - # Takes a list of tokens and builds a nice-looking sentence + # Builds a proper sentence from a list of tikis + # @param tikis [Array] + # @param tokens [Array] + # @return [String] def self.reconstruct(tikis, tokens) text = "" last_token = nil @@ -105,6 +124,9 @@ module Ebooks end # Determine if we need to insert a space between two tokens + # @param token1 [String] + # @param token2 [String] + # @return [Boolean] def self.space_between?(token1, token2) p1 = self.punctuation?(token1) p2 = self.punctuation?(token2) @@ -119,10 +141,16 @@ module Ebooks end end + # Is this token comprised of punctuation? + # @param token [String] + # @return [Boolean] def self.punctuation?(token) (token.chars.to_set - PUNCTUATION.chars.to_set).empty? end + # Is this token a stopword? 
+ # @param token [String] + # @return [Boolean] def self.stopword?(token) @stopword_set ||= stopwords.map(&:downcase).to_set @stopword_set.include?(token.downcase) @@ -130,7 +158,9 @@ module Ebooks # Determine if a sample of text contains unmatched brackets or quotes # This is one of the more frequent and noticeable failure modes for - # the markov generator; we can just tell it to retry + # the generator; we can just tell it to retry + # @param text [String] + # @return [Boolean] def self.unmatched_enclosers?(text) enclosers = ['**', '""', '()', '[]', '``', "''"] enclosers.each do |pair| @@ -153,10 +183,13 @@ module Ebooks end # Determine if a2 is a subsequence of a1 + # @param a1 [Array] + # @param a2 [Array] + # @return [Boolean] def self.subseq?(a1, a2) - a1.each_index.find do |i| + !a1.each_index.find do |i| a1[i...i+a2.length] == a2 - end + end.nil? end end end diff --git a/lib/twitter_ebooks/suffix.rb b/lib/twitter_ebooks/suffix.rb index 6e09d7e..83c6f88 100644 --- a/lib/twitter_ebooks/suffix.rb +++ b/lib/twitter_ebooks/suffix.rb @@ -1,11 +1,14 @@ # encoding: utf-8 module Ebooks - # This generator uses data identical to the markov model, but + # This generator uses data identical to a markov model, but # instead of making a chain by looking up bigrams it uses the # positions to randomly replace suffixes in one sentence with # matching suffixes in another class SuffixGenerator + # Build a generator from a corpus of tikified sentences + # @param sentences [Array>] + # @return [SuffixGenerator] def self.build(sentences) SuffixGenerator.new(sentences) end @@ -39,6 +42,11 @@ module Ebooks self end + + # Generate a recombined sequence of tikis + # @param passes [Integer] number of times to recombine + # @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is) + # @return [Array] def generate(passes=5, n=:unigrams) index = rand(@sentences.length) tikis = @sentences[index] diff --git a/lib/twitter_ebooks/version.rb 
b/lib/twitter_ebooks/version.rb index 9456226..37ab8f8 100644 --- a/lib/twitter_ebooks/version.rb +++ b/lib/twitter_ebooks/version.rb @@ -1,3 +1,3 @@ module Ebooks - VERSION = "2.3.2" + VERSION = "3.0.0" end diff --git a/spec/bot_spec.rb b/spec/bot_spec.rb index f7d0dec..802b937 100644 --- a/spec/bot_spec.rb +++ b/spec/bot_spec.rb @@ -3,8 +3,6 @@ require 'memory_profiler' require 'tempfile' require 'timecop' -def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end - class TestBot < Ebooks::Bot attr_accessor :twitter diff --git a/twitter_ebooks.gemspec b/twitter_ebooks.gemspec index 6224cf9..2e68482 100644 --- a/twitter_ebooks.gemspec +++ b/twitter_ebooks.gemspec @@ -20,6 +20,7 @@ Gem::Specification.new do |gem| gem.add_development_dependency 'memory_profiler' gem.add_development_dependency 'timecop' gem.add_development_dependency 'pry-byebug' + gem.add_development_dependency 'yard' gem.add_runtime_dependency 'twitter', '~> 5.0' gem.add_runtime_dependency 'simple_oauth'