Lots of documentation and cleanup

Jaiden Mispy 2014-12-05 21:12:39 +11:00
parent efde0fd16f
commit 1977445b1c
11 changed files with 237 additions and 178 deletions

.gitignore vendored
View file

@ -1,3 +1,5 @@
.*.swp
Gemfile.lock
pkg
.yardoc
doc

View file

@ -4,8 +4,6 @@
require 'twitter_ebooks'
require 'ostruct'
$debug = true
module Ebooks::CLI
APP_PATH = Dir.pwd # XXX do some recursive thing instead
HELP = OpenStruct.new

View file

@ -15,7 +15,6 @@ end
require 'twitter_ebooks/nlp'
require 'twitter_ebooks/archive'
require 'twitter_ebooks/markov'
require 'twitter_ebooks/suffix'
require 'twitter_ebooks/model'
require 'twitter_ebooks/bot'

View file

@ -6,10 +6,11 @@ module Ebooks
class ConfigurationError < Exception
end
# Information about a particular Twitter user we know
class UserInfo
attr_reader :username
# number of times we've interacted with a timeline tweet, unprompted
# @return [Integer] how many times we can pester this user unprompted
attr_accessor :pesters_left
def initialize(username)
@ -17,6 +18,7 @@ module Ebooks
@pesters_left = 1
end
# @return [Boolean] true if we're allowed to pester this user
def can_pester?
@pesters_left > 0
end
@ -32,6 +34,7 @@ module Ebooks
@last_update = Time.now
end
# @param tweet [Twitter::Tweet] tweet to add
def add(tweet)
@tweets << tweet
@last_update = Time.now
@ -61,14 +64,24 @@ module Ebooks
# Meta information about a tweet that we calculate for ourselves
class TweetMeta
attr_accessor :mentions # array: usernames mentioned in tweet
attr_accessor :mentionless # string: text of tweet with mentions removed
attr_accessor :reply_mentions # array: usernames to include in a reply
attr_accessor :reply_prefix # string: processed string to start reply with
attr_accessor :limit # integer: available room to calculate reply
# @return [Array<String>] usernames mentioned in tweet
attr_accessor :mentions
# @return [String] text of the tweet with mentions removed
attr_accessor :mentionless
# @return [Array<String>] usernames to include in a reply
attr_accessor :reply_mentions
# @return [String] mentions to start reply with
attr_accessor :reply_prefix
# @return [Integer] available chars for reply
attr_accessor :limit
attr_accessor :bot, :tweet
# @return [Ebooks::Bot] associated bot
attr_accessor :bot
# @return [Twitter::Tweet] associated tweet
attr_accessor :tweet
# Check whether this tweet mentions our bot
# @return [Boolean]
def mentions_bot?
# To check if this is someone talking to us, ensure:
# - The tweet mentions list contains our username
@ -110,47 +123,65 @@ module Ebooks
end
class Bot
attr_accessor :consumer_key, :consumer_secret,
:access_token, :access_token_secret
attr_reader :twitter, :stream, :thread
# Configuration
attr_accessor :username, :delay_range, :blacklist
# @return [String] OAuth consumer key for a Twitter app
attr_accessor :consumer_key
# @return [String] OAuth consumer secret for a Twitter app
attr_accessor :consumer_secret
# @return [String] OAuth access token from `ebooks auth`
attr_accessor :access_token
# @return [String] OAuth access secret from `ebooks auth`
attr_accessor :access_token_secret
# @return [String] Twitter username of bot
attr_accessor :username
# @return [Array<String>] list of usernames to block on contact
attr_accessor :blacklist
# @return [Hash{String => Ebooks::Conversation}] maps tweet ids to their conversation contexts
attr_accessor :conversations
# @return [Range, Integer] default range of seconds to sleep in the delay method
attr_accessor :delay_range
@@all = [] # List of all defined bots
def self.all; @@all; end
# @return [Array] list of all defined bots
def self.all; @@all ||= []; end
def self.get(name)
all.find { |bot| bot.username == name }
# Fetches a bot by username
# @param username [String]
# @return [Ebooks::Bot]
def self.get(username)
all.find { |bot| bot.username == username }
end
# Logs info to stdout in the context of this bot
def log(*args)
STDOUT.print "@#{@username}: " + args.map(&:to_s).join(' ') + "\n"
STDOUT.flush
end
def initialize(*args, &b)
@username ||= nil
# Initializes and configures bot
# @param username [String] Twitter username of the bot
# @param b Block to call with new bot
def initialize(username, &b)
@blacklist ||= []
@delay_range ||= 0
@users ||= {}
@userinfo ||= {}
@conversations ||= {}
configure(*args, &b)
# Tweet ids we've already observed, to avoid duplication
@seen_tweets ||= {}
@username = username
configure(&b)
Bot.all << self
end
# Find information we've collected about a user
# @param username [String]
# @return [Ebooks::UserInfo]
def userinfo(username)
@users[username] ||= UserInfo.new(username)
@userinfo[username] ||= UserInfo.new(username)
end
# Grab or create the conversation context for this tweet
# Find or create the conversation context for this tweet
# @param tweet [Twitter::Tweet]
# @return [Ebooks::Conversation]
def conversation(tweet)
conv = if tweet.in_reply_to_status_id?
@conversations[tweet.in_reply_to_status_id]
@ -175,6 +206,7 @@ module Ebooks
conv
end
# @return [Twitter::REST::Client] underlying REST client from twitter gem
def twitter
@twitter ||= Twitter::REST::Client.new do |config|
config.consumer_key = @consumer_key
@ -184,6 +216,7 @@ module Ebooks
end
end
# @return [Twitter::Streaming::Client] underlying streaming client from twitter gem
def stream
@stream ||= Twitter::Streaming::Client.new do |config|
config.consumer_key = @consumer_key
@ -194,11 +227,14 @@ module Ebooks
end
# Calculate some meta information about a tweet relevant for replying
# @param ev [Twitter::Tweet]
# @return [Ebooks::TweetMeta]
def calc_meta(ev)
TweetMeta.new(self, ev)
end
# Receive an event from the twitter stream
# @param ev [Object] Twitter streaming event
def receive_event(ev)
if ev.is_a? Array # Initial array sent on first connection
log "Online!"
@ -250,14 +286,7 @@ module Ebooks
end
end
def start_stream
log "starting tweet stream"
stream.user do |ev|
receive_event ev
end
end
# Configures client and fires startup event
def prepare
# Sanity check
if @username.nil?
@ -268,12 +297,18 @@ module Ebooks
fire(:startup)
end
# Connects to tweetstream and opens event handlers for this bot
# Start running user event stream
def start
start_stream
log "starting tweet stream"
stream.user do |ev|
receive_event ev
end
end
# Fire an event
# @param event [Symbol] event to fire
# @param args arguments for event handler
def fire(event, *args)
handler = "on_#{event}".to_sym
if respond_to? handler
@ -281,11 +316,17 @@ module Ebooks
end
end
def delay(&b)
time = @delay.to_a.sample unless @delay.is_a? Integer
# Delay an action for a variable period of time
# @param range [Range, Integer] range of seconds to choose for delay
def delay(range=@delay_range, &b)
time = range.is_a?(Integer) ? range : range.to_a.sample
sleep time
b.call
end
# Check if a username is blacklisted
# @param username [String]
# @return [Boolean]
def blacklisted?(username)
if @blacklist.include?(username)
true
@ -295,6 +336,9 @@ module Ebooks
end
# Reply to a tweet or a DM.
# @param ev [Twitter::Tweet, Twitter::DirectMessage]
# @param text [String] contents of reply excluding reply_prefix
# @param opts [Hash] additional params to pass to twitter gem
def reply(ev, text, opts={})
opts = opts.clone
@ -306,26 +350,28 @@ module Ebooks
if conversation(ev).is_bot?(ev.user.screen_name)
log "Not replying to suspected bot @#{ev.user.screen_name}"
return
return false
end
if !meta.mentions_bot?
if !userinfo(ev.user.screen_name).can_pester?
log "Not replying: leaving @#{ev.user.screen_name} alone"
return
return false
end
end
log "Replying to @#{ev.user.screen_name} with: #{meta.reply_prefix + text}"
tweet = twitter.update(meta.reply_prefix + text, in_reply_to_status_id: ev.id)
conversation(tweet).add(tweet)
tweet
else
raise Exception.new("Don't know how to reply to a #{ev.class}")
end
end
# Favorite a tweet
# @param tweet [Twitter::Tweet]
def favorite(tweet)
return if blacklisted?(tweet.user.screen_name)
log "Favoriting @#{tweet.user.screen_name}: #{tweet.text}"
begin
@ -335,6 +381,8 @@ module Ebooks
end
end
# Retweet a tweet
# @param tweet [Twitter::Tweet]
def retweet(tweet)
log "Retweeting @#{tweet.user.screen_name}: #{tweet.text}"
@ -345,26 +393,36 @@ module Ebooks
end
end
def follow(*args)
log "Following #{args}"
twitter.follow(*args)
# Follow a user
# @param user [String] username or user id
def follow(user, *args)
log "Following #{user}"
twitter.follow(user, *args)
end
def unfollow(*args)
log "Unfollowing #{args}"
twiter.unfollow(*args)
# Unfollow a user
# @param user [String] username or user id
def unfollow(user, *args)
log "Unfollowing #{user}"
twitter.unfollow(user, *args)
end
def tweet(*args)
log "Tweeting #{args.inspect}"
twitter.update(*args)
# Tweet something
# @param text [String]
def tweet(text, *args)
log "Tweeting '#{text}'"
twitter.update(text, *args)
end
# Get a scheduler for this bot
# @return [Rufus::Scheduler]
def scheduler
@scheduler ||= Rufus::Scheduler.new
end
# could easily just be *args however the separation keeps it clean.
# Tweet some text with an image
# @param txt [String]
# @param pic [String] filename
def pictweet(txt, pic, *args)
log "Tweeting #{txt.inspect} - #{pic} #{args}"
twitter.update_with_media(txt, File.new(pic), *args)
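
As a usage note (not part of this commit), the Bot API documented above composes roughly like the sketch below. The class name, handle, environment variable names and the :mention handler are illustrative assumptions, not taken from this diff.

class MyBot < Ebooks::Bot
  def configure
    # OAuth credentials from `ebooks auth`; the env var names here are hypothetical
    self.consumer_key = ENV['CONSUMER_KEY']
    self.consumer_secret = ENV['CONSUMER_SECRET']
    self.access_token = ENV['ACCESS_TOKEN']
    self.access_token_secret = ENV['ACCESS_TOKEN_SECRET']
    self.blacklist = ['some_spammer']   # usernames to block on contact
    self.delay_range = 1..6             # seconds to sleep before delayed actions
  end

  def on_startup
    log "started"
  end

  # Assumes the stream fires a :mention event for tweets mentioning the bot,
  # as in the released gem; fire(:mention, tweet) would dispatch here
  def on_mention(tweet)
    delay { reply(tweet, "hello!") }
  end
end

MyBot.new('my_ebooks_handle')           # registers itself in Bot.all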

View file

@ -1,82 +0,0 @@
module Ebooks
# Special INTERIM token represents sentence boundaries
# This is so we can include start and end of statements in model
# Due to the way the sentence tokenizer works, can correspond
# to multiple actual parts of text (such as ^, $, \n and .?!)
INTERIM = :interim
# This is an ngram-based Markov model optimized to build from a
# tokenized sentence list without requiring too much transformation
class MarkovModel
def self.build(sentences)
MarkovModel.new.consume(sentences)
end
def consume(sentences)
# These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
# We map by both bigrams and unigrams so we can fall back to the latter in
# cases where an input bigram is unavailable, such as starting a sentence
@sentences = sentences
@unigrams = {}
@bigrams = {}
sentences.each_with_index do |tokens, i|
last_token = INTERIM
tokens.each_with_index do |token, j|
@unigrams[last_token] ||= []
@unigrams[last_token] << [i, j]
@bigrams[last_token] ||= {}
@bigrams[last_token][token] ||= []
if j == tokens.length-1 # Mark sentence endings
@unigrams[token] ||= []
@unigrams[token] << INTERIM
@bigrams[last_token][token] << INTERIM
else
@bigrams[last_token][token] << [i, j+1]
end
last_token = token
end
end
self
end
def find_token(index)
if index == INTERIM
INTERIM
else
@sentences[index[0]][index[1]]
end
end
def chain(tokens)
if tokens.length == 1
matches = @unigrams[tokens[-1]]
else
matches = @bigrams[tokens[-2]][tokens[-1]]
matches = @unigrams[tokens[-1]] if matches.length < 2
end
if matches.empty?
# This should never happen unless a strange token is
# supplied from outside the dataset
raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
end
next_token = find_token(matches.sample)
if next_token == INTERIM # We chose to end the sentence
return tokens
else
return chain(tokens + [next_token])
end
end
def generate
NLP.reconstruct(chain([INTERIM]))
end
end
end

View file

@ -8,16 +8,41 @@ require 'csv'
module Ebooks
class Model
attr_accessor :hash, :tokens, :sentences, :mentions, :keywords
# @return [Array<String>]
# An array of unique tokens. This is the main source of actual strings
# in the model. Manipulation of a token is done using its index
# in this array, which we call a "tiki"
attr_accessor :tokens
def self.consume(txtpath)
Model.new.consume(txtpath)
# @return [Array<Array<Integer>>]
# Sentences represented by arrays of tikis
attr_accessor :sentences
# @return [Array<Array<Integer>>]
# Sentences derived from Twitter mentions
attr_accessor :mentions
# @return [Array<String>]
# The top 200 most important keywords, in descending order
attr_accessor :keywords
# Generate a new model from a corpus file
# @param path [String]
# @return [Ebooks::Model]
def self.consume(path)
Model.new.consume(path)
end
# Generate a new model from multiple corpus files
# @param paths [Array<String>]
# @return [Ebooks::Model]
def self.consume_all(paths)
Model.new.consume_all(paths)
end
# Load a saved model
# @param path [String]
# @return [Ebooks::Model]
def self.load(path)
model = Model.new
model.instance_eval do
@ -30,6 +55,8 @@ module Ebooks
model
end
# Save model to a file
# @param path [String]
def save(path)
File.open(path, 'wb') do |f|
f.write(Marshal.dump({
@ -43,19 +70,22 @@ module Ebooks
end
def initialize
# This is the only source of actual strings in the model. It is
# an array of unique tokens. Manipulation of a token is mostly done
# using its index in this array, which we call a "tiki"
@tokens = []
# Reverse lookup tiki by token, for faster generation
@tikis = {}
end
# Reverse lookup a token index from a token
# @param token [String]
# @return [Integer]
def tikify(token)
@tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
end
# Convert a body of text into arrays of tikis
# @param text [String]
# @return [Array<Array<Integer>>]
def mass_tikify(text)
sentences = NLP.sentences(text)
@ -69,9 +99,10 @@ module Ebooks
end
end
# Consume a corpus into this model
# @param path [String]
def consume(path)
content = File.read(path, :encoding => 'utf-8')
@hash = Digest::MD5.hexdigest(content)
if path.split('.')[-1] == "json"
log "Reading json corpus from #{path}"
@ -94,6 +125,8 @@ module Ebooks
consume_lines(lines)
end
# Consume a sequence of lines
# @param lines [Array<String>]
def consume_lines(lines)
log "Removing commented lines and sorting mentions"
@ -126,11 +159,12 @@ module Ebooks
self
end
# Consume multiple corpuses into this model
# @param paths [Array<String>]
def consume_all(paths)
lines = []
paths.each do |path|
content = File.read(path, :encoding => 'utf-8')
@hash = Digest::MD5.hexdigest(content)
if path.split('.')[-1] == "json"
log "Reading json corpus from #{path}"
@ -156,25 +190,26 @@ module Ebooks
consume_lines(lines)
end
def fix(tweet)
# This seems to require an external api call
#begin
# fixer = NLP.gingerice.parse(tweet)
# log fixer if fixer['corrections']
# tweet = fixer['result']
#rescue Exception => e
# log e.message
# log e.backtrace
#end
NLP.htmlentities.decode tweet
# Correct encoding issues in generated text
# @param text [String]
# @return [String]
def fix(text)
NLP.htmlentities.decode text
end
# Check if an array of tikis comprises a valid tweet
# @param tikis [Array<Integer>]
# @param limit [Integer] how many chars we have left
def valid_tweet?(tikis, limit)
tweet = NLP.reconstruct(tikis, @tokens)
tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
end
# Generate some text
# @param limit [Integer] available characters
# @param generator [SuffixGenerator, nil]
# @param retry_limit [Integer] how many times to retry on duplicates
# @return [String]
def make_statement(limit=140, generator=nil, retry_limit=10)
responding = !generator.nil?
generator ||= SuffixGenerator.build(@sentences)
@ -209,12 +244,17 @@ module Ebooks
end
# Test if a sentence has been copied verbatim from original
def verbatim?(tokens)
@sentences.include?(tokens) || @mentions.include?(tokens)
# @param tikis [Array<Integer>]
# @return [Boolean]
def verbatim?(tikis)
@sentences.include?(tikis) || @mentions.include?(tikis)
end
# Finds all relevant tokenized sentences to given input by
# Finds relevant and slightly relevant tokenized sentences to input by
# comparing non-stopword token overlaps
# @param sentences [Array<Array<Integer>>]
# @param input [String]
# @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
def find_relevant(sentences, input)
relevant = []
slightly_relevant = []
@ -235,6 +275,10 @@ module Ebooks
# Generates a response by looking for related sentences
# in the corpus and building a smaller generator from these
# @param input [String]
# @param limit [Integer] characters available for response
# @param sentences [Array<Array<Integer>>]
# @return [String]
def make_response(input, limit=140, sentences=@mentions)
# Prefer mentions
relevant, slightly_relevant = find_relevant(sentences, input)
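
As a usage note (not part of this commit), the Model methods documented above chain together roughly as follows; the corpus and model file paths are hypothetical:

model = Ebooks::Model.consume('corpus/my_account.json')   # build from a corpus file
model.save('model/my_account.model')                       # marshal to disk

model = Ebooks::Model.load('model/my_account.model')
puts model.make_statement(140)                       # free-form tweet up to 140 chars
puts model.make_response('how are the cats?', 120)   # reply seeded from mention-derived sentences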

View file

@ -12,31 +12,35 @@ module Ebooks
# Some of this stuff is pretty heavy and we don't necessarily need
# to be using it all of the time
# Lazily loads an array of stopwords
# Stopwords are common English words that should often be ignored
# @return [Array<String>]
def self.stopwords
@stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
end
# Lazily loads an array of known English nouns
# @return [Array<String>]
def self.nouns
@nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
end
# Lazily loads an array of known English adjectives
# @return [Array<String>]
def self.adjectives
@adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
end
# POS tagger
# Lazily load part-of-speech tagging library
# This can determine whether a word is being used as a noun/adjective/verb
# @return [EngTagger]
def self.tagger
require 'engtagger'
@tagger ||= EngTagger.new
end
# Gingerice text correction service
def self.gingerice
require 'gingerice'
Gingerice::Parser.new # No caching for this one
end
# For decoding html entities
# Lazily load HTML entity decoder
# @return [HTMLEntities]
def self.htmlentities
require 'htmlentities'
@htmlentities ||= HTMLEntities.new
@ -44,7 +48,9 @@ module Ebooks
### Utility functions
# We don't really want to deal with all this weird unicode punctuation
# Normalize some strange unicode punctuation variants
# @param text [String]
# @return [String]
def self.normalize(text)
htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
end
@ -53,6 +59,8 @@ module Ebooks
# We use ad hoc approach because fancy libraries do not deal
# especially well with tweet formatting, and we can fake solving
# the quote problem during generation
# @param text [String]
# @return [Array<String>]
def self.sentences(text)
text.split(/\n+|(?<=[.?!])\s+/)
end
@ -60,15 +68,23 @@ module Ebooks
# Split a sentence into word-level tokens
# As above, this is ad hoc because tokenization libraries
# do not behave well wrt. things like emoticons and timestamps
# @param sentence [String]
# @return [Array<String>]
def self.tokenize(sentence)
regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
sentence.split(regex)
end
# Get the 'stem' form of a word e.g. 'cats' -> 'cat'
# @param word [String]
# @return [String]
def self.stem(word)
Stemmer::stem_word(word.downcase)
end
# Use highscore gem to find interesting keywords in a corpus
# @param text [String]
# @return [Highscore::Keywords]
def self.keywords(text)
# Preprocess to remove stopwords (highscore's blacklist is v. slow)
text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
@ -90,7 +106,10 @@ module Ebooks
text.keywords
end
# Takes a list of tokens and builds a nice-looking sentence
# Builds a proper sentence from a list of tikis
# @param tikis [Array<Integer>]
# @param tokens [Array<String>]
# @return [String]
def self.reconstruct(tikis, tokens)
text = ""
last_token = nil
@ -105,6 +124,9 @@ module Ebooks
end
# Determine if we need to insert a space between two tokens
# @param token1 [String]
# @param token2 [String]
# @return [Boolean]
def self.space_between?(token1, token2)
p1 = self.punctuation?(token1)
p2 = self.punctuation?(token2)
@ -119,10 +141,16 @@ module Ebooks
end
end
# Is this token comprised of punctuation?
# @param token [String]
# @return [Boolean]
def self.punctuation?(token)
(token.chars.to_set - PUNCTUATION.chars.to_set).empty?
end
# Is this token a stopword?
# @param token [String]
# @return [Boolean]
def self.stopword?(token)
@stopword_set ||= stopwords.map(&:downcase).to_set
@stopword_set.include?(token.downcase)
@ -130,7 +158,9 @@ module Ebooks
# Determine if a sample of text contains unmatched brackets or quotes
# This is one of the more frequent and noticeable failure modes for
# the markov generator; we can just tell it to retry
# the generator; we can just tell it to retry
# @param text [String]
# @return [Boolean]
def self.unmatched_enclosers?(text)
enclosers = ['**', '""', '()', '[]', '``', "''"]
enclosers.each do |pair|
@ -153,10 +183,13 @@ module Ebooks
end
# Determine if a2 is a subsequence of a1
# @param a1 [Array]
# @param a2 [Array]
# @return [Boolean]
def self.subseq?(a1, a2)
a1.each_index.find do |i|
!a1.each_index.find do |i|
a1[i...i+a2.length] == a2
end
end.nil?
end
end
end
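
A small sketch (not part of this commit) exercising the NLP helpers documented above; the sample strings are made up:

text   = Ebooks::NLP.normalize('“Cats are great… aren’t they?”')  # straighten curly quotes and ellipses
sents  = Ebooks::NLP.sentences(text)                               # split on newlines and .?! boundaries
tokens = Ebooks::NLP.tokenize(sents.first)                         # ad hoc word-level tokens
tokens.reject { |t| Ebooks::NLP.stopword?(t) }                     # drop common stopwords
Ebooks::NLP.stem('cats')                                           # => "cat"
Ebooks::NLP.subseq?([1, 2, 3, 4], [2, 3])                          # => true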

View file

@ -1,11 +1,14 @@
# encoding: utf-8
module Ebooks
# This generator uses data identical to the markov model, but
# This generator uses data identical to a markov model, but
# instead of making a chain by looking up bigrams it uses the
# positions to randomly replace suffixes in one sentence with
# matching suffixes in another
class SuffixGenerator
# Build a generator from a corpus of tikified sentences
# @param sentences [Array<Array<Integer>>]
# @return [SuffixGenerator]
def self.build(sentences)
SuffixGenerator.new(sentences)
end
@ -39,6 +42,11 @@ module Ebooks
self
end
# Generate a recombined sequence of tikis
# @param passes [Integer] number of times to recombine
# @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
# @return [Array<Integer>]
def generate(passes=5, n=:unigrams)
index = rand(@sentences.length)
tikis = @sentences[index]
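
A sketch (not part of this commit) of how the generator pairs with a saved model; the model path is hypothetical:

model = Ebooks::Model.load('model/my_account.model')
generator = Ebooks::SuffixGenerator.build(model.sentences)
tikis = generator.generate(5, :bigrams)             # :bigrams is more conservative than :unigrams
puts Ebooks::NLP.reconstruct(tikis, model.tokens)   # turn tikis back into a sentence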

View file

@ -1,3 +1,3 @@
module Ebooks
VERSION = "2.3.2"
VERSION = "3.0.0"
end

View file

@ -3,8 +3,6 @@ require 'memory_profiler'
require 'tempfile'
require 'timecop'
def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end
class TestBot < Ebooks::Bot
attr_accessor :twitter

View file

@ -20,6 +20,7 @@ Gem::Specification.new do |gem|
gem.add_development_dependency 'memory_profiler'
gem.add_development_dependency 'timecop'
gem.add_development_dependency 'pry-byebug'
gem.add_development_dependency 'yard'
gem.add_runtime_dependency 'twitter', '~> 5.0'
gem.add_runtime_dependency 'simple_oauth'