Lots of documentation and cleanup
parent efde0fd16f
commit 1977445b1c

11 changed files with 237 additions and 178 deletions

.gitignore (vendored)
@@ -1,3 +1,5 @@
 .*.swp
 Gemfile.lock
 pkg
+.yardoc
+doc
bin/ebooks

@@ -4,8 +4,6 @@
 require 'twitter_ebooks'
 require 'ostruct'

-$debug = true
-
 module Ebooks::CLI
   APP_PATH = Dir.pwd # XXX do some recursive thing instead
   HELP = OpenStruct.new
lib/twitter_ebooks.rb

@@ -15,7 +15,6 @@ end

 require 'twitter_ebooks/nlp'
 require 'twitter_ebooks/archive'
-require 'twitter_ebooks/markov'
 require 'twitter_ebooks/suffix'
 require 'twitter_ebooks/model'
 require 'twitter_ebooks/bot'
lib/twitter_ebooks/bot.rb

@@ -6,10 +6,11 @@ module Ebooks
   class ConfigurationError < Exception
   end

+  # Information about a particular Twitter user we know
   class UserInfo
     attr_reader :username

-    # number of times we've interacted with a timeline tweet, unprompted
+    # @return [Integer] how many times we can pester this user unprompted
     attr_accessor :pesters_left

     def initialize(username)
@@ -17,6 +18,7 @@ module Ebooks
       @pesters_left = 1
     end

+    # @return [Boolean] true if we're allowed to pester this user
     def can_pester?
       @pesters_left > 0
     end
@@ -32,6 +34,7 @@ module Ebooks
       @last_update = Time.now
     end

+    # @param tweet [Twitter::Tweet] tweet to add
    def add(tweet)
      @tweets << tweet
      @last_update = Time.now
@@ -61,14 +64,24 @@ module Ebooks

   # Meta information about a tweet that we calculate for ourselves
   class TweetMeta
-    attr_accessor :mentions # array: usernames mentioned in tweet
-    attr_accessor :mentionless # string: text of tweet with mentions removed
-    attr_accessor :reply_mentions # array: usernames to include in a reply
-    attr_accessor :reply_prefix # string: processed string to start reply with
-    attr_accessor :limit # integer: available room to calculate reply
+    # @return [Array<String>] usernames mentioned in tweet
+    attr_accessor :mentions
+    # @return [String] text of tweet with mentions removed
+    attr_accessor :mentionless
+    # @return [Array<String>] usernames to include in a reply
+    attr_accessor :reply_mentions
+    # @return [String] mentions to start reply with
+    attr_accessor :reply_prefix
+    # @return [Integer] available chars for reply
+    attr_accessor :limit

-    attr_accessor :bot, :tweet
+    # @return [Ebooks::Bot] associated bot
+    attr_accessor :bot
+    # @return [Twitter::Tweet] associated tweet
+    attr_accessor :tweet

+    # Check whether this tweet mentions our bot
+    # @return [Boolean]
     def mentions_bot?
       # To check if this is someone talking to us, ensure:
       # - The tweet mentions list contains our username
@@ -110,47 +123,65 @@ module Ebooks
   end

   class Bot
-    attr_accessor :consumer_key, :consumer_secret,
-                  :access_token, :access_token_secret
-
-    attr_reader :twitter, :stream, :thread
-
-    # Configuration
-    attr_accessor :username, :delay_range, :blacklist
+    # @return [String] OAuth consumer key for a Twitter app
+    attr_accessor :consumer_key
+    # @return [String] OAuth consumer secret for a Twitter app
+    attr_accessor :consumer_secret
+    # @return [String] OAuth access token from `ebooks auth`
+    attr_accessor :access_token
+    # @return [String] OAuth access secret from `ebooks auth`
+    attr_accessor :access_token_secret
+    # @return [String] Twitter username of bot
+    attr_accessor :username
+    # @return [Array<String>] list of usernames to block on contact
+    attr_accessor :blacklist
+    # @return [Hash{String => Ebooks::Conversation}] maps tweet ids to their conversation contexts
     attr_accessor :conversations
+    # @return [Range, Integer] range of seconds to delay in delay method
+    attr_accessor :delay

-    @@all = [] # List of all defined bots
-    def self.all; @@all; end
+    # @return [Array] list of all defined bots
+    def self.all; @@all ||= []; end

-    def self.get(name)
-      all.find { |bot| bot.username == name }
+    # Fetches a bot by username
+    # @param username [String]
+    # @return [Ebooks::Bot]
+    def self.get(username)
+      all.find { |bot| bot.username == username }
     end

+    # Logs info to stdout in the context of this bot
     def log(*args)
       STDOUT.print "@#{@username}: " + args.map(&:to_s).join(' ') + "\n"
       STDOUT.flush
     end

-    def initialize(*args, &b)
-      @username ||= nil
+    # Initializes and configures bot
+    # @param args Arguments passed to configure method
+    # @param b Block to call with new bot
+    def initialize(username, &b)
       @blacklist ||= []
-      @delay_range ||= 0
-
-      @users ||= {}
+      @userinfo ||= {}
       @conversations ||= {}
-      configure(*args, &b)

       # Tweet ids we've already observed, to avoid duplication
       @seen_tweets ||= {}

+      @username = username
+      configure(*args, &b)
+
       Bot.all << self
     end

+    # Find information we've collected about a user
+    # @param username [String]
+    # @return [Ebooks::UserInfo]
     def userinfo(username)
-      @users[username] ||= UserInfo.new(username)
+      @userinfo[username] ||= UserInfo.new(username)
     end

-    # Grab or create the conversation context for this tweet
+    # Find or create the conversation context for this tweet
+    # @param tweet [Twitter::Tweet]
+    # @return [Ebooks::Conversation]
     def conversation(tweet)
       conv = if tweet.in_reply_to_status_id?
         @conversations[tweet.in_reply_to_status_id]
@@ -175,6 +206,7 @@ module Ebooks
       conv
     end

+    # @return [Twitter::REST::Client] underlying REST client from twitter gem
     def twitter
       @twitter ||= Twitter::REST::Client.new do |config|
         config.consumer_key = @consumer_key
@@ -184,6 +216,7 @@ module Ebooks
       end
     end

+    # @return [Twitter::Streaming::Client] underlying streaming client from twitter gem
     def stream
       @stream ||= Twitter::Streaming::Client.new do |config|
         config.consumer_key = @consumer_key
@@ -194,11 +227,14 @@ module Ebooks
     end

     # Calculate some meta information about a tweet relevant for replying
+    # @param ev [Twitter::Tweet]
+    # @return [Ebooks::TweetMeta]
     def calc_meta(ev)
       TweetMeta.new(self, ev)
     end

     # Receive an event from the twitter stream
+    # @param ev [Object] Twitter streaming event
     def receive_event(ev)
       if ev.is_a? Array # Initial array sent on first connection
         log "Online!"
@@ -250,14 +286,7 @@ module Ebooks
       end
     end

-    def start_stream
-      log "starting tweet stream"
-
-      stream.user do |ev|
-        receive_event ev
-      end
-    end
-
+    # Configures client and fires startup event
     def prepare
       # Sanity check
       if @username.nil?
@@ -268,12 +297,18 @@ module Ebooks
       fire(:startup)
     end

-    # Connects to tweetstream and opens event handlers for this bot
+    # Start running user event stream
     def start
-      start_stream
+      log "starting tweet stream"
+
+      stream.user do |ev|
+        receive_event ev
+      end
     end

     # Fire an event
+    # @param event [Symbol] event to fire
+    # @param args arguments for event handler
     def fire(event, *args)
       handler = "on_#{event}".to_sym
       if respond_to? handler
@@ -281,11 +316,17 @@ module Ebooks
       end
     end

-    def delay(&b)
-      time = @delay.to_a.sample unless @delay.is_a? Integer
+    # Delay an action for a variable period of time
+    # @param range [Range, Integer] range of seconds to choose for delay
+    def delay(range=@delay_range, &b)
+      time = range.to_a.sample unless range.is_a? Integer
       sleep time
+      b.call
     end

+    # Check if a username is blacklisted
+    # @param username [String]
+    # @return [Boolean]
     def blacklisted?(username)
       if @blacklist.include?(username)
         true
@@ -295,6 +336,9 @@ module Ebooks
     end

     # Reply to a tweet or a DM.
+    # @param ev [Twitter::Tweet, Twitter::DirectMessage]
+    # @param text [String] contents of reply excluding reply_prefix
+    # @param opts [Hash] additional params to pass to twitter gem
     def reply(ev, text, opts={})
       opts = opts.clone

@@ -306,26 +350,28 @@ module Ebooks

       if conversation(ev).is_bot?(ev.user.screen_name)
         log "Not replying to suspected bot @#{ev.user.screen_name}"
-        return
+        return false
       end

       if !meta.mentions_bot?
         if !userinfo(ev.user.screen_name).can_pester?
           log "Not replying: leaving @#{ev.user.screen_name} alone"
-          return
+          return false
         end
       end

       log "Replying to @#{ev.user.screen_name} with: #{meta.reply_prefix + text}"
       tweet = twitter.update(meta.reply_prefix + text, in_reply_to_status_id: ev.id)
       conversation(tweet).add(tweet)
+      tweet
     else
       raise Exception("Don't know how to reply to a #{ev.class}")
     end
   end

+    # Favorite a tweet
+    # @param tweet [Twitter::Tweet]
     def favorite(tweet)
-      return if blacklisted?(tweet.user.screen_name)
       log "Favoriting @#{tweet.user.screen_name}: #{tweet.text}"

       begin
@@ -335,6 +381,8 @@ module Ebooks
       end
     end

+    # Retweet a tweet
+    # @param tweet [Twitter::Tweet]
     def retweet(tweet)
       log "Retweeting @#{tweet.user.screen_name}: #{tweet.text}"

@@ -345,26 +393,36 @@ module Ebooks
       end
     end

-    def follow(*args)
-      log "Following #{args}"
-      twitter.follow(*args)
+    # Follow a user
+    # @param user [String] username or user id
+    def follow(user, *args)
+      log "Following #{user}"
+      twitter.follow(user, *args)
     end

-    def unfollow(*args)
-      log "Unfollowing #{args}"
-      twiter.unfollow(*args)
+    # Unfollow a user
+    # @param user [String] username or user id
+    def unfollow(user, *args)
+      log "Unfollowing #{user}"
+      twitter.unfollow(user, *args)
     end

-    def tweet(*args)
-      log "Tweeting #{args.inspect}"
-      twitter.update(*args)
+    # Tweet something
+    # @param text [String]
+    def tweet(text, *args)
+      log "Tweeting '#{text}'"
+      twitter.update(text, *args)
     end

+    # Get a scheduler for this bot
+    # @return [Rufus::Scheduler]
     def scheduler
       @scheduler ||= Rufus::Scheduler.new
     end

-    # could easily just be *args however the separation keeps it clean.
+    # Tweet some text with an image
+    # @param txt [String]
+    # @param pic [String] filename
     def pictweet(txt, pic, *args)
       log "Tweeting #{txt.inspect} - #{pic} #{args}"
       twitter.update_with_media(txt, File.new(pic), *args)
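The Bot API above now takes the username as a constructor argument and exposes each credential as a documented accessor. Here is a minimal sketch of what a bot definition looks like against that API; the class name, credential placeholders, and the on_mention handler body are illustrative assumptions, not code from this commit:

require 'twitter_ebooks'

class MyBot < Ebooks::Bot
  # initialize calls configure, as shown in the diff above
  def configure
    self.consumer_key = 'CONSUMER_KEY'          # placeholder
    self.consumer_secret = 'CONSUMER_SECRET'    # placeholder
    self.access_token = 'ACCESS_TOKEN'          # placeholder
    self.access_token_secret = 'ACCESS_SECRET'  # placeholder
    self.blacklist = ['some_spambot']
  end

  def on_mention(tweet)
    # delay now takes an explicit range argument (see def delay above)
    delay(2..10) do
      reply(tweet, 'hello!')
    end
  end
end

bot = MyBot.new('my_bot_account')
bot.prepare # sanity-checks config and fires :startup
bot.start   # opens the user stream and dispatches events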
lib/twitter_ebooks/markov.rb (deleted)

@@ -1,82 +0,0 @@
-module Ebooks
-  # Special INTERIM token represents sentence boundaries
-  # This is so we can include start and end of statements in model
-  # Due to the way the sentence tokenizer works, can correspond
-  # to multiple actual parts of text (such as ^, $, \n and .?!)
-  INTERIM = :interim
-
-  # This is an ngram-based Markov model optimized to build from a
-  # tokenized sentence list without requiring too much transformation
-  class MarkovModel
-    def self.build(sentences)
-      MarkovModel.new.consume(sentences)
-    end
-
-    def consume(sentences)
-      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
-      # We map by both bigrams and unigrams so we can fall back to the latter in
-      # cases where an input bigram is unavailable, such as starting a sentence
-      @sentences = sentences
-      @unigrams = {}
-      @bigrams = {}
-
-      sentences.each_with_index do |tokens, i|
-        last_token = INTERIM
-        tokens.each_with_index do |token, j|
-          @unigrams[last_token] ||= []
-          @unigrams[last_token] << [i, j]
-
-          @bigrams[last_token] ||= {}
-          @bigrams[last_token][token] ||= []
-
-          if j == tokens.length-1 # Mark sentence endings
-            @unigrams[token] ||= []
-            @unigrams[token] << INTERIM
-            @bigrams[last_token][token] << INTERIM
-          else
-            @bigrams[last_token][token] << [i, j+1]
-          end
-
-          last_token = token
-        end
-      end
-
-      self
-    end
-
-    def find_token(index)
-      if index == INTERIM
-        INTERIM
-      else
-        @sentences[index[0]][index[1]]
-      end
-    end
-
-    def chain(tokens)
-      if tokens.length == 1
-        matches = @unigrams[tokens[-1]]
-      else
-        matches = @bigrams[tokens[-2]][tokens[-1]]
-        matches = @unigrams[tokens[-1]] if matches.length < 2
-      end
-
-      if matches.empty?
-        # This should never happen unless a strange token is
-        # supplied from outside the dataset
-        raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
-      end
-
-      next_token = find_token(matches.sample)
-
-      if next_token == INTERIM # We chose to end the sentence
-        return tokens
-      else
-        return chain(tokens + [next_token])
-      end
-    end
-
-    def generate
-      NLP.reconstruct(chain([INTERIM]))
-    end
-  end
-end
lib/twitter_ebooks/model.rb

@@ -8,16 +8,41 @@ require 'csv'

 module Ebooks
   class Model
-    attr_accessor :hash, :tokens, :sentences, :mentions, :keywords
+    # @return [Array<String>]
+    # An array of unique tokens. This is the main source of actual strings
+    # in the model. Manipulation of a token is done using its index
+    # in this array, which we call a "tiki"
+    attr_accessor :tokens

-    def self.consume(txtpath)
-      Model.new.consume(txtpath)
+    # @return [Array<Array<Integer>>]
+    # Sentences represented by arrays of tikis
+    attr_accessor :sentences
+
+    # @return [Array<Array<Integer>>]
+    # Sentences derived from Twitter mentions
+    attr_accessor :mentions
+
+    # @return [Array<String>]
+    # The top 200 most important keywords, in descending order
+    attr_accessor :keywords
+
+    # Generate a new model from a corpus file
+    # @param path [String]
+    # @return [Ebooks::Model]
+    def self.consume(path)
+      Model.new.consume(path)
     end

+    # Generate a new model from multiple corpus files
+    # @param paths [Array<String>]
+    # @return [Ebooks::Model]
     def self.consume_all(paths)
       Model.new.consume_all(paths)
     end

+    # Load a saved model
+    # @param path [String]
+    # @return [Ebooks::Model]
     def self.load(path)
       model = Model.new
       model.instance_eval do
@@ -30,6 +55,8 @@ module Ebooks
       model
     end

+    # Save model to a file
+    # @param path [String]
     def save(path)
       File.open(path, 'wb') do |f|
         f.write(Marshal.dump({
@@ -43,19 +70,22 @@ module Ebooks
     end

     def initialize
-      # This is the only source of actual strings in the model. It is
-      # an array of unique tokens. Manipulation of a token is mostly done
-      # using its index in this array, which we call a "tiki"
       @tokens = []

       # Reverse lookup tiki by token, for faster generation
       @tikis = {}
     end

+    # Reverse lookup a token index from a token
+    # @param token [String]
+    # @return [Integer]
     def tikify(token)
       @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
     end

+    # Convert a body of text into arrays of tikis
+    # @param text [String]
+    # @return [Array<Array<Integer>>]
     def mass_tikify(text)
       sentences = NLP.sentences(text)

@@ -69,9 +99,10 @@ module Ebooks
       end
     end

+    # Consume a corpus into this model
+    # @param path [String]
     def consume(path)
       content = File.read(path, :encoding => 'utf-8')
-      @hash = Digest::MD5.hexdigest(content)

       if path.split('.')[-1] == "json"
         log "Reading json corpus from #{path}"
@@ -94,6 +125,8 @@ module Ebooks
       consume_lines(lines)
     end

+    # Consume a sequence of lines
+    # @param lines [Array<String>]
     def consume_lines(lines)
       log "Removing commented lines and sorting mentions"

@@ -126,11 +159,12 @@ module Ebooks
       self
     end

+    # Consume multiple corpuses into this model
+    # @param paths [Array<String>]
     def consume_all(paths)
       lines = []
       paths.each do |path|
         content = File.read(path, :encoding => 'utf-8')
-        @hash = Digest::MD5.hexdigest(content)

         if path.split('.')[-1] == "json"
           log "Reading json corpus from #{path}"
@@ -156,25 +190,26 @@ module Ebooks
       consume_lines(lines)
     end

-    def fix(tweet)
-      # This seems to require an external api call
-      #begin
-      #  fixer = NLP.gingerice.parse(tweet)
-      #  log fixer if fixer['corrections']
-      #  tweet = fixer['result']
-      #rescue Exception => e
-      #  log e.message
-      #  log e.backtrace
-      #end
-
-      NLP.htmlentities.decode tweet
+    # Correct encoding issues in generated text
+    # @param text [String]
+    # @return [String]
+    def fix(text)
+      NLP.htmlentities.decode text
     end

+    # Check if an array of tikis comprises a valid tweet
+    # @param tikis [Array<Integer>]
+    # @param limit Integer how many chars we have left
     def valid_tweet?(tikis, limit)
       tweet = NLP.reconstruct(tikis, @tokens)
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end

+    # Generate some text
+    # @param limit [Integer] available characters
+    # @param generator [SuffixGenerator, nil]
+    # @param retry_limit [Integer] how many times to retry on duplicates
+    # @return [String]
     def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
@@ -209,12 +244,17 @@ module Ebooks
     end

     # Test if a sentence has been copied verbatim from original
-    def verbatim?(tokens)
-      @sentences.include?(tokens) || @mentions.include?(tokens)
+    # @param tikis [Array<Integer>]
+    # @return [Boolean]
+    def verbatim?(tikis)
+      @sentences.include?(tikis) || @mentions.include?(tikis)
     end

-    # Finds all relevant tokenized sentences to given input by
+    # Finds relevant and slightly relevant tokenized sentences to input
     # comparing non-stopword token overlaps
+    # @param sentences [Array<Array<Integer>>]
+    # @param input [String]
+    # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
     def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []
@@ -235,6 +275,10 @@ module Ebooks

     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
+    # @param input [String]
+    # @param limit [Integer] characters available for response
+    # @param sentences [Array<Array<Integer>>]
+    # @return [String]
     def make_response(input, limit=140, sentences=@mentions)
       # Prefer mentions
       relevant, slightly_relevant = find_relevant(sentences, input)
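Taken together, the Model methods documented above compose into a simple consume/save/load/generate pipeline. A short usage sketch, with placeholder paths; everything else follows the method signatures shown in the diff:

require 'twitter_ebooks'

# Build a model from a corpus file (json tweet archive or plain text)
model = Ebooks::Model.consume('corpus/example.json')
model.save('model/example.model')

# Later, load it back and generate
model = Ebooks::Model.load('model/example.model')
puts model.make_statement(140)                               # free-running statement
puts model.make_response('what do you think of ruby?', 130)  # reply-style, prefers mention data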
lib/twitter_ebooks/nlp.rb

@@ -12,31 +12,35 @@ module Ebooks
     # Some of this stuff is pretty heavy and we don't necessarily need
     # to be using it all of the time

+    # Lazily loads an array of stopwords
+    # Stopwords are common English words that should often be ignored
+    # @return [Array<String>]
     def self.stopwords
       @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
     end

+    # Lazily loads an array of known English nouns
+    # @return [Array<String>]
     def self.nouns
       @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
     end

+    # Lazily loads an array of known English adjectives
+    # @return [Array<String>]
     def self.adjectives
       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
     end

-    # POS tagger
+    # Lazily load part-of-speech tagging library
+    # This can determine whether a word is being used as a noun/adjective/verb
+    # @return [EngTagger]
     def self.tagger
       require 'engtagger'
       @tagger ||= EngTagger.new
     end

-    # Gingerice text correction service
-    def self.gingerice
-      require 'gingerice'
-      Gingerice::Parser.new # No caching for this one
-    end
-
-    # For decoding html entities
+    # Lazily load HTML entity decoder
+    # @return [HTMLEntities]
     def self.htmlentities
       require 'htmlentities'
       @htmlentities ||= HTMLEntities.new
@@ -44,7 +48,9 @@ module Ebooks

     ### Utility functions

-    # We don't really want to deal with all this weird unicode punctuation
+    # Normalize some strange unicode punctuation variants
+    # @param text [String]
+    # @return [String]
     def self.normalize(text)
       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
     end
@@ -53,6 +59,8 @@ module Ebooks
     # We use ad hoc approach because fancy libraries do not deal
     # especially well with tweet formatting, and we can fake solving
     # the quote problem during generation
+    # @param text [String]
+    # @return [Array<String>]
     def self.sentences(text)
       text.split(/\n+|(?<=[.?!])\s+/)
     end
@@ -60,15 +68,23 @@ module Ebooks
     # Split a sentence into word-level tokens
     # As above, this is ad hoc because tokenization libraries
     # do not behave well wrt. things like emoticons and timestamps
+    # @param sentence [String]
+    # @return [Array<String>]
     def self.tokenize(sentence)
       regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
       sentence.split(regex)
     end

+    # Get the 'stem' form of a word e.g. 'cats' -> 'cat'
+    # @param word [String]
+    # @return [String]
     def self.stem(word)
       Stemmer::stem_word(word.downcase)
     end

+    # Use highscore gem to find interesting keywords in a corpus
+    # @param text [String]
+    # @return [Highscore::Keywords]
     def self.keywords(text)
       # Preprocess to remove stopwords (highscore's blacklist is v. slow)
       text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
@@ -90,7 +106,10 @@ module Ebooks
       text.keywords
     end

-    # Takes a list of tokens and builds a nice-looking sentence
+    # Builds a proper sentence from a list of tikis
+    # @param tikis [Array<Integer>]
+    # @param tokens [Array<String>]
+    # @return [String]
     def self.reconstruct(tikis, tokens)
       text = ""
       last_token = nil
@@ -105,6 +124,9 @@ module Ebooks
     end

     # Determine if we need to insert a space between two tokens
+    # @param token1 [String]
+    # @param token2 [String]
+    # @return [Boolean]
     def self.space_between?(token1, token2)
       p1 = self.punctuation?(token1)
       p2 = self.punctuation?(token2)
@@ -119,10 +141,16 @@ module Ebooks
       end
     end

+    # Is this token comprised of punctuation?
+    # @param token [String]
+    # @return [Boolean]
     def self.punctuation?(token)
       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
     end

+    # Is this token a stopword?
+    # @param token [String]
+    # @return [Boolean]
     def self.stopword?(token)
       @stopword_set ||= stopwords.map(&:downcase).to_set
       @stopword_set.include?(token.downcase)
@@ -130,7 +158,9 @@ module Ebooks

     # Determine if a sample of text contains unmatched brackets or quotes
     # This is one of the more frequent and noticeable failure modes for
-    # the markov generator; we can just tell it to retry
+    # the generator; we can just tell it to retry
+    # @param text [String]
+    # @return [Boolean]
     def self.unmatched_enclosers?(text)
       enclosers = ['**', '""', '()', '[]', '``', "''"]
       enclosers.each do |pair|
@@ -153,10 +183,13 @@ module Ebooks
     end

     # Determine if a2 is a subsequence of a1
+    # @param a1 [Array]
+    # @param a2 [Array]
+    # @return [Boolean]
     def self.subseq?(a1, a2)
-      a1.each_index.find do |i|
+      !a1.each_index.find do |i|
         a1[i...i+a2.length] == a2
-      end
+      end.nil?
     end
   end
 end
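The subseq? change is a genuine bug fix: Enumerable#find returns the first matching element (here, an index) or nil, so the old version leaked an index where the method's name promises a Boolean. Negating .nil? on the result yields true/false. A quick illustration of the fixed behavior in plain Ruby:

a1 = [5, 6, 7, 8]
a2 = [6, 7]

found = a1.each_index.find { |i| a1[i...i + a2.length] == a2 }
found        # => 1, the matching index (truthy, but not a Boolean)
!found.nil?  # => true, what subseq? now returns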
lib/twitter_ebooks/suffix.rb

@@ -1,11 +1,14 @@
 # encoding: utf-8

 module Ebooks
-  # This generator uses data identical to the markov model, but
+  # This generator uses data identical to a markov model, but
   # instead of making a chain by looking up bigrams it uses the
   # positions to randomly replace suffixes in one sentence with
   # matching suffixes in another
   class SuffixGenerator
+    # Build a generator from a corpus of tikified sentences
+    # @param sentences [Array<Array<Integer>>]
+    # @return [SuffixGenerator]
     def self.build(sentences)
       SuffixGenerator.new(sentences)
     end
@@ -39,6 +42,11 @@ module Ebooks
       self
     end

+    # Generate a recombined sequence of tikis
+    # @param passes [Integer] number of times to recombine
+    # @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
+    # @return [Array<Integer>]
     def generate(passes=5, n=:unigrams)
       index = rand(@sentences.length)
       tikis = @sentences[index]
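SuffixGenerator is what remains of the generation pipeline now that MarkovModel is deleted: it reuses the same tikified-sentence data but recombines suffixes instead of chaining bigrams. A sketch of how it plugs into the classes above, assuming model is a loaded Ebooks::Model:

generator = Ebooks::SuffixGenerator.build(model.sentences)

tikis = generator.generate(5, :bigrams) # :bigrams recombines more conservatively than :unigrams
puts Ebooks::NLP.reconstruct(tikis, model.tokens)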
lib/twitter_ebooks/version.rb

@@ -1,3 +1,3 @@
 module Ebooks
-  VERSION = "2.3.2"
+  VERSION = "3.0.0"
 end
@@ -3,8 +3,6 @@ require 'memory_profiler'
 require 'tempfile'
 require 'timecop'

-def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end
-
 class TestBot < Ebooks::Bot
   attr_accessor :twitter

twitter_ebooks.gemspec

@@ -20,6 +20,7 @@ Gem::Specification.new do |gem|
   gem.add_development_dependency 'memory_profiler'
   gem.add_development_dependency 'timecop'
   gem.add_development_dependency 'pry-byebug'
+  gem.add_development_dependency 'yard'

   gem.add_runtime_dependency 'twitter', '~> 5.0'
   gem.add_runtime_dependency 'simple_oauth'