Github time!
commit e87dc5862b
27 changed files with 20178 additions and 0 deletions
20  lib/twitter_ebooks.rb  Normal file

@@ -0,0 +1,20 @@
gem 'minitest'

def log(*args)
  STDERR.puts args.map(&:to_s).join(' ')
  STDERR.flush
end

module Ebooks
  GEM_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
  DATA_PATH = File.join(GEM_PATH, 'data')
  SKELETON_PATH = File.join(GEM_PATH, 'skeleton')
  TEST_PATH = File.join(GEM_PATH, 'test')
  TEST_CORPUS_PATH = File.join(TEST_PATH, 'corpus/0xabad1dea.tweets')
end

require 'twitter_ebooks/nlp'
require 'twitter_ebooks/archiver'
require 'twitter_ebooks/markov'
require 'twitter_ebooks/model'
require 'twitter_ebooks/bot'
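As a quick illustration of what this entry point provides once the gem is loaded, here is a short use of the top-level log helper and the path constants; the printed paths are simply whatever the gem resolves at install time.

require 'twitter_ebooks'

log "Gem root:", Ebooks::GEM_PATH
log "Bundled data:", Ebooks::DATA_PATH
log "Test corpus:", Ebooks::TEST_CORPUS_PATH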
82  lib/twitter_ebooks/archiver.rb  Normal file

@@ -0,0 +1,82 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter'

module Ebooks
  class Archiver
    def initialize(username, outpath)
      @username = username
      @outpath = outpath
      @client = Twitter::Client.new
    end

    # Read existing corpus into memory.
    # Return list of tweet lines and the last tweet id.
    def read_corpus
      lines = []
      since_id = nil

      if File.exists?(@outpath)
        lines = File.read(@outpath).split("\n")
        if lines[0].start_with?('#')
          since_id = lines[0].split('# ').last
        end
      end

      [lines, since_id]
    end

    # Retrieve all available tweets for a given user since the last tweet id
    def tweets_since(since_id)
      page = 1
      retries = 0
      tweets = []
      max_id = nil

      opts = {
        count: 200,
        include_rts: false,
        trim_user: true
      }

      opts[:since_id] = since_id unless since_id.nil?

      loop do
        opts[:max_id] = max_id unless max_id.nil?
        new = @client.user_timeline(@username, opts)
        break if new.length <= 1
        puts "Received #{new.length} tweets"
        tweets += new
        max_id = new.last.id
      end

      tweets
    end

    def fetch_tweets
      lines, since_id = read_corpus

      if since_id.nil?
        puts "Retrieving tweets from @#{@username}"
      else
        puts "Retrieving tweets from @#{@username} since #{since_id}"
      end

      tweets = tweets_since(since_id)

      if tweets.length == 0
        puts "No new tweets"
        return
      end

      new_lines = tweets.map { |tweet| tweet.text.gsub("\n", " ") }
      new_since_id = tweets[0].id.to_s
      lines = ["# " + new_since_id] + new_lines + lines
      corpus = File.open(@outpath, 'w')
      corpus.write(lines.join("\n"))
      corpus.close
    end
  end
end
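A minimal usage sketch for the archiver. It assumes API credentials have already been set globally via Twitter.configure (the same mechanism Bot#configure uses below), and the handle and output path are made up for illustration.

require 'twitter_ebooks'

# Hypothetical account and output file
archiver = Ebooks::Archiver.new('0xabad1dea', 'corpus/0xabad1dea.tweets')

# Prepends any newly found tweets to the corpus file and records the
# newest tweet id on the first line for the next incremental run
archiver.fetch_tweets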
164  lib/twitter_ebooks/bot.rb  Normal file

@@ -0,0 +1,164 @@
#!/usr/bin/env ruby
require 'twitter'
require 'tweetstream'
require 'rufus/scheduler'

module Ebooks
  class Bot
    attr_accessor :consumer_key, :consumer_secret,
                  :oauth_token, :oauth_token_secret

    attr_accessor :username

    attr_reader :twitter, :stream

    @@all = [] # List of all defined bots
    def self.all; @@all; end

    def self.get(name)
      all.find { |bot| bot.username == name }
    end

    def initialize(username, &b)
      # Set defaults
      @username = username

      # Override with callback
      b.call(self)

      Bot.all.push(self)
    end

    def log(*args)
      STDERR.puts "@#{@username}: " + args.map(&:to_s).join(' ')
      STDERR.flush
    end

    def configure
      TweetStream.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end

      Twitter.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end

      @twitter = Twitter::Client.new
      @stream = TweetStream::Client.new
    end

    # Connects to tweetstream and opens event handlers for this bot
    def start
      configure

      @on_startup.call if @on_startup

      @stream.on_error do |msg|
        log "ERROR: #{msg}"
      end

      @stream.on_inited do
        log "Online!"
      end

      @stream.on_event(:follow) do |event|
        next if event[:source][:screen_name] == @username
        log "Followed by #{event[:source][:screen_name]}"
        @on_follow.call(event[:source])
      end

      @stream.on_direct_message do |dm|
        next if dm[:sender][:screen_name] == @username # Don't reply to self
        log "DM from @#{dm[:sender][:screen_name]}: #{dm[:text]}"
        @on_message.call(dm)
      end

      @stream.userstream do |ev|
        next unless ev[:text] # If it's not a text-containing tweet, ignore it
        next if ev[:user][:screen_name] == @username # Ignore our own tweets

        meta = {}
        mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }

        reply_mentions = mentions.reject { |m| m.downcase == @username.downcase }
        reply_mentions = [ev[:user][:screen_name]] + reply_mentions

        meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
        meta[:limit] = 140 - meta[:reply_prefix].length

        mless = ev[:text]
        begin
          ev.attrs[:entities][:user_mentions].reverse.each do |entity|
            mless = mless[0...entity[:indices][0]] + mless[entity[:indices][1]..-1]
          end
        rescue Exception
          p ev.attrs[:entities][:user_mentions]
          p ev[:text]
          raise
        end
        meta[:mentionless] = mless

        # To check if this is a mention, ensure:
        # - The tweet mentions list contains our username
        # - The tweet is not being retweeted by somebody else
        # - Or soft-retweeted by somebody else
        if mentions.map(&:downcase).include?(@username.downcase) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
          log "Mention from @#{ev[:user][:screen_name]}: #{ev[:text]}"
          @on_mention.call(ev, meta)
        else
          @on_timeline.call(ev, meta)
        end
      end
    end

    # Wrapper for EM.add_timer
    # Delays add a greater sense of humanity to bot behaviour
    def delay(time, &b)
      time = time.to_a.sample unless time.is_a? Integer
      EM.add_timer(time, &b)
    end

    # Reply to a tweet or a DM.
    # Applies configurable @reply_delay range
    def reply(ev, text, opts={})
      opts = opts.clone
      delay = @reply_delay.to_a.sample

      if ev.is_a? Twitter::DirectMessage
        log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
        @twitter.direct_message_create(ev[:sender][:screen_name], text, opts)
      elsif ev.is_a? Twitter::Tweet
        log "Replying to @#{ev[:user][:screen_name]} with: #{text}"
        @twitter.update(text, in_reply_to_status_id: ev[:id])
      else
        raise Exception.new("Don't know how to reply to a #{ev.class}")
      end
    end

    def scheduler
      @scheduler ||= Rufus::Scheduler.new
    end

    def follow(*args)
      log "Following #{args}"
      @twitter.follow(*args)
    end

    def tweet(*args)
      log "Tweeting #{args.inspect}"
      @twitter.update(*args)
    end

    def on_startup(&b); @on_startup = b; end
    def on_follow(&b); @on_follow = b; end
    def on_mention(&b); @on_mention = b; end
    def on_timeline(&b); @on_timeline = b; end
    def on_message(&b); @on_message = b; end
  end
end
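For orientation, a sketch of how a bot might be defined against this class. The handle and credentials are placeholders, handlers are defined for every event the stream can deliver (the class calls them unconditionally), and starting bots directly like this is only one option; whatever runner script the bot project uses is not shown here.

require 'twitter_ebooks'

Ebooks::Bot.new("example_ebooks") do |bot|
  # Placeholder credentials from a Twitter app registration
  bot.consumer_key = "..."
  bot.consumer_secret = "..."
  bot.oauth_token = "..."
  bot.oauth_token_secret = "..."

  bot.on_follow do |user|
    bot.follow(user[:screen_name])  # follow back
  end

  bot.on_mention do |tweet, meta|
    bot.reply(tweet, meta[:reply_prefix] + "hello!")
  end

  bot.on_timeline do |tweet, meta|
    # Timeline tweets arrive here; this sketch ignores them
  end

  bot.on_message do |dm|
    bot.reply(dm, "thanks for the DM")
  end
end

Ebooks::Bot.all.each(&:start)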
81  lib/twitter_ebooks/markov.rb  Normal file

@@ -0,0 +1,81 @@
module Ebooks
  # Special INTERIM token represents sentence boundaries
  # This is so we can include start and end of statements in model
  # Due to the way the sentence tokenizer works, it can correspond
  # to multiple actual parts of text (such as ^, $, \n and .?!)
  INTERIM = :interim

  # This is an ngram-based Markov model optimized to build from a
  # tokenized sentence list without requiring too much transformation
  class MarkovModel
    def self.build(sentences)
      MarkovModel.new.consume(sentences)
    end

    def consume(sentences)
      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
      # We map by both bigrams and unigrams so we can fall back to the latter in
      # cases where an input bigram is unavailable, such as starting a sentence
      @sentences = sentences
      @unigrams = {}
      @bigrams = {}

      sentences.each_with_index do |tokens, i|
        last_token = INTERIM
        tokens.each_with_index do |token, j|
          @unigrams[last_token] ||= []
          @unigrams[last_token] << [i, j]

          @bigrams[last_token] ||= {}
          @bigrams[last_token][token] ||= []

          if j == tokens.length-1 # Mark sentence endings
            @unigrams[token] ||= []
            @unigrams[token] << INTERIM
            @bigrams[last_token][token] << INTERIM
          else
            @bigrams[last_token][token] << [i, j+1]
          end

          last_token = token
        end
      end

      self
    end

    def find_token(index)
      if index == INTERIM
        INTERIM
      else
        @sentences[index[0]][index[1]]
      end
    end

    def chain(tokens)
      if tokens.length == 1
        matches = @unigrams[tokens[0]]
      else
        matches = @bigrams[tokens[-2]][tokens[-1]]
      end

      if matches.empty?
        # This should never happen unless a strange token is
        # supplied from outside the dataset
        raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
      end

      next_token = find_token(matches.sample)

      if next_token == INTERIM # We chose to end the sentence
        return tokens
      else
        return chain(tokens + [next_token])
      end
    end

    def generate
      NLP.reconstruct(chain([INTERIM]))
    end
  end
end
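A small self-contained sketch of the model, using hand-tokenized sentences of the shape MarkovModel expects (in practice these come from NLP.tokenize); the corpus is invented.

require 'twitter_ebooks'

sentences = [
  %w[the bot writes tweets],
  %w[the bot writes nonsense],
  %w[nonsense is fine]
]

model = Ebooks::MarkovModel.build(sentences)
puts model.generate  # e.g. "the bot writes nonsense"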
120  lib/twitter_ebooks/model.rb  Normal file

@@ -0,0 +1,120 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'json'
require 'set'
require 'digest/md5'

module Ebooks
  class Model
    attr_accessor :hash, :sentences, :markov, :keywords

    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end

    def self.load(path)
      Marshal.load(File.read(path))
    end

    def consume(txtpath)
      # Record hash of source file so we know to update later
      @hash = Digest::MD5.hexdigest(File.read(txtpath))

      text = File.read(txtpath)
      log "Removing commented lines and mention tokens"

      lines = text.split("\n")
      keeping = []
      lines.each do |l|
        next if l.start_with?('#') || l.include?('RT')
        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
        keeping << processed.join(' ')
      end
      text = NLP.normalize(keeping.join("\n"))

      log "Segmenting text into sentences"

      sentences = NLP.sentences(text)

      log "Tokenizing #{sentences.length} sentences"
      @sentences = sentences.map { |sent| NLP.tokenize(sent) }

      log "Ranking keywords"
      @keywords = NLP.keywords(@sentences)

      self
    end

    def save(path)
      File.open(path, 'w') do |f|
        f.write(Marshal.dump(self))
      end
      self
    end

    def fix(tweet)
      # This seems to require an external api call
      #begin
      #  fixer = NLP.gingerice.parse(tweet)
      #  log fixer if fixer['corrections']
      #  tweet = fixer['result']
      #rescue Exception => e
      #  log e.message
      #  log e.backtrace
      #end

      NLP.htmlentities.decode tweet
    end

    def markov_statement(limit=140, markov=nil)
      markov ||= MarkovModel.build(@sentences)
      tweet = ""

      while (tweet = markov.generate) do
        next if tweet.length > limit
        next if NLP.unmatched_enclosers?(tweet)
        break if tweet.length > limit*0.4 || rand > 0.8
      end

      fix tweet
    end

    # Finds all tokenized sentences relevant to the given input by
    # comparing non-stopword token overlaps
    def relevant_sentences(input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input)

      @sentences.each do |sent|
        tokenized.each do |token|
          if sent.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller markov model from these
    def markov_response(input, limit=140)
      # First try
      relevant, slightly_relevant = relevant_sentences(input)

      if relevant.length >= 3
        markov = MarkovModel.new.consume(relevant)
        markov_statement(limit, markov)
      elsif slightly_relevant.length > 5
        markov = MarkovModel.new.consume(slightly_relevant)
        markov_statement(limit, markov)
      else
        markov_statement(limit)
      end
    end
  end
end
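An end-to-end sketch of the model lifecycle, assuming a plain-text corpus with one tweet per line such as Archiver produces; the file names are placeholders, and in practice this flow would be driven by whatever task or script the bot project uses.

require 'twitter_ebooks'

model = Ebooks::Model.consume('corpus/example.tweets')  # parse and tokenize the corpus
model.save('model/example.model')                       # Marshal the processed model to disk

model = Ebooks::Model.load('model/example.model')
puts model.markov_statement(140)               # free-standing statement under 140 chars
puts model.markov_response("hello bot", 120)   # response seeded from related sentences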
154  lib/twitter_ebooks/nlp.rb  Normal file

@@ -0,0 +1,154 @@
# encoding: utf-8
require 'fast-stemmer'
require 'highscore'

module Ebooks
  module NLP
    # We deliberately limit our punctuation handling to stuff we can do consistently
    # It'll just be a part of another token if we don't split it out, and that's fine
    PUNCTUATION = ".?!,"

    # Lazy-load NLP libraries and resources
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time

    def self.stopwords
      @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
    end

    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    # POS tagger
    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

    # Gingerice text correction service
    def self.gingerice
      require 'gingerice'
      Gingerice::Parser.new # No caching for this one
    end

    # For decoding html entities
    def self.htmlentities
      require 'htmlentities'
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions

    # We don't really want to deal with all this weird unicode punctuation
    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end

    # Split text into sentences
    # We use an ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end

    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
      sentence.split(regex)
    end

    def self.stem(word)
      Stemmer::stem_word(word.downcase)
    end

    def self.keywords(sentences)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
      text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')

      text = Highscore::Content.new(text)

      text.configure do
        #set :multiplier, 2
        #set :upper_case, 3
        #set :long_words, 2
        #set :long_words_threshold, 15
        #set :vowels, 1 # => default: 0 = not considered
        #set :consonants, 5 # => default: 0 = not considered
        #set :ignore_case, true # => default: false
        set :word_pattern, /(?<!@)(?<=\s)[\w']+/ # => default: /\w+/
        #set :stemming, true # => default: false
      end

      text.keywords
    end

    # Takes a list of tokens and builds a nice-looking sentence
    def self.reconstruct(tokens)
      text = ""
      last_token = nil
      tokens.each do |token|
        next if token == INTERIM
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

    # Determine if we need to insert a space between two tokens
    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)
      if p1 && p2 # "foo?!"
        false
      elsif !p1 && p2 # "foo."
        false
      elsif p1 && !p2 # "foo. rah"
        true
      else # "foo rah"
        true
      end
    end

    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
    end

    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the markov generator; we can just tell it to retry
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0

        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)

          return true if opened < 0 # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end

      false
    end
  end
end
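A short round trip through the tokenizer and reconstructor on an invented sentence, showing how the PUNCTUATION constant and space_between? interact; the expected outputs shown in comments are illustrative.

require 'twitter_ebooks'

tokens = Ebooks::NLP.tokenize("well, that was fun! see you tomorrow?")
# => ["well", ",", "that", "was", "fun", "!", "see", "you", "tomorrow", "?"]

# reconstruct re-joins tokens, inserting spaces everywhere except before punctuation
Ebooks::NLP.reconstruct(tokens)
# => "well, that was fun! see you tomorrow?"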
3  lib/twitter_ebooks/version.rb  Normal file

@@ -0,0 +1,3 @@
module Ebooks
  VERSION = "2.0.7"
end