Github time!

Mispy 2013-11-08 06:02:05 +11:00
commit e87dc5862b
27 changed files with 20178 additions and 0 deletions

lib/twitter_ebooks.rb Normal file

@@ -0,0 +1,20 @@
gem 'minitest'
def log(*args)
STDERR.puts args.map(&:to_s).join(' ')
STDERR.flush
end
module Ebooks
GEM_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
DATA_PATH = File.join(GEM_PATH, 'data')
SKELETON_PATH = File.join(GEM_PATH, 'skeleton')
TEST_PATH = File.join(GEM_PATH, 'test')
TEST_CORPUS_PATH = File.join(TEST_PATH, 'corpus/0xabad1dea.tweets')
end
require 'twitter_ebooks/nlp'
require 'twitter_ebooks/archiver'
require 'twitter_ebooks/markov'
require 'twitter_ebooks/model'
require 'twitter_ebooks/bot'

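The entry point above only wires up shared paths and a stderr logger before requiring each component. A minimal sketch of loading the gem and using that logger (the path comes from the constants defined above):

require 'twitter_ebooks'

# log writes to stderr, so it stays out of any corpus piped through stdout
log "Test corpus lives at", Ebooks::TEST_CORPUS_PATH
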
lib/twitter_ebooks/archiver.rb Normal file

@@ -0,0 +1,82 @@
#!/usr/bin/env ruby
# encoding: utf-8
require 'twitter'
module Ebooks
class Archiver
def initialize(username, outpath)
@username = username
@outpath = outpath
@client = Twitter::Client.new
end
# Read the existing corpus into memory.
# Return list of tweet lines and the last tweet id.
def read_corpus
lines = []
since_id = nil
if File.exists?(@outpath)
lines = File.read(@outpath).split("\n")
if lines[0].start_with?('#')
since_id = lines[0].split('# ').last
end
end
[lines, since_id]
end
# Retrieve all available tweets for a given user since the last tweet id
def tweets_since(since_id)
page = 1
retries = 0
tweets = []
max_id = nil
opts = {
count: 200,
include_rts: false,
trim_user: true
}
opts[:since_id] = since_id unless since_id.nil?
loop do
opts[:max_id] = max_id unless max_id.nil?
new = @client.user_timeline(@username, opts)
break if new.length <= 1
puts "Received #{new.length} tweets"
tweets += new
max_id = new.last.id
end
tweets
end
def fetch_tweets
lines, since_id = read_corpus
if since_id.nil?
puts "Retrieving tweets from @#{@username}"
else
puts "Retrieving tweets from @#{@username} since #{since_id}"
end
tweets = tweets_since(since_id)
if tweets.length == 0
puts "No new tweets"
return
end
new_lines = tweets.map { |tweet| tweet.text.gsub("\n", " ") }
new_since_id = tweets[0].id.to_s
lines = ["# " + new_since_id] + new_lines + lines
corpus = File.open(@outpath, 'w')
corpus.write(lines.join("\n"))
corpus.close
end
end
end

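A minimal usage sketch for the Archiver above, assuming this era's twitter gem is configured globally with credentials first (as Bot#configure does below); the handle, keys and output path are hypothetical:

require 'twitter_ebooks'

Twitter.configure do |config|
  config.consumer_key = 'hypothetical-key'
  config.consumer_secret = 'hypothetical-secret'
  config.oauth_token = 'hypothetical-token'
  config.oauth_token_secret = 'hypothetical-token-secret'
end

# Prepends any new tweets to corpus/example.tweets, newest first,
# recording the most recent tweet id on the leading "# <id>" line
Ebooks::Archiver.new('example', 'corpus/example.tweets').fetch_tweets
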
lib/twitter_ebooks/bot.rb Normal file

@@ -0,0 +1,164 @@
#!/usr/bin/env ruby
require 'twitter'
require 'tweetstream'
require 'rufus/scheduler'
module Ebooks
class Bot
attr_accessor :consumer_key, :consumer_secret,
:oauth_token, :oauth_token_secret
attr_accessor :username
attr_reader :twitter, :stream
@@all = [] # List of all defined bots
def self.all; @@all; end
def self.get(name)
all.find { |bot| bot.username == name }
end
def initialize(username, &b)
# Set defaults
@username = username
# Override with callback
b.call(self)
Bot.all.push(self)
end
def log(*args)
STDERR.puts "@#{@username}: " + args.map(&:to_s).join(' ')
STDERR.flush
end
def configure
TweetStream.configure do |config|
config.consumer_key = @consumer_key
config.consumer_secret = @consumer_secret
config.oauth_token = @oauth_token
config.oauth_token_secret = @oauth_token_secret
end
Twitter.configure do |config|
config.consumer_key = @consumer_key
config.consumer_secret = @consumer_secret
config.oauth_token = @oauth_token
config.oauth_token_secret = @oauth_token_secret
end
@twitter = Twitter::Client.new
@stream = TweetStream::Client.new
end
# Connects to tweetstream and opens event handlers for this bot
def start
configure
@on_startup.call if @on_startup
@stream.on_error do |msg|
log "ERROR: #{msg}"
end
@stream.on_inited do
log "Online!"
end
@stream.on_event(:follow) do |event|
next if event[:source][:screen_name] == @username
log "Followed by #{event[:source][:screen_name]}"
@on_follow.call(event[:source])
end
@stream.on_direct_message do |dm|
next if dm[:sender][:screen_name] == @username # Don't reply to self
log "DM from @#{dm[:sender][:screen_name]}: #{dm[:text]}"
@on_message.call(dm)
end
@stream.userstream do |ev|
next unless ev[:text] # If it's not a text-containing tweet, ignore it
next if ev[:user][:screen_name] == @username # Ignore our own tweets
meta = {}
mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }
reply_mentions = mentions.reject { |m| m.downcase == @username.downcase }
reply_mentions = [ev[:user][:screen_name]] + reply_mentions
meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
meta[:limit] = 140 - meta[:reply_prefix].length
mless = ev[:text]
begin
ev.attrs[:entities][:user_mentions].reverse.each do |entity|
mless = mless[0...entity[:indices][0]] + mless[entity[:indices][1]..-1]
end
rescue Exception
p ev.attrs[:entities][:user_mentions]
p ev[:text]
raise
end
meta[:mentionless] = mless
# To check if this is a mention, ensure:
# - The tweet mentions list contains our username
# - The tweet is not being retweeted by somebody else
# - Or soft-retweeted by somebody else
if mentions.map(&:downcase).include?(@username.downcase) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
log "Mention from @#{ev[:user][:screen_name]}: #{ev[:text]}"
@on_mention.call(ev, meta)
else
@on_timeline.call(ev, meta)
end
end
end
# Wrapper for EM.add_timer
# Delays add a greater sense of humanity to bot behaviour
def delay(time, &b)
time = time.to_a.sample unless time.is_a? Integer
EM.add_timer(time, &b)
end
# Reply to a tweet or a DM.
# Applies configurable @reply_delay range
def reply(ev, text, opts={})
opts = opts.clone
delay = @reply_delay.to_a.sample
if ev.is_a? Twitter::DirectMessage
log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
@twitter.direct_message_create(ev[:sender][:screen_name], text, opts)
elsif ev.is_a? Twitter::Tweet
log "Replying to @#{ev[:user][:screen_name]} with: #{text}"
@twitter.update(text, in_reply_to_status_id: ev[:id])
else
raise Exception.new("Don't know how to reply to a #{ev.class}")
end
end
def scheduler
@scheduler ||= Rufus::Scheduler.new
end
def follow(*args)
log "Following #{args}"
@twitter.follow(*args)
end
def tweet(*args)
log "Tweeting #{args.inspect}"
@twitter.update(*args)
end
def on_startup(&b); @on_startup = b; end
def on_follow(&b); @on_follow = b; end
def on_mention(&b); @on_mention = b; end
def on_timeline(&b); @on_timeline = b; end
def on_message(&b); @on_message = b; end
end
end

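A minimal sketch of how a bot built on this class might be declared and run; the handle, credentials and replies are hypothetical, and the final EM.run block is one way to drive Bot#start, since the userstream connection runs on EventMachine:

require 'twitter_ebooks'

Ebooks::Bot.new('example_ebooks') do |bot|
  bot.consumer_key = 'hypothetical-key'
  bot.consumer_secret = 'hypothetical-secret'
  bot.oauth_token = 'hypothetical-token'
  bot.oauth_token_secret = 'hypothetical-token-secret'

  bot.on_startup do
    bot.log "ready"
  end

  bot.on_mention do |tweet, meta|
    # Wait a few seconds so the reply feels less mechanical
    bot.delay(2..10) { bot.reply(tweet, meta[:reply_prefix] + "hello!") }
  end

  bot.on_message do |dm|
    bot.reply(dm, "hello!")
  end

  bot.on_timeline do |tweet, meta|
    # Ignore ambient timeline tweets in this sketch
  end

  bot.on_follow do |user|
    bot.follow(user[:screen_name])
  end
end

EM.run { Ebooks::Bot.all.each(&:start) }
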
lib/twitter_ebooks/markov.rb Normal file

@@ -0,0 +1,81 @@
module Ebooks
# Special INTERIM token represents sentence boundaries
# This is so we can include start and end of statements in model
# Due to the way the sentence tokenizer works, it can correspond
# to multiple actual parts of text (such as ^, $, \n and .?!)
INTERIM = :interim
# This is an ngram-based Markov model optimized to build from a
# tokenized sentence list without requiring too much transformation
class MarkovModel
def self.build(sentences)
MarkovModel.new.consume(sentences)
end
def consume(sentences)
# These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
# We map by both bigrams and unigrams so we can fall back to the latter in
# cases where an input bigram is unavailable, such as starting a sentence
@sentences = sentences
@unigrams = {}
@bigrams = {}
sentences.each_with_index do |tokens, i|
last_token = INTERIM
tokens.each_with_index do |token, j|
@unigrams[last_token] ||= []
@unigrams[last_token] << [i, j]
@bigrams[last_token] ||= {}
@bigrams[last_token][token] ||= []
if j == tokens.length-1 # Mark sentence endings
@unigrams[token] ||= []
@unigrams[token] << INTERIM
@bigrams[last_token][token] << INTERIM
else
@bigrams[last_token][token] << [i, j+1]
end
last_token = token
end
end
self
end
def find_token(index)
if index == INTERIM
INTERIM
else
@sentences[index[0]][index[1]]
end
end
def chain(tokens)
if tokens.length == 1
matches = @unigrams[tokens[0]]
else
matches = @bigrams[tokens[-2]][tokens[-1]]
end
if matches.empty?
# This should never happen unless a strange token is
# supplied from outside the dataset
raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
end
next_token = find_token(matches.sample)
if next_token == INTERIM # We chose to end the sentence
return tokens
else
return chain(tokens + [next_token])
end
end
def generate
NLP.reconstruct(chain([INTERIM]))
end
end
end

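A small sketch of the Markov model in isolation, using hand-tokenized sentences in place of NLP.tokenize output; generate relies on NLP.reconstruct to glue the chosen tokens back together:

require 'twitter_ebooks'

sentences = [
  %w(the cat sat on the mat .),
  %w(the dog sat on the rug .)
]

model = Ebooks::MarkovModel.build(sentences)
# Recombines bigrams from both sentences, e.g. "the cat sat on the rug."
puts model.generate
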
lib/twitter_ebooks/model.rb Normal file

@@ -0,0 +1,120 @@
#!/usr/bin/env ruby
# encoding: utf-8
require 'json'
require 'set'
require 'digest/md5'
module Ebooks
class Model
attr_accessor :hash, :sentences, :markov, :keywords
def self.consume(txtpath)
Model.new.consume(txtpath)
end
def self.load(path)
Marshal.load(File.read(path))
end
def consume(txtpath)
# Record hash of source file so we know to update later
@hash = Digest::MD5.hexdigest(File.read(txtpath))
text = File.read(txtpath)
log "Removing commented lines and mention tokens"
lines = text.split("\n")
keeping = []
lines.each do |l|
next if l.start_with?('#') || l.include?('RT')
processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
keeping << processed.join(' ')
end
text = NLP.normalize(keeping.join("\n"))
log "Segmenting text into sentences"
sentences = NLP.sentences(text)
log "Tokenizing #{sentences.length} sentences"
@sentences = sentences.map { |sent| NLP.tokenize(sent) }
log "Ranking keywords"
@keywords = NLP.keywords(@sentences)
self
end
def save(path)
File.open(path, 'w') do |f|
f.write(Marshal.dump(self))
end
self
end
def fix(tweet)
# This seems to require an external api call
#begin
# fixer = NLP.gingerice.parse(tweet)
# log fixer if fixer['corrections']
# tweet = fixer['result']
#rescue Exception => e
# log e.message
# log e.backtrace
#end
NLP.htmlentities.decode tweet
end
def markov_statement(limit=140, markov=nil)
markov ||= MarkovModel.build(@sentences)
tweet = ""
while (tweet = markov.generate) do
next if tweet.length > limit
next if NLP.unmatched_enclosers?(tweet)
break if tweet.length > limit*0.4 || rand > 0.8
end
fix tweet
end
# Finds all relevant tokenized sentences to given input by
# comparing non-stopword token overlaps
def relevant_sentences(input)
relevant = []
slightly_relevant = []
tokenized = NLP.tokenize(input)
@sentences.each do |sent|
tokenized.each do |token|
if sent.include?(token)
relevant << sent unless NLP.stopword?(token)
slightly_relevant << sent
end
end
end
[relevant, slightly_relevant]
end
# Generates a response by looking for related sentences
# in the corpus and building a smaller markov model from these
def markov_response(input, limit=140)
# First try
relevant, slightly_relevant = relevant_sentences(input)
if relevant.length >= 3
markov = MarkovModel.new.consume(relevant)
markov_statement(limit, markov)
elsif slightly_relevant.length > 5
markov = MarkovModel.new.consume(slightly_relevant)
markov_statement(limit, markov)
else
markov_statement(limit)
end
end
end
end

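A minimal sketch of the consume/save/load cycle, assuming a line-per-tweet corpus of the kind the Archiver writes; the .model output path is just an illustrative choice:

require 'twitter_ebooks'

model = Ebooks::Model.consume('corpus/example.tweets')
model.save('model/example.model')

model = Ebooks::Model.load('model/example.model')
puts model.markov_statement(140)
puts model.markov_response('what do you think about cats?', 140)
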
lib/twitter_ebooks/nlp.rb Normal file

@@ -0,0 +1,154 @@
# encoding: utf-8
require 'fast-stemmer'
require 'highscore'
module Ebooks
module NLP
# We deliberately limit our punctuation handling to stuff we can do consistently
# It'll just be a part of another token if we don't split it out, and that's fine
PUNCTUATION = ".?!,"
# Lazy-load NLP libraries and resources
# Some of this stuff is pretty heavy and we don't necessarily need
# to be using it all of the time
def self.stopwords
@stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
end
def self.nouns
@nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
end
def self.adjectives
@adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
end
# POS tagger
def self.tagger
require 'engtagger'
@tagger ||= EngTagger.new
end
# Gingerice text correction service
def self.gingerice
require 'gingerice'
Gingerice::Parser.new # No caching for this one
end
# For decoding html entities
def self.htmlentities
require 'htmlentities'
@htmlentities ||= HTMLEntities.new
end
### Utility functions
# We don't really want to deal with all this weird unicode punctuation
def self.normalize(text)
htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
end
# Split text into sentences
# We use an ad hoc approach because fancy libraries do not deal
# especially well with tweet formatting, and we can fake solving
# the quote problem during generation
def self.sentences(text)
text.split(/\n+|(?<=[.?!])\s+/)
end
# Split a sentence into word-level tokens
# As above, this is ad hoc because tokenization libraries
# do not behave well wrt. things like emoticons and timestamps
def self.tokenize(sentence)
regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
sentence.split(regex)
end
def self.stem(word)
Stemmer::stem_word(word.downcase)
end
def self.keywords(sentences)
# Preprocess to remove stopwords (highscore's blacklist is v. slow)
text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')
text = Highscore::Content.new(text)
text.configure do
#set :multiplier, 2
#set :upper_case, 3
#set :long_words, 2
#set :long_words_threshold, 15
#set :vowels, 1 # => default: 0 = not considered
#set :consonants, 5 # => default: 0 = not considered
#set :ignore_case, true # => default: false
set :word_pattern, /(?<!@)(?<=\s)[\w']+/ # => default: /\w+/
#set :stemming, true # => default: false
end
text.keywords
end
# Takes a list of tokens and builds a nice-looking sentence
def self.reconstruct(tokens)
text = ""
last_token = nil
tokens.each do |token|
next if token == INTERIM
text += ' ' if last_token && space_between?(last_token, token)
text += token
last_token = token
end
text
end
# Determine if we need to insert a space between two tokens
def self.space_between?(token1, token2)
p1 = self.punctuation?(token1)
p2 = self.punctuation?(token2)
if p1 && p2 # "foo?!"
false
elsif !p1 && p2 # "foo."
false
elsif p1 && !p2 # "foo. rah"
true
else # "foo rah"
true
end
end
def self.punctuation?(token)
(token.chars.to_set - PUNCTUATION.chars.to_set).empty?
end
def self.stopword?(token)
@stopword_set ||= stopwords.map(&:downcase).to_set
@stopword_set.include?(token.downcase)
end
# Determine if a sample of text contains unmatched brackets or quotes
# This is one of the more frequent and noticeable failure modes for
# the markov generator; we can just tell it to retry
def self.unmatched_enclosers?(text)
enclosers = ['**', '""', '()', '[]', '``', "''"]
enclosers.each do |pair|
starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')
opened = 0
tokenize(text).each do |token|
opened += 1 if token.match(starter)
opened -= 1 if token.match(ender)
return true if opened < 0 # Too many ends!
end
return true if opened != 0 # Mismatch somewhere.
end
false
end
end
end

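A short sketch of the tokenize/reconstruct round trip and the encloser check above; the sample strings are hypothetical:

require 'twitter_ebooks'

tokens = Ebooks::NLP.tokenize("Well, that was fun!")
# => ["Well", ",", "that", "was", "fun", "!"]

Ebooks::NLP.reconstruct(tokens)
# => "Well, that was fun!"

Ebooks::NLP.unmatched_enclosers?("an (unclosed aside")
# => true
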
lib/twitter_ebooks/version.rb Normal file

@@ -0,0 +1,3 @@
module Ebooks
VERSION = "2.0.7"
end