Github time!

Mispy 2013-11-08 06:02:05 +11:00
commit e87dc5862b
27 changed files with 20178 additions and 0 deletions

.gitignore vendored Normal file (+2)

@@ -0,0 +1,2 @@
.*.swp
pkg

Gemfile Normal file (+4)

@@ -0,0 +1,4 @@
source 'https://rubygems.org'

# Specify your gem's dependencies in twitter_ebooks.gemspec
gemspec

Gemfile.lock Normal file (+78)

@@ -0,0 +1,78 @@
PATH
  remote: .
  specs:
    twitter_ebooks (2.0.3)
      bloomfilter-rb
      engtagger
      fast-stemmer
      gingerice
      highscore
      htmlentities
      minitest
      rufus-scheduler
      tweetstream
      twitter

GEM
  remote: https://rubygems.org/
  specs:
    addressable (2.3.5)
    atomic (1.1.14)
    awesome_print (1.2.0)
    bloomfilter-rb (2.1.1)
      redis
    cookiejar (0.3.0)
    daemons (1.1.9)
    em-http-request (1.0.3)
      addressable (>= 2.2.3)
      cookiejar
      em-socksify
      eventmachine (>= 1.0.0.beta.4)
      http_parser.rb (>= 0.5.3)
    em-socksify (0.3.0)
      eventmachine (>= 1.0.0.beta.4)
    em-twitter (0.2.2)
      eventmachine (~> 1.0)
      http_parser.rb (~> 0.5)
      simple_oauth (~> 0.1)
    engtagger (0.1.2)
    eventmachine (1.0.3)
    faraday (0.8.8)
      multipart-post (~> 1.2.0)
    fast-stemmer (1.0.2)
    gingerice (1.2.1)
      addressable
      awesome_print
    highscore (1.1.0)
      whatlanguage (>= 1.0.0)
    htmlentities (4.3.1)
    http_parser.rb (0.5.3)
    minitest (5.0.8)
    multi_json (1.8.2)
    multipart-post (1.2.0)
    redis (3.0.5)
    rufus-scheduler (3.0.2)
      tzinfo
    simple_oauth (0.2.0)
    thread_safe (0.1.3)
      atomic
    tweetstream (2.5.0)
      daemons (~> 1.1)
      em-http-request (~> 1.0.2)
      em-twitter (~> 0.2)
      twitter (~> 4.5)
      yajl-ruby (~> 1.1)
    twitter (4.8.1)
      faraday (~> 0.8, < 0.10)
      multi_json (~> 1.0)
      simple_oauth (~> 0.2)
    tzinfo (1.1.0)
      thread_safe (~> 0.1)
    whatlanguage (1.0.5)
    yajl-ruby (1.1.0)

PLATFORMS
  ruby

DEPENDENCIES
  twitter_ebooks!

LICENSE Normal file (+22)

@@ -0,0 +1,22 @@
Copyright (c) 2013 Jaiden Mispy

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

NOTES.md Normal file (+4)

@@ -0,0 +1,4 @@
- Files in text/ are preprocessed by `rake consume` and serialized
  - e.g. text/foo.tweets becomes consumed/foo.corpus
- `rake consume` looks at hashes to know which it needs to update
- Preprocessed corpus files are loaded at runtime by Corpus.load('foo')

README.md Normal file (+9)

@@ -0,0 +1,9 @@
# twitter\_ebooks 2.0.7
Complete rewrite of twitter\_ebooks. Supports context-sensitive, responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.
## Installation
```bash
gem install twitter_ebooks
```
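
## Usage

A typical workflow with the bundled `ebooks` CLI (a sketch; the bot name, @user and file names below are placeholders):

```bash
ebooks new mybot                        # scaffold a new bot app from the skeleton
cd mybot
ebooks archive @someuser corpus.tweets  # fetch a user's timeline into a corpus file
ebooks consume corpus.tweets            # preprocess it into model/corpus.model
ebooks gen model/corpus.model           # generate a test statement locally
ebooks tweet model/corpus.model mybot   # post one statement (needs credentials in bots.rb)
```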

Rakefile Normal file (+2)

@@ -0,0 +1,2 @@
#!/usr/bin/env rake
require "bundler/gem_tasks"

bin/ebooks Executable file (+100)

@@ -0,0 +1,100 @@
#!/usr/bin/env ruby
require 'fileutils'
require 'twitter_ebooks'

module Ebooks
  APP_PATH = Dir.pwd # XXX do some recursive thing instead

  def self.new(reponame)
    usage = "Usage: ebooks new <reponame>"

    if reponame.nil?
      log usage
      exit
    end

    target = "./#{reponame}"

    if File.exists?(target)
      log "#{target} already exists. Please remove if you want to recreate."
      exit
    end

    FileUtils.cp_r(SKELETON_PATH, target)

    File.open(File.join(target, 'bots.rb'), 'w') do |f|
      template = File.read(File.join(SKELETON_PATH, 'bots.rb'))
      f.write(template.gsub("{{BOT_NAME}}", reponame))
    end

    log "New twitter_ebooks app created at #{target}"
  end

  def self.consume(paths)
    paths.each do |path|
      filename = File.basename(path)
      shortname = filename.split('.')[0..-2].join('.')
      hash = Digest::MD5.hexdigest(File.read(path))

      log "Consuming text corpus: #{filename}"
      outpath = File.join(APP_PATH, 'model', "#{shortname}.model")
      Model.consume(path).save(outpath)
      log "Corpus consumed"
    end
  end

  def self.gen(model_path, input)
    model = Model.load(model_path)
    if input && !input.empty?
      puts "@cmd " + model.markov_response(input, 135)
    else
      puts model.markov_statement
    end
  end

  def self.score(model_path, input)
    model = Model.load(model_path)
    model.score_interest(input)
  end

  def self.archive(username, outpath)
    Archiver.new(username, outpath).fetch_tweets
  end

  def self.tweet(modelpath, username)
    load File.join(APP_PATH, 'bots.rb')
    model = Model.load(modelpath)
    statement = model.markov_statement
    log "@#{username}: #{statement}"
    bot = Bot.get(username)
    bot.configure
    bot.tweet(statement)
  end

  def self.command(args)
    usage = """Usage:
     ebooks new <reponame>
     ebooks consume <corpus_path> [...]
     ebooks gen <model_path> [input]
     ebooks score <model_path> <input>
     ebooks archive <@user> <outpath>
     ebooks tweet <model_path> <@bot>
"""

    if args.length == 0
      log usage
      exit
    end

    case args[0]
    when "new" then new(args[1])
    when "consume" then consume(args[1..-1])
    when "gen" then gen(args[1], args[2..-1].join(' '))
    when "score" then score(args[1], args[2..-1].join(' '))
    when "archive" then archive(args[1], args[2])
    when "tweet" then tweet(args[1], args[2])
    end
  end
end

Ebooks.command(ARGV)

data/adjectives.txt Normal file (+1466)

File diff suppressed because it is too large

data/nouns.txt Normal file (+2193)

File diff suppressed because it is too large

data/stopwords.txt Normal file (+843)

@@ -0,0 +1,843 @@
a
able
about
above
abst
accordance
according
accordingly
across
act
actually
added
adj
affected
affecting
affects
after
afterwards
again
against
ah
all
almost
alone
along
already
also
although
always
am
among
amongst
an
and
announce
another
any
anybody
anyhow
anymore
anyone
anything
anyway
anyways
anywhere
apparently
approximately
are
aren
arent
arise
around
as
aside
ask
asking
at
auth
available
away
awfully
b
back
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
beginnings
begins
behind
being
believe
below
beside
besides
between
beyond
biol
both
brief
briefly
but
by
c
ca
came
can
cannot
can't
cause
causes
certain
certainly
co
com
come
comes
contain
containing
contains
could
couldnt
d
date
did
didn't
different
do
does
doesn't
doing
done
don't
down
downwards
due
during
e
each
ed
edu
effect
eg
eight
eighty
either
else
elsewhere
end
ending
enough
especially
et
et-al
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
except
f
far
few
ff
fifth
first
five
fix
followed
following
follows
for
former
formerly
forth
found
four
from
further
furthermore
g
gave
get
gets
getting
give
given
gives
giving
go
goes
gone
got
gotten
h
had
happens
hardly
has
hasn't
have
haven't
having
he
hed
hence
her
here
hereafter
hereby
herein
heres
hereupon
hers
herself
hes
hi
hid
him
himself
his
hither
home
how
howbeit
however
hundred
i
id
ie
if
i'll
im
immediate
immediately
importance
important
in
inc
indeed
index
information
instead
into
invention
inward
is
isn't
it
itd
it'll
its
itself
i've
j
just
k
keep
keeps
kept
kg
km
know
known
knows
l
largely
last
lately
later
latter
latterly
least
less
lest
let
lets
like
liked
likely
line
little
'll
look
looking
looks
ltd
m
made
mainly
make
makes
many
may
maybe
me
mean
means
meantime
meanwhile
merely
mg
might
million
miss
ml
more
moreover
most
mostly
mr
mrs
much
mug
must
my
myself
n
na
name
namely
nay
nd
near
nearly
necessarily
necessary
need
needs
neither
never
nevertheless
new
next
nine
ninety
no
nobody
non
none
nonetheless
noone
nor
normally
nos
not
noted
nothing
now
nowhere
o
obtain
obtained
obviously
of
off
often
oh
ok
okay
old
omitted
on
once
one
ones
only
onto
or
ord
other
others
otherwise
ought
our
ours
ourselves
out
outside
over
overall
owing
own
p
page
pages
part
particular
particularly
past
per
perhaps
placed
please
plus
poorly
possible
possibly
potentially
pp
predominantly
present
previously
primarily
probably
promptly
proud
provides
put
q
que
quickly
quite
qv
r
ran
rather
rd
re
readily
really
recent
recently
ref
refs
regarding
regardless
regards
related
relatively
research
respectively
resulted
resulting
results
right
run
s
said
same
saw
say
saying
says
sec
section
see
seeing
seem
seemed
seeming
seems
seen
self
selves
sent
seven
several
shall
she
shed
she'll
shes
should
shouldn't
show
showed
shown
showns
shows
significant
significantly
similar
similarly
since
six
slightly
so
some
somebody
somehow
someone
somethan
something
sometime
sometimes
somewhat
somewhere
soon
sorry
specifically
specified
specify
specifying
still
stop
strongly
sub
substantially
successfully
such
sufficiently
suggest
sup
sure
t
take
taken
taking
tell
tends
th
than
thank
thanks
thanx
that
that'll
thats
that've
the
their
theirs
them
themselves
then
thence
there
thereafter
thereby
thered
therefore
therein
there'll
thereof
therere
theres
thereto
thereupon
there've
these
they
theyd
they'll
theyre
they've
think
this
those
thou
though
thoughh
thousand
throug
through
throughout
thru
thus
til
tip
to
together
too
took
toward
towards
tried
tries
truly
try
trying
ts
twice
two
u
un
under
unfortunately
unless
unlike
unlikely
until
unto
up
upon
ups
us
use
used
useful
usefully
usefulness
uses
using
usually
v
value
various
've
very
via
viz
vol
vols
vs
w
want
wants
was
wasn't
way
we
wed
welcome
we'll
went
were
weren't
we've
what
whatever
what'll
whats
when
whence
whenever
where
whereafter
whereas
whereby
wherein
wheres
whereupon
wherever
whether
which
while
whim
whither
who
whod
whoever
whole
who'll
whom
whomever
whos
whose
why
widely
willing
wish
with
within
without
won't
words
world
would
wouldn't
www
x
y
yes
yet
you
youd
you'll
your
youre
yours
yourself
yourselves
you've
z
zero
.
?
!
http
don
people
well
will
https
time
good
thing
twitter
pretty
it's
i'm
that's
you're
they're
there's
things
yeah
find
going
work
point
years
guess
bad
problem
real
kind
day
better
lot
stuff
i'd
read
thought
idea
case
word
hey
person
long
Dear
internet
tweet
he's
feel
wrong
call
hard
phone
ago
literally
remember
reason
called
course
bit
question
high
today
told
man
actual
year
three
book
assume
life
true
best
wow
video
times
works
fact
completely
totally
imo
open
lol
haha
cool
yep
ooh
great
ugh
tonight
talk
sounds
hahaha
whoa
cool
we're
guys
sweet
fortunately
hmm
aren't
sadly
talking
you'd
place
yup
what's
y'know
basically
god
shit
holy
interesting
news
guy
wait
oooh
gonna
current
let's
tomorrow
omg
hate
hope
fuck
oops
night
wear
wanna
fun
finally
whoops
nevermind
definitely
context
screen
free
exactly
big
house
half
working
play
heard
hmmm
damn
woah
tho
set
idk
sort
understand
kinda
seriously
btw
she's
hah
aww
ffs
it'd
that'd
hopefully
non
entirely
lots
entire
tend
hullo
clearly
surely
weird
start
help
nope

lib/twitter_ebooks.rb Normal file (+20)

@@ -0,0 +1,20 @@
gem 'minitest'

def log(*args)
  STDERR.puts args.map(&:to_s).join(' ')
  STDERR.flush
end

module Ebooks
  GEM_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
  DATA_PATH = File.join(GEM_PATH, 'data')
  SKELETON_PATH = File.join(GEM_PATH, 'skeleton')
  TEST_PATH = File.join(GEM_PATH, 'test')
  TEST_CORPUS_PATH = File.join(TEST_PATH, 'corpus/0xabad1dea.tweets')
end

require 'twitter_ebooks/nlp'
require 'twitter_ebooks/archiver'
require 'twitter_ebooks/markov'
require 'twitter_ebooks/model'
require 'twitter_ebooks/bot'

lib/twitter_ebooks/archiver.rb Normal file (+82)

@@ -0,0 +1,82 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter'

module Ebooks
  class Archiver
    def initialize(username, outpath)
      @username = username
      @outpath = outpath
      @client = Twitter::Client.new
    end

    # Read existing corpus into memory.
    # Returns the list of tweet lines and the last tweet id.
    def read_corpus
      lines = []
      since_id = nil
      if File.exists?(@outpath)
        lines = File.read(@outpath).split("\n")
        if lines[0].start_with?('#')
          since_id = lines[0].split('# ').last
        end
      end
      [lines, since_id]
    end

    # Retrieve all available tweets for the user since the last tweet id
    def tweets_since(since_id)
      page = 1
      retries = 0
      tweets = []
      max_id = nil

      opts = {
        count: 200,
        include_rts: false,
        trim_user: true
      }
      opts[:since_id] = since_id unless since_id.nil?

      loop do
        opts[:max_id] = max_id unless max_id.nil?
        new = @client.user_timeline(@username, opts)
        break if new.length <= 1
        puts "Received #{new.length} tweets"
        tweets += new
        max_id = new.last.id
      end

      tweets
    end

    def fetch_tweets
      lines, since_id = read_corpus

      if since_id.nil?
        puts "Retrieving tweets from @#{@username}"
      else
        puts "Retrieving tweets from @#{@username} since #{since_id}"
      end

      tweets = tweets_since(since_id)

      if tweets.length == 0
        puts "No new tweets"
        return
      end

      new_lines = tweets.map { |tweet| tweet.text.gsub("\n", " ") }
      new_since_id = tweets[0].id.to_s
      lines = ["# " + new_since_id] + new_lines + lines

      corpus = File.open(@outpath, 'w')
      corpus.write(lines.join("\n"))
      corpus.close
    end
  end
end
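
The corpus file the Archiver maintains is plain text: one tweet per line, newest first, with the most recent tweet id kept on a leading comment line so the next run can fetch incrementally. An illustrative file (id and text made up):

```
# 398874915884167168
some recent tweet, newlines collapsed to spaces
an older tweet
```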

lib/twitter_ebooks/bot.rb Normal file (+164)

@@ -0,0 +1,164 @@
#!/usr/bin/env ruby

require 'twitter'
require 'tweetstream'
require 'rufus/scheduler'

module Ebooks
  class Bot
    attr_accessor :consumer_key, :consumer_secret,
                  :oauth_token, :oauth_token_secret
    attr_accessor :username

    attr_reader :twitter, :stream

    @@all = [] # List of all defined bots
    def self.all; @@all; end

    def self.get(name)
      all.find { |bot| bot.username == name }
    end

    def initialize(username, &b)
      # Set defaults
      @username = username

      # Override with callback
      b.call(self)

      Bot.all.push(self)
    end

    def log(*args)
      STDERR.puts "@#{@username}: " + args.map(&:to_s).join(' ')
      STDERR.flush
    end

    def configure
      TweetStream.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end

      Twitter.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end

      @twitter = Twitter::Client.new
      @stream = TweetStream::Client.new
    end

    # Connects to tweetstream and opens event handlers for this bot
    def start
      configure

      @on_startup.call if @on_startup

      @stream.on_error do |msg|
        log "ERROR: #{msg}"
      end

      @stream.on_inited do
        log "Online!"
      end

      @stream.on_event(:follow) do |event|
        next if event[:source][:screen_name] == @username
        log "Followed by #{event[:source][:screen_name]}"
        @on_follow.call(event[:source])
      end

      @stream.on_direct_message do |dm|
        next if dm[:sender][:screen_name] == @username # Don't reply to self
        log "DM from @#{dm[:sender][:screen_name]}: #{dm[:text]}"
        @on_message.call(dm)
      end

      @stream.userstream do |ev|
        next unless ev[:text] # If it's not a text-containing tweet, ignore it
        next if ev[:user][:screen_name] == @username # Ignore our own tweets

        meta = {}
        mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }

        reply_mentions = mentions.reject { |m| m.downcase == @username.downcase }
        reply_mentions = [ev[:user][:screen_name]] + reply_mentions

        meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
        meta[:limit] = 140 - meta[:reply_prefix].length

        mless = ev[:text]
        begin
          ev.attrs[:entities][:user_mentions].reverse.each do |entity|
            mless = mless[0...entity[:indices][0]] + mless[entity[:indices][1]..-1]
          end
        rescue Exception
          p ev.attrs[:entities][:user_mentions]
          p ev[:text]
          raise
        end
        meta[:mentionless] = mless

        # To check if this is a mention, ensure:
        # - The tweet mentions list contains our username
        # - The tweet is not being retweeted by somebody else
        # - Or soft-retweeted by somebody else
        if mentions.map(&:downcase).include?(@username.downcase) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
          log "Mention from @#{ev[:user][:screen_name]}: #{ev[:text]}"
          @on_mention.call(ev, meta)
        else
          @on_timeline.call(ev, meta)
        end
      end
    end

    # Wrapper for EM.add_timer
    # Delays add a greater sense of humanity to bot behaviour
    def delay(time, &b)
      time = time.to_a.sample unless time.is_a? Integer
      EM.add_timer(time, &b)
    end

    # Reply to a tweet or a DM.
    # Applies the configurable @reply_delay range
    def reply(ev, text, opts={})
      opts = opts.clone
      delay = @reply_delay.to_a.sample

      if ev.is_a? Twitter::DirectMessage
        log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
        @twitter.direct_message_create(ev[:sender][:screen_name], text, opts)
      elsif ev.is_a? Twitter::Tweet
        log "Replying to @#{ev[:user][:screen_name]} with: #{text}"
        @twitter.update(text, in_reply_to_status_id: ev[:id])
      else
        raise ArgumentError, "Don't know how to reply to a #{ev.class}"
      end
    end

    def scheduler
      @scheduler ||= Rufus::Scheduler.new
    end

    def follow(*args)
      log "Following #{args}"
      @twitter.follow(*args)
    end

    def tweet(*args)
      log "Tweeting #{args.inspect}"
      @twitter.update(*args)
    end

    def on_startup(&b);  @on_startup  = b; end
    def on_follow(&b);   @on_follow   = b; end
    def on_mention(&b);  @on_mention  = b; end
    def on_timeline(&b); @on_timeline = b; end
    def on_message(&b);  @on_message  = b; end
  end
end
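
A sketch of how the meta hash and the delay wrapper above combine inside a handler (the timing range and reply text are hypothetical; see skeleton/bots.rb below for the full template):

```ruby
bot.on_mention do |tweet, meta|
  # meta[:reply_prefix] => "@sender @other " (the bot's own name excluded)
  # meta[:limit]        => 140 minus the prefix length
  # meta[:mentionless]  => the tweet text with @mentions stripped
  bot.delay(2..6) do # wait 2-6 seconds so replies feel less mechanical
    bot.reply(tweet, meta[:reply_prefix] + "oh hullo")
  end
end
```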

lib/twitter_ebooks/markov.rb Normal file (+81)

@@ -0,0 +1,81 @@
module Ebooks
  # The special INTERIM token represents a sentence boundary.
  # This is so we can include the start and end of statements in the model.
  # Due to the way the sentence tokenizer works, it can correspond
  # to multiple actual parts of the text (such as ^, $, \n and .?!)
  INTERIM = :interim

  # This is an ngram-based Markov model optimized to build from a
  # tokenized sentence list without requiring too much transformation
  class MarkovModel
    def self.build(sentences)
      MarkovModel.new.consume(sentences)
    end

    def consume(sentences)
      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
      # We map by both bigrams and unigrams so we can fall back to the latter in
      # cases where an input bigram is unavailable, such as starting a sentence
      @sentences = sentences
      @unigrams = {}
      @bigrams = {}

      sentences.each_with_index do |tokens, i|
        last_token = INTERIM
        tokens.each_with_index do |token, j|
          @unigrams[last_token] ||= []
          @unigrams[last_token] << [i, j]

          @bigrams[last_token] ||= {}
          @bigrams[last_token][token] ||= []

          if j == tokens.length-1 # Mark sentence endings
            @unigrams[token] ||= []
            @unigrams[token] << INTERIM
            @bigrams[last_token][token] << INTERIM
          else
            @bigrams[last_token][token] << [i, j+1]
          end

          last_token = token
        end
      end

      self
    end

    def find_token(index)
      if index == INTERIM
        INTERIM
      else
        @sentences[index[0]][index[1]]
      end
    end

    def chain(tokens)
      if tokens.length == 1
        matches = @unigrams[tokens[0]]
      else
        matches = (@bigrams[tokens[-2]] || {})[tokens[-1]]
      end

      if matches.nil? || matches.empty?
        # This should never happen unless a strange token is
        # supplied from outside the dataset
        raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
      end

      next_token = find_token(matches.sample)

      if next_token == INTERIM # We chose to end the sentence
        return tokens
      else
        return chain(tokens + [next_token])
      end
    end

    def generate
      NLP.reconstruct(chain([INTERIM]))
    end
  end
end
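
A toy illustration of the structures consume builds and how generate walks them (the sentences here are hypothetical, pre-tokenized input):

```ruby
require 'twitter_ebooks'

sentences = [%w[the cat sat], %w[the dog sat]]
model = Ebooks::MarkovModel.build(sentences)
# @unigrams maps a token to the positions that may follow it, e.g.
#   :interim => [[0, 0], [1, 0]]  # either sentence may start a chain
#   "the"    => [[0, 1], [1, 1]]  # "cat" or "dog" may follow "the"
# @bigrams does the same keyed by token pairs, with INTERIM marking sentence ends
puts model.generate # => "the cat sat" or "the dog sat"
```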

lib/twitter_ebooks/model.rb Normal file (+120)

@@ -0,0 +1,120 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'json'
require 'set'
require 'digest/md5'

module Ebooks
  class Model
    attr_accessor :hash, :sentences, :markov, :keywords

    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end

    def self.load(path)
      Marshal.load(File.read(path))
    end

    def consume(txtpath)
      # Record hash of source file so we know to update later
      @hash = Digest::MD5.hexdigest(File.read(txtpath))

      text = File.read(txtpath)
      log "Removing commented lines and mention tokens"

      lines = text.split("\n")
      keeping = []
      lines.each do |l|
        next if l.start_with?('#') || l.include?('RT')
        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
        keeping << processed.join(' ')
      end
      text = NLP.normalize(keeping.join("\n"))

      log "Segmenting text into sentences"
      sentences = NLP.sentences(text)

      log "Tokenizing #{sentences.length} sentences"
      @sentences = sentences.map { |sent| NLP.tokenize(sent) }

      log "Ranking keywords"
      @keywords = NLP.keywords(@sentences)

      self
    end

    def save(path)
      File.open(path, 'w') do |f|
        f.write(Marshal.dump(self))
      end
      self
    end

    def fix(tweet)
      # This seems to require an external api call
      #begin
      #  fixer = NLP.gingerice.parse(tweet)
      #  log fixer if fixer['corrections']
      #  tweet = fixer['result']
      #rescue Exception => e
      #  log e.message
      #  log e.backtrace
      #end

      NLP.htmlentities.decode tweet
    end

    def markov_statement(limit=140, markov=nil)
      markov ||= MarkovModel.build(@sentences)
      tweet = ""

      while (tweet = markov.generate) do
        next if tweet.length > limit
        next if NLP.unmatched_enclosers?(tweet)
        break if tweet.length > limit*0.4 || rand > 0.8
      end

      fix tweet
    end

    # Finds all relevant tokenized sentences to given input by
    # comparing non-stopword token overlaps
    def relevant_sentences(input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input)

      @sentences.each do |sent|
        tokenized.each do |token|
          if sent.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller markov model from these
    def markov_response(input, limit=140)
      # First try to build a model from the directly relevant sentences
      relevant, slightly_relevant = relevant_sentences(input)

      if relevant.length >= 3
        markov = MarkovModel.new.consume(relevant)
        markov_statement(limit, markov)
      elsif slightly_relevant.length > 5
        markov = MarkovModel.new.consume(slightly_relevant)
        markov_statement(limit, markov)
      else
        markov_statement(limit)
      end
    end
  end
end
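
End to end, the Model API above can be driven directly from Ruby (the paths here are hypothetical):

```ruby
require 'twitter_ebooks'

model = Ebooks::Model.consume('corpus.tweets') # clean, segment and tokenize the corpus
model.save('model/corpus.model')

model = Ebooks::Model.load('model/corpus.model')
puts model.markov_statement(140)        # a random statement of at most 140 chars
puts model.markov_response('cats', 140) # biased toward sentences sharing tokens with the input
```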

lib/twitter_ebooks/nlp.rb Normal file (+154)

@@ -0,0 +1,154 @@
# encoding: utf-8
require 'fast-stemmer'
require 'highscore'

module Ebooks
  module NLP
    # We deliberately limit our punctuation handling to stuff we can do consistently
    # It'll just be a part of another token if we don't split it out, and that's fine
    PUNCTUATION = ".?!,"

    # Lazy-load NLP libraries and resources
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time
    def self.stopwords
      @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
    end

    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    # POS tagger
    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

    # Gingerice text correction service
    def self.gingerice
      require 'gingerice'
      Gingerice::Parser.new # No caching for this one
    end

    # For decoding html entities
    def self.htmlentities
      require 'htmlentities'
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions

    # We don't really want to deal with all this weird unicode punctuation
    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end

    # Split text into sentences
    # We use an ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end

    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
      sentence.split(regex)
    end

    def self.stem(word)
      Stemmer::stem_word(word.downcase)
    end

    def self.keywords(sentences)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
      text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')

      text = Highscore::Content.new(text)

      text.configure do
        #set :multiplier, 2
        #set :upper_case, 3
        #set :long_words, 2
        #set :long_words_threshold, 15
        #set :vowels, 1                    # => default: 0 = not considered
        #set :consonants, 5                # => default: 0 = not considered
        #set :ignore_case, true            # => default: false
        set :word_pattern, /(?<!@)(?<=\s)[\w']+/ # => default: /\w+/
        #set :stemming, true               # => default: false
      end

      text.keywords
    end

    # Takes a list of tokens and builds a nice-looking sentence
    def self.reconstruct(tokens)
      text = ""
      last_token = nil
      tokens.each do |token|
        next if token == INTERIM
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

    # Determine if we need to insert a space between two tokens
    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)
      if p1 && p2 # "foo?!"
        false
      elsif !p1 && p2 # "foo."
        false
      elsif p1 && !p2 # "foo. rah"
        true
      else # "foo rah"
        true
      end
    end

    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
    end

    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the markov generator; we can just tell it to retry
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0
        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)
          return true if opened < 0 # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end
      false
    end
  end
end
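
A quick round trip through the tokenizer and reconstructor above (the input string is hypothetical):

```ruby
require 'twitter_ebooks'

tokens = Ebooks::NLP.tokenize("i like cats. do you?")
# => ["i", "like", "cats", ".", "do", "you", "?"]
puts Ebooks::NLP.reconstruct(tokens)
# => "i like cats. do you?"
```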

lib/twitter_ebooks/version.rb Normal file (+3)

@@ -0,0 +1,3 @@
module Ebooks
  VERSION = "2.0.7"
end

script/process_anc_data.rb Executable file (+19)

@@ -0,0 +1,19 @@
#!/usr/bin/env ruby
# encoding: utf-8
require 'json'

freqmap = {}

data = File.read("data/ANC-all-count.txt")
# The source file is Latin-1; reinterpret each byte as a codepoint to get UTF-8
data = data.unpack("C*").pack("U*")

data.lines.each do |l|
  vals = l.split("\t")
  # First field is the word, last field is its total frequency count
  freqmap[vals[0]] = vals[-1].to_i
end

File.open("data/wordfreq.json", 'w') do |f|
  f.write(JSON.dump(freqmap))
end

skeleton/.gitignore vendored Normal file (+1)

@@ -0,0 +1 @@
corpus/*

skeleton/Procfile Normal file (+1)

@@ -0,0 +1 @@
worker: ruby run.rb start

skeleton/bots.rb Normal file (+41)

@@ -0,0 +1,41 @@
#!/usr/bin/env ruby
require 'twitter_ebooks'

# This is an example bot definition with event handlers commented out
# You can define as many of these as you like; they will run simultaneously

Ebooks::Bot.new("{{BOT_NAME}}") do |bot|
  # Consumer details come from registering an app at https://dev.twitter.com/
  # OAuth details can be fetched with https://github.com/marcel/twurl
  bot.consumer_key = "" # Your app consumer key
  bot.consumer_secret = "" # Your app consumer secret
  bot.oauth_token = "" # Token connecting the app to this account
  bot.oauth_token_secret = "" # Secret connecting the app to this account

  bot.on_message do |dm|
    # Reply to a DM
    # bot.reply(dm, "secret secrets")
  end

  bot.on_follow do |user|
    # Follow a user back
    # bot.follow(user[:screen_name])
  end

  bot.on_mention do |tweet, meta|
    # Reply to a mention
    # bot.reply(tweet, meta[:reply_prefix] + "oh hullo")
  end

  bot.on_timeline do |tweet, meta|
    # Reply to a tweet in the bot's timeline
    # bot.reply(tweet, meta[:reply_prefix] + "nice tweet")
  end

  bot.scheduler.every '24h' do
    # Tweet something every 24 hours
    # See https://github.com/jmettraux/rufus-scheduler
    # bot.tweet("hi")
  end
end

skeleton/run.rb Executable file (+9)

@@ -0,0 +1,9 @@
#!/usr/bin/env ruby
require_relative 'bots'

EM.run do
  Ebooks::Bot.all.each do |bot|
    bot.start
  end
end

test/corpus/0xabad1dea.tweets Normal file (+14696)

File diff suppressed because it is too large

test/keywords.rb Executable file (+18)

@@ -0,0 +1,18 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter_ebooks'
require 'minitest/autorun'
require 'benchmark'

module Ebooks
  class TestKeywords < Minitest::Test
    corpus = NLP.normalize(File.read(ARGV[0]))
    # NLP.keywords expects tokenized sentences, as produced by Model#consume
    sents = NLP.sentences(corpus).map { |sent| NLP.tokenize(sent) }

    puts "Finding and ranking keywords"
    puts Benchmark.measure {
      NLP.keywords(sents).top(50).each do |keyword|
        puts "#{keyword.text} #{keyword.weight}"
      end
    }
  end
end

test/tokenize.rb Executable file (+18)

@@ -0,0 +1,18 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter_ebooks'
require 'minitest/autorun'

module Ebooks
  class TestTokenize < Minitest::Test
    corpus = NLP.normalize(File.read(TEST_CORPUS_PATH))
    sents = NLP.sentences(corpus).sample(10)
    sents.each do |sent|
      p sent
      p NLP.tokenize(sent)
      puts
    end
  end
end

twitter_ebooks.gemspec Normal file (+28)

@@ -0,0 +1,28 @@
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/twitter_ebooks/version', __FILE__)

Gem::Specification.new do |gem|
  gem.authors       = ["Jaiden Mispy"]
  gem.email         = ["^_^@mispy.me"]
  gem.description   = %q{Markov chains for all your friends~}
  gem.summary       = %q{Markov chains for all your friends~}
  gem.homepage      = ""

  gem.files         = `git ls-files`.split($\)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "twitter_ebooks"
  gem.require_paths = ["lib"]
  gem.version       = Ebooks::VERSION

  gem.add_runtime_dependency 'minitest'
  gem.add_runtime_dependency 'twitter'
  gem.add_runtime_dependency 'tweetstream'
  gem.add_runtime_dependency 'rufus-scheduler'
  gem.add_runtime_dependency 'gingerice'
  gem.add_runtime_dependency 'htmlentities'
  gem.add_runtime_dependency 'engtagger'
  gem.add_runtime_dependency 'fast-stemmer'
  gem.add_runtime_dependency 'highscore'
end