Github time!
commit e87dc5862b
27 changed files with 20178 additions and 0 deletions
.gitignore (vendored, new file, 2 additions)
@@ -0,0 +1,2 @@
.*.swp
pkg

Gemfile (new file, 4 additions)
@@ -0,0 +1,4 @@
source 'https://rubygems.org'

# Specify your gem's dependencies in twitter_ebooks.gemspec
gemspec

Gemfile.lock (new file, 78 additions)
@@ -0,0 +1,78 @@
PATH
  remote: .
  specs:
    twitter_ebooks (2.0.3)
      bloomfilter-rb
      engtagger
      fast-stemmer
      gingerice
      highscore
      htmlentities
      minitest
      rufus-scheduler
      tweetstream
      twitter

GEM
  remote: https://rubygems.org/
  specs:
    addressable (2.3.5)
    atomic (1.1.14)
    awesome_print (1.2.0)
    bloomfilter-rb (2.1.1)
      redis
    cookiejar (0.3.0)
    daemons (1.1.9)
    em-http-request (1.0.3)
      addressable (>= 2.2.3)
      cookiejar
      em-socksify
      eventmachine (>= 1.0.0.beta.4)
      http_parser.rb (>= 0.5.3)
    em-socksify (0.3.0)
      eventmachine (>= 1.0.0.beta.4)
    em-twitter (0.2.2)
      eventmachine (~> 1.0)
      http_parser.rb (~> 0.5)
      simple_oauth (~> 0.1)
    engtagger (0.1.2)
    eventmachine (1.0.3)
    faraday (0.8.8)
      multipart-post (~> 1.2.0)
    fast-stemmer (1.0.2)
    gingerice (1.2.1)
      addressable
      awesome_print
    highscore (1.1.0)
      whatlanguage (>= 1.0.0)
    htmlentities (4.3.1)
    http_parser.rb (0.5.3)
    minitest (5.0.8)
    multi_json (1.8.2)
    multipart-post (1.2.0)
    redis (3.0.5)
    rufus-scheduler (3.0.2)
      tzinfo
    simple_oauth (0.2.0)
    thread_safe (0.1.3)
      atomic
    tweetstream (2.5.0)
      daemons (~> 1.1)
      em-http-request (~> 1.0.2)
      em-twitter (~> 0.2)
      twitter (~> 4.5)
      yajl-ruby (~> 1.1)
    twitter (4.8.1)
      faraday (~> 0.8, < 0.10)
      multi_json (~> 1.0)
      simple_oauth (~> 0.2)
    tzinfo (1.1.0)
      thread_safe (~> 0.1)
    whatlanguage (1.0.5)
    yajl-ruby (1.1.0)

PLATFORMS
  ruby

DEPENDENCIES
  twitter_ebooks!

LICENSE (new file, 22 additions)
@@ -0,0 +1,22 @@
Copyright (c) 2013 Jaiden Mispy

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

NOTES.md (new file, 4 additions)
@@ -0,0 +1,4 @@
- Files in text/ are preprocessed by `rake consume` and serialized
  - e.g. text/foo.tweets becomes consumed/foo.corpus
- `rake consume` looks at hashes to know which it needs to update
- Preprocessed corpus files are loaded at runtime by Corpus.load('foo')

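Note that the hash check described in these notes is not actually wired up in this commit (`bin/ebooks consume` computes a digest but never compares it). A minimal sketch of the described check, using the `Model#hash` field this commit does store; `needs_update?` is a hypothetical helper, not part of the gem:

```ruby
require 'digest/md5'
require 'twitter_ebooks'

# Rebuild only when the source text no longer matches the digest
# recorded in the serialized model (Model#hash, set by Model#consume).
def needs_update?(txtpath, model_path)
  return true unless File.exists?(model_path)
  Ebooks::Model.load(model_path).hash != Digest::MD5.hexdigest(File.read(txtpath))
end
```
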
README.md (new file, 9 additions)
@@ -0,0 +1,9 @@
# twitter\_ebooks 2.0.7

Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.

## Installation

```bash
gem install twitter_ebooks
```

Rakefile (new file, 2 additions)
@@ -0,0 +1,2 @@
#!/usr/bin/env rake
require "bundler/gem_tasks"

bin/ebooks (new executable file, 100 additions)
@@ -0,0 +1,100 @@
#!/usr/bin/env ruby

require 'fileutils'
require 'twitter_ebooks'

module Ebooks
  APP_PATH = Dir.pwd # XXX do some recursive thing instead

  def self.new(reponame)
    usage = "Usage: ebooks new <reponame>"

    if reponame.nil?
      log usage
      exit
    end

    target = "./#{reponame}"

    if File.exists?(target)
      log "#{target} already exists. Please remove if you want to recreate."
      exit
    end

    FileUtils.cp_r(SKELETON_PATH, target)

    File.open(File.join(target, 'bots.rb'), 'w') do |f|
      template = File.read(File.join(SKELETON_PATH, 'bots.rb'))
      f.write(template.gsub("{{BOT_NAME}}", reponame))
    end

    log "New twitter_ebooks app created at #{target}"
  end

  def self.consume(paths)
    paths.each do |path|
      filename = File.basename(path)
      shortname = filename.split('.')[0..-2].join('.')
      hash = Digest::MD5.hexdigest(File.read(path))

      log "Consuming text corpus: #{filename}"
      outpath = File.join(APP_PATH, 'model', "#{shortname}.model")
      Model.consume(path).save(outpath)
      log "Corpus consumed"
    end
  end

  def self.gen(model_path, input)
    model = Model.load(model_path)
    if input && !input.empty?
      puts "@cmd " + model.markov_response(input, 135)
    else
      puts model.markov_statement
    end
  end

  def self.score(model_path, input)
    model = Model.load(model_path)
    model.score_interest(input)
  end

  def self.archive(username, outpath)
    Archiver.new(username, outpath).fetch_tweets
  end

  def self.tweet(modelpath, username)
    load File.join(APP_PATH, 'bots.rb')
    model = Model.load(modelpath)
    statement = model.markov_statement
    log "@#{username}: #{statement}"
    bot = Bot.get(username)
    bot.configure
    bot.tweet(statement)
  end

  def self.command(args)
    usage = <<-USAGE
Usage:
  ebooks new <reponame>
  ebooks consume <corpus_path> [...]
  ebooks gen <model_path> [input]
  ebooks score <model_path> <input>
  ebooks archive <@user> <outpath>
  ebooks tweet <model_path> <@bot>
    USAGE

    if args.length == 0
      log usage
      exit
    end

    case args[0]
    when "new" then new(args[1])
    when "consume" then consume(args[1..-1])
    when "gen" then gen(args[1], args[2..-1].join(' '))
    when "score" then score(args[1], args[2..-1].join(' '))
    when "archive" then archive(args[1], args[2])
    when "tweet" then tweet(args[1], args[2])
    end
  end
end

Ebooks.command(ARGV)

data/adjectives.txt (new file, 1466 additions)
(file diff suppressed because it is too large)

data/nouns.txt (new file, 2193 additions)
(file diff suppressed because it is too large)

data/stopwords.txt (new file, 843 additions)
@@ -0,0 +1,843 @@
a
able
about
above
abst
accordance
according
accordingly
across
act
actually
added
adj
affected
affecting
affects
after
afterwards
again
against
ah
all
almost
alone
along
already
also
although
always
am
among
amongst
an
and
announce
another
any
anybody
anyhow
anymore
anyone
anything
anyway
anyways
anywhere
apparently
approximately
are
aren
arent
arise
around
as
aside
ask
asking
at
auth
available
away
awfully
b
back
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
beginnings
begins
behind
being
believe
below
beside
besides
between
beyond
biol
both
brief
briefly
but
by
c
ca
came
can
cannot
can't
cause
causes
certain
certainly
co
com
come
comes
contain
containing
contains
could
couldnt
d
date
did
didn't
different
do
does
doesn't
doing
done
don't
down
downwards
due
during
e
each
ed
edu
effect
eg
eight
eighty
either
else
elsewhere
end
ending
enough
especially
et
et-al
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
except
f
far
few
ff
fifth
first
five
fix
followed
following
follows
for
former
formerly
forth
found
four
from
further
furthermore
g
gave
get
gets
getting
give
given
gives
giving
go
goes
gone
got
gotten
h
had
happens
hardly
has
hasn't
have
haven't
having
he
hed
hence
her
here
hereafter
hereby
herein
heres
hereupon
hers
herself
hes
hi
hid
him
himself
his
hither
home
how
howbeit
however
hundred
i
id
ie
if
i'll
im
immediate
immediately
importance
important
in
inc
indeed
index
information
instead
into
invention
inward
is
isn't
it
itd
it'll
its
itself
i've
j
just
k
keep
keeps
kept
kg
km
know
known
knows
l
largely
last
lately
later
latter
latterly
least
less
lest
let
lets
like
liked
likely
line
little
'll
look
looking
looks
ltd
m
made
mainly
make
makes
many
may
maybe
me
mean
means
meantime
meanwhile
merely
mg
might
million
miss
ml
more
moreover
most
mostly
mr
mrs
much
mug
must
my
myself
n
na
name
namely
nay
nd
near
nearly
necessarily
necessary
need
needs
neither
never
nevertheless
new
next
nine
ninety
no
nobody
non
none
nonetheless
noone
nor
normally
nos
not
noted
nothing
now
nowhere
o
obtain
obtained
obviously
of
off
often
oh
ok
okay
old
omitted
on
once
one
ones
only
onto
or
ord
other
others
otherwise
ought
our
ours
ourselves
out
outside
over
overall
owing
own
p
page
pages
part
particular
particularly
past
per
perhaps
placed
please
plus
poorly
possible
possibly
potentially
pp
predominantly
present
previously
primarily
probably
promptly
proud
provides
put
q
que
quickly
quite
qv
r
ran
rather
rd
re
readily
really
recent
recently
ref
refs
regarding
regardless
regards
related
relatively
research
respectively
resulted
resulting
results
right
run
s
said
same
saw
say
saying
says
sec
section
see
seeing
seem
seemed
seeming
seems
seen
self
selves
sent
seven
several
shall
she
shed
she'll
shes
should
shouldn't
show
showed
shown
showns
shows
significant
significantly
similar
similarly
since
six
slightly
so
some
somebody
somehow
someone
somethan
something
sometime
sometimes
somewhat
somewhere
soon
sorry
specifically
specified
specify
specifying
still
stop
strongly
sub
substantially
successfully
such
sufficiently
suggest
sup
sure
t
take
taken
taking
tell
tends
th
than
thank
thanks
thanx
that
that'll
thats
that've
the
their
theirs
them
themselves
then
thence
there
thereafter
thereby
thered
therefore
therein
there'll
thereof
therere
theres
thereto
thereupon
there've
these
they
theyd
they'll
theyre
they've
think
this
those
thou
though
thoughh
thousand
throug
through
throughout
thru
thus
til
tip
to
together
too
took
toward
towards
tried
tries
truly
try
trying
ts
twice
two
u
un
under
unfortunately
unless
unlike
unlikely
until
unto
up
upon
ups
us
use
used
useful
usefully
usefulness
uses
using
usually
v
value
various
've
very
via
viz
vol
vols
vs
w
want
wants
was
wasn't
way
we
wed
welcome
we'll
went
were
weren't
we've
what
whatever
what'll
whats
when
whence
whenever
where
whereafter
whereas
whereby
wherein
wheres
whereupon
wherever
whether
which
while
whim
whither
who
whod
whoever
whole
who'll
whom
whomever
whos
whose
why
widely
willing
wish
with
within
without
won't
words
world
would
wouldn't
www
x
y
yes
yet
you
youd
you'll
your
youre
yours
yourself
yourselves
you've
z
zero
.
?
!

http
don
people
well
will
https
time
good
thing
twitter
pretty
it's
i'm
that's
you're
they're
there's
things
yeah
find
going
work
point
years
guess
bad
problem
real
kind
day
better
lot
stuff
i'd
read
thought
idea
case
word
hey
person
long
Dear
internet
tweet
he's
feel
wrong
call
hard
phone
ago
literally
remember
reason
called
course
bit
question
high
today
told
man
actual
year
three
book
assume
life
true
best
wow
video
times
works
fact
completely
totally
imo
open
lol
haha
cool
yep
ooh
great
ugh
tonight
talk
sounds
hahaha
whoa
cool
we're
guys
sweet
fortunately
hmm
aren't
sadly
talking
you'd
place
yup
what's
y'know
basically
god
shit
holy
interesting
news
guy
wait
oooh
gonna
current
let's
tomorrow
omg
hate
hope
fuck
oops
night
wear
wanna
fun
finally
whoops
nevermind
definitely
context
screen
free
exactly
big
house
half
working
play
heard
hmmm
damn
woah
tho
set
idk
sort
understand
kinda
seriously
btw
she's
hah
aww
ffs
it'd
that'd
hopefully
non
entirely
lots
entire
tend
hullo
clearly
surely
weird
start
help
nope

lib/twitter_ebooks.rb (new file, 20 additions)
@@ -0,0 +1,20 @@
gem 'minitest'

def log(*args)
  STDERR.puts args.map(&:to_s).join(' ')
  STDERR.flush
end

module Ebooks
  GEM_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
  DATA_PATH = File.join(GEM_PATH, 'data')
  SKELETON_PATH = File.join(GEM_PATH, 'skeleton')
  TEST_PATH = File.join(GEM_PATH, 'test')
  TEST_CORPUS_PATH = File.join(TEST_PATH, 'corpus/0xabad1dea.tweets')
end

require 'twitter_ebooks/nlp'
require 'twitter_ebooks/archiver'
require 'twitter_ebooks/markov'
require 'twitter_ebooks/model'
require 'twitter_ebooks/bot'

lib/twitter_ebooks/archiver.rb (new file, 82 additions)
@@ -0,0 +1,82 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter'

module Ebooks
  class Archiver
    def initialize(username, outpath)
      @username = username
      @outpath = outpath
      @client = Twitter::Client.new
    end

    # Read existing corpus into memory.
    # Return list of tweet lines and the last tweet id.
    def read_corpus
      lines = []
      since_id = nil

      if File.exists?(@outpath)
        lines = File.read(@outpath).split("\n")
        if lines[0].start_with?('#')
          since_id = lines[0].split('# ').last
        end
      end

      [lines, since_id]
    end

    # Retrieve all available tweets for a given user since the last tweet id
    def tweets_since(since_id)
      tweets = []
      max_id = nil

      opts = {
        count: 200,
        include_rts: false,
        trim_user: true
      }

      opts[:since_id] = since_id unless since_id.nil?

      loop do
        opts[:max_id] = max_id unless max_id.nil?
        new = @client.user_timeline(@username, opts)
        break if new.length <= 1
        puts "Received #{new.length} tweets"
        tweets += new
        max_id = new.last.id - 1 # max_id is inclusive; step past the oldest tweet
      end

      tweets
    end

    def fetch_tweets
      lines, since_id = read_corpus

      if since_id.nil?
        puts "Retrieving tweets from @#{@username}"
      else
        puts "Retrieving tweets from @#{@username} since #{since_id}"
      end

      tweets = tweets_since(since_id)

      if tweets.length == 0
        puts "No new tweets"
        return
      end

      new_lines = tweets.map { |tweet| tweet.text.gsub("\n", " ") }
      new_since_id = tweets[0].id.to_s
      lines = ["# " + new_since_id] + new_lines + lines
      corpus = File.open(@outpath, 'w')
      corpus.write(lines.join("\n"))
      corpus.close
    end
  end
end

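For reference, the corpus format `read_corpus` expects is plain text with one tweet per line, optionally preceded by a first line of the form `# <since_id>` for incremental fetches. A minimal driving sketch (username and path are hypothetical, and it assumes Twitter credentials have already been set globally via `Twitter.configure`):

```ruby
require 'twitter_ebooks'

# Prepends new tweets above the old ones and records the newest id
# on the first line as "# <since_id>" so the next run fetches incrementally.
Ebooks::Archiver.new('0xabad1dea', 'corpus/0xabad1dea.tweets').fetch_tweets
```
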
lib/twitter_ebooks/bot.rb (new file, 164 additions)
@@ -0,0 +1,164 @@
#!/usr/bin/env ruby
require 'twitter'
require 'tweetstream'
require 'rufus/scheduler'

module Ebooks
  class Bot
    attr_accessor :consumer_key, :consumer_secret,
                  :oauth_token, :oauth_token_secret

    attr_accessor :username

    attr_reader :twitter, :stream

    @@all = [] # List of all defined bots
    def self.all; @@all; end

    def self.get(name)
      all.find { |bot| bot.username == name }
    end

    def initialize(username, &b)
      # Set defaults
      @username = username

      # Override with callback
      b.call(self)

      Bot.all.push(self)
    end

    def log(*args)
      STDERR.puts "@#{@username}: " + args.map(&:to_s).join(' ')
      STDERR.flush
    end

    def configure
      TweetStream.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end

      Twitter.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end

      @twitter = Twitter::Client.new
      @stream = TweetStream::Client.new
    end

    # Connects to tweetstream and opens event handlers for this bot
    def start
      configure

      @on_startup.call if @on_startup

      @stream.on_error do |msg|
        log "ERROR: #{msg}"
      end

      @stream.on_inited do
        log "Online!"
      end

      @stream.on_event(:follow) do |event|
        next if event[:source][:screen_name] == @username
        log "Followed by #{event[:source][:screen_name]}"
        @on_follow.call(event[:source])
      end

      @stream.on_direct_message do |dm|
        next if dm[:sender][:screen_name] == @username # Don't reply to self
        log "DM from @#{dm[:sender][:screen_name]}: #{dm[:text]}"
        @on_message.call(dm)
      end

      @stream.userstream do |ev|
        next unless ev[:text] # If it's not a text-containing tweet, ignore it
        next if ev[:user][:screen_name] == @username # Ignore our own tweets

        meta = {}
        mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }

        reply_mentions = mentions.reject { |m| m.downcase == @username.downcase }
        reply_mentions = [ev[:user][:screen_name]] + reply_mentions

        meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
        meta[:limit] = 140 - meta[:reply_prefix].length

        mless = ev[:text]
        begin
          ev.attrs[:entities][:user_mentions].reverse.each do |entity|
            mless = mless[0...entity[:indices][0]] + mless[entity[:indices][1]..-1]
          end
        rescue Exception
          p ev.attrs[:entities][:user_mentions]
          p ev[:text]
          raise
        end
        meta[:mentionless] = mless

        # To check if this is a mention, ensure:
        # - The tweet mentions list contains our username
        # - The tweet is not being retweeted by somebody else
        # - Or soft-retweeted by somebody else
        if mentions.map(&:downcase).include?(@username.downcase) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
          log "Mention from @#{ev[:user][:screen_name]}: #{ev[:text]}"
          @on_mention.call(ev, meta)
        else
          @on_timeline.call(ev, meta)
        end
      end
    end

    # Wrapper for EM.add_timer
    # Delays add a greater sense of humanity to bot behaviour
    def delay(time, &b)
      time = time.to_a.sample unless time.is_a? Integer
      EM.add_timer(time, &b)
    end

    # Reply to a tweet or a DM
    def reply(ev, text, opts={})
      opts = opts.clone

      if ev.is_a? Twitter::DirectMessage
        log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
        @twitter.direct_message_create(ev[:sender][:screen_name], text, opts)
      elsif ev.is_a? Twitter::Tweet
        log "Replying to @#{ev[:user][:screen_name]} with: #{text}"
        @twitter.update(text, in_reply_to_status_id: ev[:id])
      else
        raise ArgumentError, "Don't know how to reply to a #{ev.class}"
      end
    end

    def scheduler
      @scheduler ||= Rufus::Scheduler.new
    end

    def follow(*args)
      log "Following #{args}"
      @twitter.follow(*args)
    end

    def tweet(*args)
      log "Tweeting #{args.inspect}"
      @twitter.update(*args)
    end

    def on_startup(&b);  @on_startup  = b; end
    def on_follow(&b);   @on_follow   = b; end
    def on_mention(&b);  @on_mention  = b; end
    def on_timeline(&b); @on_timeline = b; end
    def on_message(&b);  @on_message  = b; end
  end
end

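`delay` accepts either an Integer or anything with `to_a` (such as a Range) to sample from. A minimal sketch of using it inside a handler, assuming a bot wired up as in skeleton/bots.rb and running under the EM reactor started by run.rb (the bot name here is hypothetical):

```ruby
Ebooks::Bot.new("example_bot") do |bot|
  bot.on_mention do |tweet, meta|
    # Wait a random 2-10 seconds before replying, for a more human cadence
    bot.delay(2..10) do
      bot.reply(tweet, meta[:reply_prefix] + "oh hello")
    end
  end
end
```
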
lib/twitter_ebooks/markov.rb (new file, 81 additions)
@@ -0,0 +1,81 @@
module Ebooks
  # The special INTERIM token represents sentence boundaries.
  # This is so we can include the start and end of statements in the model.
  # Due to the way the sentence tokenizer works, it can correspond
  # to multiple actual parts of text (such as ^, $, \n and .?!)
  INTERIM = :interim

  # This is an ngram-based Markov model optimized to build from a
  # tokenized sentence list without requiring too much transformation
  class MarkovModel
    def self.build(sentences)
      MarkovModel.new.consume(sentences)
    end

    def consume(sentences)
      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
      # We map by both bigrams and unigrams so we can fall back to the latter in
      # cases where an input bigram is unavailable, such as starting a sentence
      @sentences = sentences
      @unigrams = {}
      @bigrams = {}

      sentences.each_with_index do |tokens, i|
        last_token = INTERIM
        tokens.each_with_index do |token, j|
          @unigrams[last_token] ||= []
          @unigrams[last_token] << [i, j]

          @bigrams[last_token] ||= {}
          @bigrams[last_token][token] ||= []

          if j == tokens.length-1 # Mark sentence endings
            @unigrams[token] ||= []
            @unigrams[token] << INTERIM
            @bigrams[last_token][token] << INTERIM
          else
            @bigrams[last_token][token] << [i, j+1]
          end

          last_token = token
        end
      end

      self
    end

    def find_token(index)
      if index == INTERIM
        INTERIM
      else
        @sentences[index[0]][index[1]]
      end
    end

    def chain(tokens)
      if tokens.length == 1
        matches = @unigrams[tokens[0]]
      else
        matches = (@bigrams[tokens[-2]] || {})[tokens[-1]]
      end

      if matches.nil? || matches.empty?
        # This should never happen unless a strange token is
        # supplied from outside the dataset
        raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
      end

      next_token = find_token(matches.sample)

      if next_token == INTERIM # We chose to end the sentence
        return tokens
      else
        return chain(tokens + [next_token])
      end
    end

    def generate
      NLP.reconstruct(chain([INTERIM]))
    end
  end
end

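A minimal sketch of the model above on a toy corpus. Input is a list of token arrays, as `NLP.tokenize` would produce; the generated output shown is just one possibility, since `chain` samples:

```ruby
require 'twitter_ebooks'

sentences = [
  %w[the cat sat on the mat],
  %w[the dog sat on the rug]
]

model = Ebooks::MarkovModel.build(sentences)
# chain starts from INTERIM, walks bigram continuations (falling back to
# unigrams for the first step), and stops when it samples a sentence end.
puts model.generate # e.g. "the cat sat on the rug"
```
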
lib/twitter_ebooks/model.rb (new file, 120 additions)
@@ -0,0 +1,120 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'json'
require 'set'
require 'digest/md5'

module Ebooks
  class Model
    attr_accessor :hash, :sentences, :markov, :keywords

    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end

    def self.load(path)
      Marshal.load(File.read(path))
    end

    def consume(txtpath)
      text = File.read(txtpath)
      # Record hash of source file so we know to update later
      @hash = Digest::MD5.hexdigest(text)

      log "Removing commented lines and mention tokens"

      lines = text.split("\n")
      keeping = []
      lines.each do |l|
        next if l.start_with?('#') || l.include?('RT')
        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
        keeping << processed.join(' ')
      end
      text = NLP.normalize(keeping.join("\n"))

      log "Segmenting text into sentences"

      sentences = NLP.sentences(text)

      log "Tokenizing #{sentences.length} sentences"
      @sentences = sentences.map { |sent| NLP.tokenize(sent) }

      log "Ranking keywords"
      @keywords = NLP.keywords(@sentences)

      self
    end

    def save(path)
      File.open(path, 'w') do |f|
        f.write(Marshal.dump(self))
      end
      self
    end

    def fix(tweet)
      # This seems to require an external api call
      #begin
      #  fixer = NLP.gingerice.parse(tweet)
      #  log fixer if fixer['corrections']
      #  tweet = fixer['result']
      #rescue Exception => e
      #  log e.message
      #  log e.backtrace
      #end

      NLP.htmlentities.decode tweet
    end

    def markov_statement(limit=140, markov=nil)
      markov ||= MarkovModel.build(@sentences)
      tweet = ""

      while (tweet = markov.generate) do
        next if tweet.length > limit
        next if NLP.unmatched_enclosers?(tweet)
        break if tweet.length > limit*0.4 || rand > 0.8
      end

      fix tweet
    end

    # Finds all relevant tokenized sentences to given input by
    # comparing non-stopword token overlaps
    def relevant_sentences(input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input)

      @sentences.each do |sent|
        tokenized.each do |token|
          if sent.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller markov model from these
    def markov_response(input, limit=140)
      relevant, slightly_relevant = relevant_sentences(input)

      if relevant.length >= 3
        markov = MarkovModel.new.consume(relevant)
        markov_statement(limit, markov)
      elsif slightly_relevant.length > 5
        markov = MarkovModel.new.consume(slightly_relevant)
        markov_statement(limit, markov)
      else
        markov_statement(limit)
      end
    end
  end
end

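End to end, this mirrors what `ebooks consume` and `ebooks gen` do under the hood; a minimal sketch with hypothetical paths:

```ruby
require 'twitter_ebooks'

model = Ebooks::Model.consume('corpus/example.tweets') # filter, segment, tokenize, rank keywords
model.save('model/example.model')                      # Marshal dump, as in bin/ebooks

model = Ebooks::Model.load('model/example.model')
puts model.markov_statement(140)               # free-running statement
puts model.markov_response("hello there", 135) # reply built from related sentences
```
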
lib/twitter_ebooks/nlp.rb (new file, 154 additions)
@@ -0,0 +1,154 @@
# encoding: utf-8
require 'fast-stemmer'
require 'highscore'

module Ebooks
  module NLP
    # We deliberately limit our punctuation handling to stuff we can do consistently
    # It'll just be a part of another token if we don't split it out, and that's fine
    PUNCTUATION = ".?!,"

    # Lazy-load NLP libraries and resources
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time

    def self.stopwords
      @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
    end

    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    # POS tagger
    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

    # Gingerice text correction service
    def self.gingerice
      require 'gingerice'
      Gingerice::Parser.new # No caching for this one
    end

    # For decoding html entities
    def self.htmlentities
      require 'htmlentities'
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions

    # We don't really want to deal with all this weird unicode punctuation
    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end

    # Split text into sentences
    # We use an ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end

    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
      sentence.split(regex)
    end

    def self.stem(word)
      Stemmer::stem_word(word.downcase)
    end

    def self.keywords(sentences)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
      text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')

      text = Highscore::Content.new(text)

      text.configure do
        #set :multiplier, 2
        #set :upper_case, 3
        #set :long_words, 2
        #set :long_words_threshold, 15
        #set :vowels, 1 # => default: 0 = not considered
        #set :consonants, 5 # => default: 0 = not considered
        #set :ignore_case, true # => default: false
        set :word_pattern, /(?<!@)(?<=\s)[\w']+/ # => default: /\w+/
        #set :stemming, true # => default: false
      end

      text.keywords
    end

    # Takes a list of tokens and builds a nice-looking sentence
    def self.reconstruct(tokens)
      text = ""
      last_token = nil
      tokens.each do |token|
        next if token == INTERIM
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

    # Determine if we need to insert a space between two tokens
    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)
      if p1 && p2 # "foo?!"
        false
      elsif !p1 && p2 # "foo."
        false
      elsif p1 && !p2 # "foo. rah"
        true
      else # "foo rah"
        true
      end
    end

    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
    end

    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the markov generator; we can just tell it to retry
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0

        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)

          return true if opened < 0 # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end

      false
    end
  end
end

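A round-trip sketch of the tokenizer and reconstructor above (output verified against the regexes as written):

```ruby
require 'twitter_ebooks'

tokens = Ebooks::NLP.tokenize("Hi there, bot. How are you?")
# => ["Hi", "there", ",", "bot", ".", "How", "are", "you", "?"]

Ebooks::NLP.reconstruct(tokens)
# => "Hi there, bot. How are you?" (spaces re-inserted via space_between?)
```
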
lib/twitter_ebooks/version.rb (new file, 3 additions)
@@ -0,0 +1,3 @@
module Ebooks
  VERSION = "2.0.7"
end

script/process_anc_data.rb (new executable file, 19 additions)
@@ -0,0 +1,19 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'json'

freqmap = {}

data = File.read("data/ANC-all-count.txt")
data = data.unpack("C*").pack("U*")

data.lines.each do |l|
  vals = l.split("\t")

  freqmap[vals[0]] = vals[-1].to_i
end

File.open("data/wordfreq.json", 'w') do |f|
  f.write(JSON.dump(freqmap))
end

skeleton/.gitignore (vendored, new file, 1 addition)
@@ -0,0 +1 @@
corpus/*

skeleton/Procfile (new file, 1 addition)
@@ -0,0 +1 @@
worker: ruby run.rb start

skeleton/bots.rb (new file, 41 additions)
@@ -0,0 +1,41 @@
#!/usr/bin/env ruby

require 'twitter_ebooks'

# This is an example bot definition with event handlers commented out
# You can define as many of these as you like; they will run simultaneously

Ebooks::Bot.new("{{BOT_NAME}}") do |bot|
  # Consumer details come from registering an app at https://dev.twitter.com/
  # OAuth details can be fetched with https://github.com/marcel/twurl
  bot.consumer_key = "" # Your app consumer key
  bot.consumer_secret = "" # Your app consumer secret
  bot.oauth_token = "" # Token connecting the app to this account
  bot.oauth_token_secret = "" # Secret connecting the app to this account

  bot.on_message do |dm|
    # Reply to a DM
    # bot.reply(dm, "secret secrets")
  end

  bot.on_follow do |user|
    # Follow a user back
    # bot.follow(user[:screen_name])
  end

  bot.on_mention do |tweet, meta|
    # Reply to a mention
    # bot.reply(tweet, meta[:reply_prefix] + "oh hullo")
  end

  bot.on_timeline do |tweet, meta|
    # Reply to a tweet in the bot's timeline
    # bot.reply(tweet, meta[:reply_prefix] + "nice tweet")
  end

  bot.scheduler.every '24h' do
    # Tweet something every 24 hours
    # See https://github.com/jmettraux/rufus-scheduler
    # bot.tweet("hi")
  end
end

skeleton/run.rb (new executable file, 9 additions)
@@ -0,0 +1,9 @@
#!/usr/bin/env ruby

require_relative 'bots'

EM.run do
  Ebooks::Bot.all.each do |bot|
    bot.start
  end
end

test/corpus/0xabad1dea.tweets (new file, 14696 additions)
(file diff suppressed because it is too large)

test/keywords.rb (new executable file, 18 additions)
@@ -0,0 +1,18 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter_ebooks'
require 'minitest/autorun'
require 'benchmark'

module Ebooks
  class TestKeywords < Minitest::Test
    sentences = NLP.sentences(NLP.normalize(File.read(ARGV[0]))).map { |sent| NLP.tokenize(sent) } # NLP.keywords expects tokenized sentences
    puts "Finding and ranking keywords"
    puts Benchmark.measure {
      NLP.keywords(sentences).top(50).each do |keyword|
        puts "#{keyword.text} #{keyword.weight}"
      end
    }
  end
end

test/tokenize.rb (new executable file, 18 additions)
@@ -0,0 +1,18 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter_ebooks'
require 'minitest/autorun'

module Ebooks
  class TestTokenize < Minitest::Test
    corpus = NLP.normalize(File.read(TEST_CORPUS_PATH))
    sents = NLP.sentences(corpus).sample(10)

    sents.each do |sent|
      p sent
      p NLP.tokenize(sent)
      puts
    end
  end
end

twitter_ebooks.gemspec (new file, 28 additions)
@@ -0,0 +1,28 @@
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/twitter_ebooks/version', __FILE__)

Gem::Specification.new do |gem|
  gem.authors       = ["Jaiden Mispy"]
  gem.email         = ["^_^@mispy.me"]
  gem.description   = %q{Markov chains for all your friends~}
  gem.summary       = %q{Markov chains for all your friends~}
  gem.homepage      = ""

  gem.files         = `git ls-files`.split($\)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "twitter_ebooks"
  gem.require_paths = ["lib"]
  gem.version       = Ebooks::VERSION

  gem.add_runtime_dependency 'minitest'

  gem.add_runtime_dependency 'twitter'
  gem.add_runtime_dependency 'tweetstream'
  gem.add_runtime_dependency 'rufus-scheduler'
  gem.add_runtime_dependency 'gingerice'
  gem.add_runtime_dependency 'htmlentities'
  gem.add_runtime_dependency 'engtagger'
  gem.add_runtime_dependency 'fast-stemmer'
  gem.add_runtime_dependency 'highscore'
end