Github time!

Mispy 2013-11-08 06:02:05 +11:00
commit e87dc5862b
27 changed files with 20178 additions and 0 deletions

.gitignore vendored Normal file (+2)

@@ -0,0 +1,2 @@
.*.swp
pkg

Gemfile Normal file (+4)

@@ -0,0 +1,4 @@
source 'https://rubygems.org'

# Specify your gem's dependencies in twitter_ebooks.gemspec
gemspec

Gemfile.lock Normal file (+78)

@@ -0,0 +1,78 @@
PATH
  remote: .
  specs:
    twitter_ebooks (2.0.3)
      bloomfilter-rb
      engtagger
      fast-stemmer
      gingerice
      highscore
      htmlentities
      minitest
      rufus-scheduler
      tweetstream
      twitter

GEM
  remote: https://rubygems.org/
  specs:
    addressable (2.3.5)
    atomic (1.1.14)
    awesome_print (1.2.0)
    bloomfilter-rb (2.1.1)
      redis
    cookiejar (0.3.0)
    daemons (1.1.9)
    em-http-request (1.0.3)
      addressable (>= 2.2.3)
      cookiejar
      em-socksify
      eventmachine (>= 1.0.0.beta.4)
      http_parser.rb (>= 0.5.3)
    em-socksify (0.3.0)
      eventmachine (>= 1.0.0.beta.4)
    em-twitter (0.2.2)
      eventmachine (~> 1.0)
      http_parser.rb (~> 0.5)
      simple_oauth (~> 0.1)
    engtagger (0.1.2)
    eventmachine (1.0.3)
    faraday (0.8.8)
      multipart-post (~> 1.2.0)
    fast-stemmer (1.0.2)
    gingerice (1.2.1)
      addressable
      awesome_print
    highscore (1.1.0)
      whatlanguage (>= 1.0.0)
    htmlentities (4.3.1)
    http_parser.rb (0.5.3)
    minitest (5.0.8)
    multi_json (1.8.2)
    multipart-post (1.2.0)
    redis (3.0.5)
    rufus-scheduler (3.0.2)
      tzinfo
    simple_oauth (0.2.0)
    thread_safe (0.1.3)
      atomic
    tweetstream (2.5.0)
      daemons (~> 1.1)
      em-http-request (~> 1.0.2)
      em-twitter (~> 0.2)
      twitter (~> 4.5)
      yajl-ruby (~> 1.1)
    twitter (4.8.1)
      faraday (~> 0.8, < 0.10)
      multi_json (~> 1.0)
      simple_oauth (~> 0.2)
    tzinfo (1.1.0)
      thread_safe (~> 0.1)
    whatlanguage (1.0.5)
    yajl-ruby (1.1.0)

PLATFORMS
  ruby

DEPENDENCIES
  twitter_ebooks!

LICENSE Normal file (+22)

@@ -0,0 +1,22 @@
Copyright (c) 2013 Jaiden Mispy

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

NOTES.md Normal file (+4)

@@ -0,0 +1,4 @@
- Files in text/ are preprocessed by `rake consume` and serialized
  - e.g. text/foo.tweets becomes consumed/foo.corpus
- `rake consume` looks at hashes to know which it needs to update
- Preprocessed corpus files are loaded at runtime by Corpus.load('foo')

README.md Normal file (+9)

@@ -0,0 +1,9 @@
# twitter\_ebooks 2.0.7
Complete rewrite of twitter\_ebooks. Supports context-sensitive, responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.
## Installation
```bash
gem install twitter_ebooks
```
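
## Usage

A typical workflow with the bundled `ebooks` CLI (a sketch; the bot name, @user and file names below are placeholders):

```bash
ebooks new mybot                        # scaffold a new bot app from the skeleton
cd mybot
ebooks archive @someuser corpus.tweets  # fetch a user's timeline into a corpus file
ebooks consume corpus.tweets            # preprocess it into model/corpus.model
ebooks gen model/corpus.model           # generate a test statement locally
ebooks tweet model/corpus.model mybot   # post one statement (needs credentials in bots.rb)
```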

Rakefile Normal file (+2)

@@ -0,0 +1,2 @@
#!/usr/bin/env rake
require "bundler/gem_tasks"

bin/ebooks Executable file (+100)

@@ -0,0 +1,100 @@
#!/usr/bin/env ruby
require 'fileutils'
require 'twitter_ebooks'

module Ebooks
  APP_PATH = Dir.pwd # XXX do some recursive thing instead

  def self.new(reponame)
    usage = "Usage: ebooks new <reponame>"

    if reponame.nil?
      log usage
      exit
    end

    target = "./#{reponame}"

    if File.exists?(target)
      log "#{target} already exists. Please remove if you want to recreate."
      exit
    end

    FileUtils.cp_r(SKELETON_PATH, target)

    File.open(File.join(target, 'bots.rb'), 'w') do |f|
      template = File.read(File.join(SKELETON_PATH, 'bots.rb'))
      f.write(template.gsub("{{BOT_NAME}}", reponame))
    end

    log "New twitter_ebooks app created at #{target}"
  end

  def self.consume(paths)
    paths.each do |path|
      filename = File.basename(path)
      shortname = filename.split('.')[0..-2].join('.')
      hash = Digest::MD5.hexdigest(File.read(path))

      log "Consuming text corpus: #{filename}"
      outpath = File.join(APP_PATH, 'model', "#{shortname}.model")
      Model.consume(path).save(outpath)
      log "Corpus consumed"
    end
  end

  def self.gen(model_path, input)
    model = Model.load(model_path)
    if input && !input.empty?
      puts "@cmd " + model.markov_response(input, 135)
    else
      puts model.markov_statement
    end
  end

  def self.score(model_path, input)
    model = Model.load(model_path)
    model.score_interest(input)
  end

  def self.archive(username, outpath)
    Archiver.new(username, outpath).fetch_tweets
  end

  def self.tweet(modelpath, username)
    load File.join(APP_PATH, 'bots.rb')
    model = Model.load(modelpath)
    statement = model.markov_statement
    log "@#{username}: #{statement}"
    bot = Bot.get(username)
    bot.configure
    bot.tweet(statement)
  end

  def self.command(args)
    usage = """Usage:
     ebooks new <reponame>
     ebooks consume <corpus_path> [...]
     ebooks gen <model_path> [input]
     ebooks score <model_path> <input>
     ebooks archive <@user> <outpath>
     ebooks tweet <model_path> <@bot>
"""

    if args.length == 0
      log usage
      exit
    end

    case args[0]
    when "new" then new(args[1])
    when "consume" then consume(args[1..-1])
    when "gen" then gen(args[1], args[2..-1].join(' '))
    when "score" then score(args[1], args[2..-1].join(' '))
    when "archive" then archive(args[1], args[2])
    when "tweet" then tweet(args[1], args[2])
    end
  end
end

Ebooks.command(ARGV)

data/adjectives.txt Normal file (+1466)

File diff suppressed because it is too large

data/nouns.txt Normal file (+2193)

File diff suppressed because it is too large

data/stopwords.txt Normal file (+843)

@@ -0,0 +1,843 @@
a
able
about
above
abst
accordance
according
accordingly
across
act
actually
added
adj
affected
affecting
affects
after
afterwards
again
against
ah
all
almost
alone
along
already
also
although
always
am
among
amongst
an
and
announce
another
any
anybody
anyhow
anymore
anyone
anything
anyway
anyways
anywhere
apparently
approximately
are
aren
arent
arise
around
as
aside
ask
asking
at
auth
available
away
awfully
b
back
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
beginnings
begins
behind
being
believe
below
beside
besides
between
beyond
biol
both
brief
briefly
but
by
c
ca
came
can
cannot
can't
cause
causes
certain
certainly
co
com
come
comes
contain
containing
contains
could
couldnt
d
date
did
didn't
different
do
does
doesn't
doing
done
don't
down
downwards
due
during
e
each
ed
edu
effect
eg
eight
eighty
either
else
elsewhere
end
ending
enough
especially
et
et-al
etc
even
ever
every
everybody
everyone
everything
everywhere
ex
except
f
far
few
ff
fifth
first
five
fix
followed
following
follows
for
former
formerly
forth
found
four
from
further
furthermore
g
gave
get
gets
getting
give
given
gives
giving
go
goes
gone
got
gotten
h
had
happens
hardly
has
hasn't
have
haven't
having
he
hed
hence
her
here
hereafter
hereby
herein
heres
hereupon
hers
herself
hes
hi
hid
him
himself
his
hither
home
how
howbeit
however
hundred
i
id
ie
if
i'll
im
immediate
immediately
importance
important
in
inc
indeed
index
information
instead
into
invention
inward
is
isn't
it
itd
it'll
its
itself
i've
j
just
k
keep
keeps
kept
kg
km
know
known
knows
l
largely
last
lately
later
latter
latterly
least
less
lest
let
lets
like
liked
likely
line
little
'll
look
looking
looks
ltd
m
made
mainly
make
makes
many
may
maybe
me
mean
means
meantime
meanwhile
merely
mg
might
million
miss
ml
more
moreover
most
mostly
mr
mrs
much
mug
must
my
myself
n
na
name
namely
nay
nd
near
nearly
necessarily
necessary
need
needs
neither
never
nevertheless
new
next
nine
ninety
no
nobody
non
none
nonetheless
noone
nor
normally
nos
not
noted
nothing
now
nowhere
o
obtain
obtained
obviously
of
off
often
oh
ok
okay
old
omitted
on
once
one
ones
only
onto
or
ord
other
others
otherwise
ought
our
ours
ourselves
out
outside
over
overall
owing
own
p
page
pages
part
particular
particularly
past
per
perhaps
placed
please
plus
poorly
possible
possibly
potentially
pp
predominantly
present
previously
primarily
probably
promptly
proud
provides
put
q
que
quickly
quite
qv
r
ran
rather
rd
re
readily
really
recent
recently
ref
refs
regarding
regardless
regards
related
relatively
research
respectively
resulted
resulting
results
right
run
s
said
same
saw
say
saying
says
sec
section
see
seeing
seem
seemed
seeming
seems
seen
self
selves
sent
seven
several
shall
she
shed
she'll
shes
should
shouldn't
show
showed
shown
showns
shows
significant
significantly
similar
similarly
since
six
slightly
so
some
somebody
somehow
someone
somethan
something
sometime
sometimes
somewhat
somewhere
soon
sorry
specifically
specified
specify
specifying
still
stop
strongly
sub
substantially
successfully
such
sufficiently
suggest
sup
sure
t
take
taken
taking
tell
tends
th
than
thank
thanks
thanx
that
that'll
thats
that've
the
their
theirs
them
themselves
then
thence
there
thereafter
thereby
thered
therefore
therein
there'll
thereof
therere
theres
thereto
thereupon
there've
these
they
theyd
they'll
theyre
they've
think
this
those
thou
though
thoughh
thousand
throug
through
throughout
thru
thus
til
tip
to
together
too
took
toward
towards
tried
tries
truly
try
trying
ts
twice
two
u
un
under
unfortunately
unless
unlike
unlikely
until
unto
up
upon
ups
us
use
used
useful
usefully
usefulness
uses
using
usually
v
value
various
've
very
via
viz
vol
vols
vs
w
want
wants
was
wasn't
way
we
wed
welcome
we'll
went
were
weren't
we've
what
whatever
what'll
whats
when
whence
whenever
where
whereafter
whereas
whereby
wherein
wheres
whereupon
wherever
whether
which
while
whim
whither
who
whod
whoever
whole
who'll
whom
whomever
whos
whose
why
widely
willing
wish
with
within
without
won't
words
world
would
wouldn't
www
x
y
yes
yet
you
youd
you'll
your
youre
yours
yourself
yourselves
you've
z
zero
.
?
!
http
don
people
well
will
https
time
good
thing
twitter
pretty
it's
i'm
that's
you're
they're
there's
things
yeah
find
going
work
point
years
guess
bad
problem
real
kind
day
better
lot
stuff
i'd
read
thought
idea
case
word
hey
person
long
Dear
internet
tweet
he's
feel
wrong
call
hard
phone
ago
literally
remember
reason
called
course
bit
question
high
today
told
man
actual
year
three
book
assume
life
true
best
wow
video
times
works
fact
completely
totally
imo
open
lol
haha
cool
yep
ooh
great
ugh
tonight
talk
sounds
hahaha
whoa
cool
we're
guys
sweet
fortunately
hmm
aren't
sadly
talking
you'd
place
yup
what's
y'know
basically
god
shit
holy
interesting
news
guy
wait
oooh
gonna
current
let's
tomorrow
omg
hate
hope
fuck
oops
night
wear
wanna
fun
finally
whoops
nevermind
definitely
context
screen
free
exactly
big
house
half
working
play
heard
hmmm
damn
woah
tho
set
idk
sort
understand
kinda
seriously
btw
she's
hah
aww
ffs
it'd
that'd
hopefully
non
entirely
lots
entire
tend
hullo
clearly
surely
weird
start
help
nope

lib/twitter_ebooks.rb Normal file (+20)

@@ -0,0 +1,20 @@
gem 'minitest'

def log(*args)
  STDERR.puts args.map(&:to_s).join(' ')
  STDERR.flush
end

module Ebooks
  GEM_PATH = File.expand_path(File.join(File.dirname(__FILE__), '..'))
  DATA_PATH = File.join(GEM_PATH, 'data')
  SKELETON_PATH = File.join(GEM_PATH, 'skeleton')
  TEST_PATH = File.join(GEM_PATH, 'test')
  TEST_CORPUS_PATH = File.join(TEST_PATH, 'corpus/0xabad1dea.tweets')
end

require 'twitter_ebooks/nlp'
require 'twitter_ebooks/archiver'
require 'twitter_ebooks/markov'
require 'twitter_ebooks/model'
require 'twitter_ebooks/bot'

lib/twitter_ebooks/archiver.rb Normal file (+82)

@@ -0,0 +1,82 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter'

module Ebooks
  class Archiver
    def initialize(username, outpath)
      @username = username
      @outpath = outpath
      @client = Twitter::Client.new
    end

    # Read existing corpus into memory.
    # Returns the list of tweet lines and the last tweet id.
    def read_corpus
      lines = []
      since_id = nil
      if File.exists?(@outpath)
        lines = File.read(@outpath).split("\n")
        if lines[0].start_with?('#')
          since_id = lines[0].split('# ').last
        end
      end
      [lines, since_id]
    end

    # Retrieve all available tweets for the user since the last tweet id
    def tweets_since(since_id)
      page = 1
      retries = 0
      tweets = []
      max_id = nil

      opts = {
        count: 200,
        include_rts: false,
        trim_user: true
      }
      opts[:since_id] = since_id unless since_id.nil?

      loop do
        opts[:max_id] = max_id unless max_id.nil?
        new = @client.user_timeline(@username, opts)
        break if new.length <= 1
        puts "Received #{new.length} tweets"
        tweets += new
        max_id = new.last.id
      end

      tweets
    end

    def fetch_tweets
      lines, since_id = read_corpus

      if since_id.nil?
        puts "Retrieving tweets from @#{@username}"
      else
        puts "Retrieving tweets from @#{@username} since #{since_id}"
      end

      tweets = tweets_since(since_id)

      if tweets.length == 0
        puts "No new tweets"
        return
      end

      new_lines = tweets.map { |tweet| tweet.text.gsub("\n", " ") }
      new_since_id = tweets[0].id.to_s
      lines = ["# " + new_since_id] + new_lines + lines

      corpus = File.open(@outpath, 'w')
      corpus.write(lines.join("\n"))
      corpus.close
    end
  end
end
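
The corpus file the Archiver maintains is plain text: one tweet per line, newest first, with the most recent tweet id kept on a leading comment line so the next run can fetch incrementally. An illustrative file (id and text made up):

```
# 398874915884167168
some recent tweet, newlines collapsed to spaces
an older tweet
```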

lib/twitter_ebooks/bot.rb Normal file (+164)

@@ -0,0 +1,164 @@
#!/usr/bin/env ruby

require 'twitter'
require 'tweetstream'
require 'rufus/scheduler'

module Ebooks
  class Bot
    attr_accessor :consumer_key, :consumer_secret,
                  :oauth_token, :oauth_token_secret
    attr_accessor :username

    attr_reader :twitter, :stream

    @@all = [] # List of all defined bots
    def self.all; @@all; end

    def self.get(name)
      all.find { |bot| bot.username == name }
    end

    def initialize(username, &b)
      # Set defaults
      @username = username

      # Override with callback
      b.call(self)

      Bot.all.push(self)
    end

    def log(*args)
      STDERR.puts "@#{@username}: " + args.map(&:to_s).join(' ')
      STDERR.flush
    end

    def configure
      TweetStream.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end

      Twitter.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end

      @twitter = Twitter::Client.new
      @stream = TweetStream::Client.new
    end

    # Connects to tweetstream and opens event handlers for this bot
    def start
      configure

      @on_startup.call if @on_startup

      @stream.on_error do |msg|
        log "ERROR: #{msg}"
      end

      @stream.on_inited do
        log "Online!"
      end

      @stream.on_event(:follow) do |event|
        next if event[:source][:screen_name] == @username
        log "Followed by #{event[:source][:screen_name]}"
        @on_follow.call(event[:source])
      end

      @stream.on_direct_message do |dm|
        next if dm[:sender][:screen_name] == @username # Don't reply to self
        log "DM from @#{dm[:sender][:screen_name]}: #{dm[:text]}"
        @on_message.call(dm)
      end

      @stream.userstream do |ev|
        next unless ev[:text] # If it's not a text-containing tweet, ignore it
        next if ev[:user][:screen_name] == @username # Ignore our own tweets

        meta = {}
        mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }

        reply_mentions = mentions.reject { |m| m.downcase == @username.downcase }
        reply_mentions = [ev[:user][:screen_name]] + reply_mentions

        meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
        meta[:limit] = 140 - meta[:reply_prefix].length

        mless = ev[:text]
        begin
          ev.attrs[:entities][:user_mentions].reverse.each do |entity|
            mless = mless[0...entity[:indices][0]] + mless[entity[:indices][1]..-1]
          end
        rescue Exception
          p ev.attrs[:entities][:user_mentions]
          p ev[:text]
          raise
        end
        meta[:mentionless] = mless

        # To check if this is a mention, ensure:
        # - The tweet mentions list contains our username
        # - The tweet is not being retweeted by somebody else
        # - Or soft-retweeted by somebody else
        if mentions.map(&:downcase).include?(@username.downcase) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
          log "Mention from @#{ev[:user][:screen_name]}: #{ev[:text]}"
          @on_mention.call(ev, meta)
        else
          @on_timeline.call(ev, meta)
        end
      end
    end

    # Wrapper for EM.add_timer
    # Delays add a greater sense of humanity to bot behaviour
    def delay(time, &b)
      time = time.to_a.sample unless time.is_a? Integer
      EM.add_timer(time, &b)
    end

    # Reply to a tweet or a DM.
    # Applies the configurable @reply_delay range
    def reply(ev, text, opts={})
      opts = opts.clone
      delay = @reply_delay.to_a.sample

      if ev.is_a? Twitter::DirectMessage
        log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
        @twitter.direct_message_create(ev[:sender][:screen_name], text, opts)
      elsif ev.is_a? Twitter::Tweet
        log "Replying to @#{ev[:user][:screen_name]} with: #{text}"
        @twitter.update(text, in_reply_to_status_id: ev[:id])
      else
        raise ArgumentError, "Don't know how to reply to a #{ev.class}"
      end
    end

    def scheduler
      @scheduler ||= Rufus::Scheduler.new
    end

    def follow(*args)
      log "Following #{args}"
      @twitter.follow(*args)
    end

    def tweet(*args)
      log "Tweeting #{args.inspect}"
      @twitter.update(*args)
    end

    def on_startup(&b);  @on_startup  = b; end
    def on_follow(&b);   @on_follow   = b; end
    def on_mention(&b);  @on_mention  = b; end
    def on_timeline(&b); @on_timeline = b; end
    def on_message(&b);  @on_message  = b; end
  end
end
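
A sketch of how the meta hash and the delay wrapper above combine inside a handler (the timing range and reply text are hypothetical; see skeleton/bots.rb below for the full template):

```ruby
bot.on_mention do |tweet, meta|
  # meta[:reply_prefix] => "@sender @other " (the bot's own name excluded)
  # meta[:limit]        => 140 minus the prefix length
  # meta[:mentionless]  => the tweet text with @mentions stripped
  bot.delay(2..6) do # wait 2-6 seconds so replies feel less mechanical
    bot.reply(tweet, meta[:reply_prefix] + "oh hullo")
  end
end
```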

lib/twitter_ebooks/markov.rb Normal file (+81)

@@ -0,0 +1,81 @@
module Ebooks
  # The special INTERIM token represents a sentence boundary.
  # This is so we can include the start and end of statements in the model.
  # Due to the way the sentence tokenizer works, it can correspond
  # to multiple actual parts of the text (such as ^, $, \n and .?!)
  INTERIM = :interim

  # This is an ngram-based Markov model optimized to build from a
  # tokenized sentence list without requiring too much transformation
  class MarkovModel
    def self.build(sentences)
      MarkovModel.new.consume(sentences)
    end

    def consume(sentences)
      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
      # We map by both bigrams and unigrams so we can fall back to the latter in
      # cases where an input bigram is unavailable, such as starting a sentence
      @sentences = sentences
      @unigrams = {}
      @bigrams = {}

      sentences.each_with_index do |tokens, i|
        last_token = INTERIM
        tokens.each_with_index do |token, j|
          @unigrams[last_token] ||= []
          @unigrams[last_token] << [i, j]

          @bigrams[last_token] ||= {}
          @bigrams[last_token][token] ||= []

          if j == tokens.length-1 # Mark sentence endings
            @unigrams[token] ||= []
            @unigrams[token] << INTERIM
            @bigrams[last_token][token] << INTERIM
          else
            @bigrams[last_token][token] << [i, j+1]
          end

          last_token = token
        end
      end

      self
    end

    def find_token(index)
      if index == INTERIM
        INTERIM
      else
        @sentences[index[0]][index[1]]
      end
    end

    def chain(tokens)
      if tokens.length == 1
        matches = @unigrams[tokens[0]]
      else
        matches = (@bigrams[tokens[-2]] || {})[tokens[-1]]
      end

      if matches.nil? || matches.empty?
        # This should never happen unless a strange token is
        # supplied from outside the dataset
        raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
      end

      next_token = find_token(matches.sample)

      if next_token == INTERIM # We chose to end the sentence
        return tokens
      else
        return chain(tokens + [next_token])
      end
    end

    def generate
      NLP.reconstruct(chain([INTERIM]))
    end
  end
end
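
A toy illustration of the structures consume builds and how generate walks them (the sentences here are hypothetical, pre-tokenized input):

```ruby
require 'twitter_ebooks'

sentences = [%w[the cat sat], %w[the dog sat]]
model = Ebooks::MarkovModel.build(sentences)
# @unigrams maps a token to the positions that may follow it, e.g.
#   :interim => [[0, 0], [1, 0]]  # either sentence may start a chain
#   "the"    => [[0, 1], [1, 1]]  # "cat" or "dog" may follow "the"
# @bigrams does the same keyed by token pairs, with INTERIM marking sentence ends
puts model.generate # => "the cat sat" or "the dog sat"
```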

lib/twitter_ebooks/model.rb Normal file (+120)

@@ -0,0 +1,120 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'json'
require 'set'
require 'digest/md5'

module Ebooks
  class Model
    attr_accessor :hash, :sentences, :markov, :keywords

    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end

    def self.load(path)
      Marshal.load(File.read(path))
    end

    def consume(txtpath)
      # Record hash of source file so we know to update later
      @hash = Digest::MD5.hexdigest(File.read(txtpath))

      text = File.read(txtpath)
      log "Removing commented lines and mention tokens"

      lines = text.split("\n")
      keeping = []
      lines.each do |l|
        next if l.start_with?('#') || l.include?('RT')
        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
        keeping << processed.join(' ')
      end
      text = NLP.normalize(keeping.join("\n"))

      log "Segmenting text into sentences"
      sentences = NLP.sentences(text)

      log "Tokenizing #{sentences.length} sentences"
      @sentences = sentences.map { |sent| NLP.tokenize(sent) }

      log "Ranking keywords"
      @keywords = NLP.keywords(@sentences)

      self
    end

    def save(path)
      File.open(path, 'w') do |f|
        f.write(Marshal.dump(self))
      end
      self
    end

    def fix(tweet)
      # This seems to require an external api call
      #begin
      #  fixer = NLP.gingerice.parse(tweet)
      #  log fixer if fixer['corrections']
      #  tweet = fixer['result']
      #rescue Exception => e
      #  log e.message
      #  log e.backtrace
      #end

      NLP.htmlentities.decode tweet
    end

    def markov_statement(limit=140, markov=nil)
      markov ||= MarkovModel.build(@sentences)
      tweet = ""

      while (tweet = markov.generate) do
        next if tweet.length > limit
        next if NLP.unmatched_enclosers?(tweet)
        break if tweet.length > limit*0.4 || rand > 0.8
      end

      fix tweet
    end

    # Finds all relevant tokenized sentences to given input by
    # comparing non-stopword token overlaps
    def relevant_sentences(input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input)

      @sentences.each do |sent|
        tokenized.each do |token|
          if sent.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller markov model from these
    def markov_response(input, limit=140)
      # First try to build a model from the directly relevant sentences
      relevant, slightly_relevant = relevant_sentences(input)

      if relevant.length >= 3
        markov = MarkovModel.new.consume(relevant)
        markov_statement(limit, markov)
      elsif slightly_relevant.length > 5
        markov = MarkovModel.new.consume(slightly_relevant)
        markov_statement(limit, markov)
      else
        markov_statement(limit)
      end
    end
  end
end
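
End to end, the Model API above can be driven directly from Ruby (the paths here are hypothetical):

```ruby
require 'twitter_ebooks'

model = Ebooks::Model.consume('corpus.tweets') # clean, segment and tokenize the corpus
model.save('model/corpus.model')

model = Ebooks::Model.load('model/corpus.model')
puts model.markov_statement(140)        # a random statement of at most 140 chars
puts model.markov_response('cats', 140) # biased toward sentences sharing tokens with the input
```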

lib/twitter_ebooks/nlp.rb Normal file (+154)

@@ -0,0 +1,154 @@
# encoding: utf-8
require 'fast-stemmer'
require 'highscore'

module Ebooks
  module NLP
    # We deliberately limit our punctuation handling to stuff we can do consistently
    # It'll just be a part of another token if we don't split it out, and that's fine
    PUNCTUATION = ".?!,"

    # Lazy-load NLP libraries and resources
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time
    def self.stopwords
      @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
    end

    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    # POS tagger
    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

    # Gingerice text correction service
    def self.gingerice
      require 'gingerice'
      Gingerice::Parser.new # No caching for this one
    end

    # For decoding html entities
    def self.htmlentities
      require 'htmlentities'
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions

    # We don't really want to deal with all this weird unicode punctuation
    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end

    # Split text into sentences
    # We use an ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end

    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
      sentence.split(regex)
    end

    def self.stem(word)
      Stemmer::stem_word(word.downcase)
    end

    def self.keywords(sentences)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
      text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')

      text = Highscore::Content.new(text)

      text.configure do
        #set :multiplier, 2
        #set :upper_case, 3
        #set :long_words, 2
        #set :long_words_threshold, 15
        #set :vowels, 1                    # => default: 0 = not considered
        #set :consonants, 5                # => default: 0 = not considered
        #set :ignore_case, true            # => default: false
        set :word_pattern, /(?<!@)(?<=\s)[\w']+/ # => default: /\w+/
        #set :stemming, true               # => default: false
      end

      text.keywords
    end

    # Takes a list of tokens and builds a nice-looking sentence
    def self.reconstruct(tokens)
      text = ""
      last_token = nil
      tokens.each do |token|
        next if token == INTERIM
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

    # Determine if we need to insert a space between two tokens
    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)
      if p1 && p2 # "foo?!"
        false
      elsif !p1 && p2 # "foo."
        false
      elsif p1 && !p2 # "foo. rah"
        true
      else # "foo rah"
        true
      end
    end

    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
    end

    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the markov generator; we can just tell it to retry
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0
        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)
          return true if opened < 0 # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end
      false
    end
  end
end
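
A quick round trip through the tokenizer and reconstructor above (the input string is hypothetical):

```ruby
require 'twitter_ebooks'

tokens = Ebooks::NLP.tokenize("i like cats. do you?")
# => ["i", "like", "cats", ".", "do", "you", "?"]
puts Ebooks::NLP.reconstruct(tokens)
# => "i like cats. do you?"
```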

lib/twitter_ebooks/version.rb Normal file (+3)

@@ -0,0 +1,3 @@
module Ebooks
  VERSION = "2.0.7"
end

script/process_anc_data.rb Executable file (+19)

@@ -0,0 +1,19 @@
#!/usr/bin/env ruby
# encoding: utf-8
require 'json'

freqmap = {}

data = File.read("data/ANC-all-count.txt")
# The source file is Latin-1; reinterpret each byte as a codepoint to get UTF-8
data = data.unpack("C*").pack("U*")

data.lines.each do |l|
  vals = l.split("\t")
  # First field is the word, last field is its total frequency count
  freqmap[vals[0]] = vals[-1].to_i
end

File.open("data/wordfreq.json", 'w') do |f|
  f.write(JSON.dump(freqmap))
end

skeleton/.gitignore vendored Normal file (+1)

@@ -0,0 +1 @@
corpus/*

skeleton/Procfile Normal file (+1)

@@ -0,0 +1 @@
worker: ruby run.rb start

skeleton/bots.rb Normal file (+41)

@@ -0,0 +1,41 @@
#!/usr/bin/env ruby
require 'twitter_ebooks'

# This is an example bot definition with event handlers commented out
# You can define as many of these as you like; they will run simultaneously

Ebooks::Bot.new("{{BOT_NAME}}") do |bot|
  # Consumer details come from registering an app at https://dev.twitter.com/
  # OAuth details can be fetched with https://github.com/marcel/twurl
  bot.consumer_key = "" # Your app consumer key
  bot.consumer_secret = "" # Your app consumer secret
  bot.oauth_token = "" # Token connecting the app to this account
  bot.oauth_token_secret = "" # Secret connecting the app to this account

  bot.on_message do |dm|
    # Reply to a DM
    # bot.reply(dm, "secret secrets")
  end

  bot.on_follow do |user|
    # Follow a user back
    # bot.follow(user[:screen_name])
  end

  bot.on_mention do |tweet, meta|
    # Reply to a mention
    # bot.reply(tweet, meta[:reply_prefix] + "oh hullo")
  end

  bot.on_timeline do |tweet, meta|
    # Reply to a tweet in the bot's timeline
    # bot.reply(tweet, meta[:reply_prefix] + "nice tweet")
  end

  bot.scheduler.every '24h' do
    # Tweet something every 24 hours
    # See https://github.com/jmettraux/rufus-scheduler
    # bot.tweet("hi")
  end
end

skeleton/run.rb Executable file (+9)

@@ -0,0 +1,9 @@
#!/usr/bin/env ruby
require_relative 'bots'

EM.run do
  Ebooks::Bot.all.each do |bot|
    bot.start
  end
end

test/corpus/0xabad1dea.tweets Normal file (+14696)

File diff suppressed because it is too large

test/keywords.rb Executable file (+18)

@@ -0,0 +1,18 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter_ebooks'
require 'minitest/autorun'
require 'benchmark'

module Ebooks
  class TestKeywords < Minitest::Test
    corpus = NLP.normalize(File.read(ARGV[0]))
    # NLP.keywords expects tokenized sentences, as produced by Model#consume
    sents = NLP.sentences(corpus).map { |sent| NLP.tokenize(sent) }

    puts "Finding and ranking keywords"
    puts Benchmark.measure {
      NLP.keywords(sents).top(50).each do |keyword|
        puts "#{keyword.text} #{keyword.weight}"
      end
    }
  end
end

test/tokenize.rb Executable file (+18)

@@ -0,0 +1,18 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter_ebooks'
require 'minitest/autorun'

module Ebooks
  class TestTokenize < Minitest::Test
    corpus = NLP.normalize(File.read(TEST_CORPUS_PATH))
    sents = NLP.sentences(corpus).sample(10)
    sents.each do |sent|
      p sent
      p NLP.tokenize(sent)
      puts
    end
  end
end

twitter_ebooks.gemspec Normal file (+28)

@@ -0,0 +1,28 @@
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/twitter_ebooks/version', __FILE__)

Gem::Specification.new do |gem|
  gem.authors       = ["Jaiden Mispy"]
  gem.email         = ["^_^@mispy.me"]
  gem.description   = %q{Markov chains for all your friends~}
  gem.summary       = %q{Markov chains for all your friends~}
  gem.homepage      = ""

  gem.files         = `git ls-files`.split($\)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "twitter_ebooks"
  gem.require_paths = ["lib"]
  gem.version       = Ebooks::VERSION

  gem.add_runtime_dependency 'minitest'
  gem.add_runtime_dependency 'twitter'
  gem.add_runtime_dependency 'tweetstream'
  gem.add_runtime_dependency 'rufus-scheduler'
  gem.add_runtime_dependency 'gingerice'
  gem.add_runtime_dependency 'htmlentities'
  gem.add_runtime_dependency 'engtagger'
  gem.add_runtime_dependency 'fast-stemmer'
  gem.add_runtime_dependency 'highscore'
end