Github time!

2013-11-08 06:02:05 +11:00 · 2013-11-08 06:02:05 +11:00 · e87dc5862b
commit e87dc5862b
27 changed files with 20178 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
 .*.swp
 pkg
--- a/4
+++ b/4
@ -0,0 +1,4 @@
 source 'https://rubygems.org'
 # Specify your gem's dependencies in twitter_ebooks.gemspec
 gemspec
--- a/Gemfile.lock
+++ b/Gemfile.lock
@ -0,0 +1,78 @@
 PATH
  remote: .
  specs:
    twitter_ebooks (2.0.3)
      bloomfilter-rb
      engtagger
      fast-stemmer
      gingerice
      highscore
      htmlentities
      minitest
      rufus-scheduler
      tweetstream
      twitter
 GEM
  remote: https://rubygems.org/
  specs:
    addressable (2.3.5)
    atomic (1.1.14)
    awesome_print (1.2.0)
    bloomfilter-rb (2.1.1)
      redis
    cookiejar (0.3.0)
    daemons (1.1.9)
    em-http-request (1.0.3)
      addressable (>= 2.2.3)
      cookiejar
      em-socksify
      eventmachine (>= 1.0.0.beta.4)
      http_parser.rb (>= 0.5.3)
    em-socksify (0.3.0)
      eventmachine (>= 1.0.0.beta.4)
    em-twitter (0.2.2)
      eventmachine (~> 1.0)
      http_parser.rb (~> 0.5)
      simple_oauth (~> 0.1)
    engtagger (0.1.2)
    eventmachine (1.0.3)
    faraday (0.8.8)
      multipart-post (~> 1.2.0)
    fast-stemmer (1.0.2)
    gingerice (1.2.1)
      addressable
      awesome_print
    highscore (1.1.0)
      whatlanguage (>= 1.0.0)
    htmlentities (4.3.1)
    http_parser.rb (0.5.3)
    minitest (5.0.8)
    multi_json (1.8.2)
    multipart-post (1.2.0)
    redis (3.0.5)
    rufus-scheduler (3.0.2)
      tzinfo
    simple_oauth (0.2.0)
    thread_safe (0.1.3)
      atomic
    tweetstream (2.5.0)
      daemons (~> 1.1)
      em-http-request (~> 1.0.2)
      em-twitter (~> 0.2)
      twitter (~> 4.5)
      yajl-ruby (~> 1.1)
    twitter (4.8.1)
      faraday (~> 0.8, < 0.10)
      multi_json (~> 1.0)
      simple_oauth (~> 0.2)
    tzinfo (1.1.0)
      thread_safe (~> 0.1)
    whatlanguage (1.0.5)
    yajl-ruby (1.1.0)
 PLATFORMS
  ruby
 DEPENDENCIES
  twitter_ebooks!
--- a/22
+++ b/22
@ -0,0 +1,22 @@
 Copyright (c) 2013 Jaiden Mispy
 MIT License
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:
 The above copyright notice and this permission notice shall be
 included in all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/NOTES.md
+++ b/NOTES.md
@ -0,0 +1,4 @@
 - Files in text/ are preprocessed by `rake consume` and serialized
 - e.g. text/foo.tweets becomes consumed/foo.corpus
 - `rake consume` looks at hashes to know which it needs to update
 - Preprocessed corpus files are loaded at runtime by Corpus.load('foo')
--- a/README.md
+++ b/README.md
@ -0,0 +1,9 @@
 # twitter\_ebooks 2.0.7
 Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.
 ## Installation
 ```bash
 gem install twitter_ebooks
 ```
--- a/2
+++ b/2
@ -0,0 +1,2 @@
 #!/usr/bin/env rake
 require "bundler/gem_tasks"
--- a/bin/ebooks
+++ b/bin/ebooks
@ -0,0 +1,100 @@
 #!/usr/bin/env ruby
 require 'twitter_ebooks'
 module Ebooks
  # Directory the command was started from; consumed models are written beneath it.
  APP_PATH = Dir.pwd # XXX do some recursive thing instead
  # Creates a new bot project by copying the gem's skeleton directory and
  # filling the bot name into its bots.rb template.
  #
  # @param target [String, nil] project/bot name; the project is created at ./<name>
  # @return [void] exits the process when the name is missing or the
  #   destination already exists
  def self.new(target)
    usage = "Usage: ebooks new <reponame>"
    if target.nil?
      log usage
      exit
    end
    # Keep the bare name for the template before turning it into a path.
    reponame = target
    target = "./#{reponame}"
    if File.exist?(target)
      log "#{target} already exists. Please remove if you want to recreate."
      exit
    end
    require 'fileutils' # make sure FileUtils is loaded before copying
    FileUtils.cp_r(SKELETON_PATH, target)
    # Overwrite the copied bots.rb with the placeholder replaced by the bot name.
    template = File.read(File.join(SKELETON_PATH, 'bots.rb'))
    File.open(File.join(target, 'bots.rb'), 'w') do |f|
      f.write(template.gsub("{{BOT_NAME}}", reponame))
    end
    log "New twitter_ebooks app created at #{target}"
  end
  # Builds a model from each corpus file and saves it as
  # APP_PATH/model/<name>.model, where <name> is the file name without its
  # last extension.
  #
  # @param pathes [Array<String>] corpus files to consume
  def self.consume(pathes)
    pathes.each do |path|
      filename = File.basename(path)
      # basename with '.*' also copes with files that have no extension
      shortname = File.basename(path, '.*')
      log "Consuming text corpus: #{filename}"
      outpath = File.join(APP_PATH, 'model', "#{shortname}.model")
      Model.consume(path).save(outpath)
      log "Corpus consumed"
    end
  end
  # Prints a generated statement, or a response to +input+ when one is given.
  #
  # @param model_path [String] saved model to load
  # @param input [String, nil] optional text to respond to
  def self.gen(model_path, input)
    model = Model.load(model_path)
    if input && !input.empty?
      puts "@cmd " + model.markov_response(input, 135)
    else
      puts model.markov_statement
    end
  end
  # Delegates to Model#score_interest for the given input.
  # NOTE(review): no score_interest method is visible in
  # lib/twitter_ebooks/model.rb -- confirm it exists before relying on this.
  def self.score(model_path, input)
    model = Model.load(model_path)
    model.score_interest(input)
  end
  # Fetches a user's tweets into the corpus file at +outpath+.
  def self.archive(username, outpath)
    Archiver.new(username, outpath).fetch_tweets
  end
  # Generates a statement from the model and tweets it as the named bot,
  # which must be defined in APP_PATH/bots.rb.
  #
  # @param modelpath [String] saved model to load
  # @param username [String] bot name, with or without a leading "@"
  def self.tweet(modelpath, username)
    load File.join(APP_PATH, 'bots.rb')
    model = Model.load(modelpath)
    statement = model.markov_statement
    # The usage text advertises "<@bot>", while bots are registered by bare name.
    username = username.to_s.sub(/\A@/, '')
    log "@#{username}: #{statement}"
    bot = Bot.get(username)
    if bot.nil?
      log "No bot named @#{username} is defined in bots.rb"
      exit 1
    end
    bot.configure
    bot.tweet(statement)
  end
  # Entry point: dispatches the command-line arguments to a subcommand.
  # Prints the usage text when the command is missing, unknown, or lacks its
  # first argument ("new" prints its own usage in that case).
  #
  # @param args [Array<String>] command-line arguments, typically ARGV
  def self.command(args)
    usage = """Usage: 
     ebooks new <reponame>
     ebooks consume <corpus_path> [...]
     ebooks gen <model_path> [input]
     ebooks score <model_path> <input>
     ebooks archive <@user> <outpath>
     ebooks tweet <model_path> <@bot>
 """
    if args.length == 0
      log usage
      exit
    end
    name, *rest = args
    if rest.empty? && name != "new"
      log usage
      exit
    end
    case name
    when "new" then new(rest[0])
    when "consume" then consume(rest)
    when "gen" then gen(rest[0], rest.drop(1).join(' '))
    when "score" then score(rest[0], rest.drop(1).join(' '))
    when "archive" then archive(rest[0], rest[1])
    when "tweet" then tweet(rest[0], rest[1])
    else
      # Unknown subcommand: say so instead of silently doing nothing.
      log usage
      exit 1
    end
  end
 end
 # Script entry point: dispatch on the arguments given on the command line.
 Ebooks.command(ARGV)
--- a/data/adjectives.txt
+++ b/data/adjectives.txt
--- a/data/nouns.txt
+++ b/data/nouns.txt
--- a/data/stopwords.txt
+++ b/data/stopwords.txt
@ -0,0 +1,843 @@
 a
 able
 about
 above
 abst
 accordance
 according
 accordingly
 across
 act
 actually
 added
 adj
 affected
 affecting
 affects
 after
 afterwards
 again
 against
 ah
 all
 almost
 alone
 along
 already
 also
 although
 always
 am
 among
 amongst
 an
 and
 announce
 another
 any
 anybody
 anyhow
 anymore
 anyone
 anything
 anyway
 anyways
 anywhere
 apparently
 approximately
 are
 aren
 arent
 arise
 around
 as
 aside
 ask
 asking
 at
 auth
 available
 away
 awfully
 b
 back
 be
 became
 because
 become
 becomes
 becoming
 been
 before
 beforehand
 begin
 beginning
 beginnings
 begins
 behind
 being
 believe
 below
 beside
 besides
 between
 beyond
 biol
 both
 brief
 briefly
 but
 by
 c
 ca
 came
 can
 cannot
 can't
 cause
 causes
 certain
 certainly
 co
 com
 come
 comes
 contain
 containing
 contains
 could
 couldnt
 d
 date
 did
 didn't
 different
 do
 does
 doesn't
 doing
 done
 don't
 down
 downwards
 due
 during
 e
 each
 ed
 edu
 effect
 eg
 eight
 eighty
 either
 else
 elsewhere
 end
 ending
 enough
 especially
 et
 et-al
 etc
 even
 ever
 every
 everybody
 everyone
 everything
 everywhere
 ex
 except
 f
 far
 few
 ff
 fifth
 first
 five
 fix
 followed
 following
 follows
 for
 former
 formerly
 forth
 found
 four
 from
 further
 furthermore
 g
 gave
 get
 gets
 getting
 give
 given
 gives
 giving
 go
 goes
 gone
 got
 gotten
 h
 had
 happens
 hardly
 has
 hasn't
 have
 haven't
 having
 he
 hed
 hence
 her
 here
 hereafter
 hereby
 herein
 heres
 hereupon
 hers
 herself
 hes
 hi
 hid
 him
 himself
 his
 hither
 home
 how
 howbeit
 however
 hundred
 i
 id
 ie
 if
 i'll
 im
 immediate
 immediately
 importance
 important
 in
 inc
 indeed
 index
 information
 instead
 into
 invention
 inward
 is
 isn't
 it
 itd
 it'll
 its
 itself
 i've
 j
 just
 k
 keep
 keeps
 kept
 kg
 km
 know
 known
 knows
 l
 largely
 last
 lately
 later
 latter
 latterly
 least
 less
 lest
 let
 lets
 like
 liked
 likely
 line
 little
 'll
 look
 looking
 looks
 ltd
 m
 made
 mainly
 make
 makes
 many
 may
 maybe
 me
 mean
 means
 meantime
 meanwhile
 merely
 mg
 might
 million
 miss
 ml
 more
 moreover
 most
 mostly
 mr
 mrs
 much
 mug
 must
 my
 myself
 n
 na
 name
 namely
 nay
 nd
 near
 nearly
 necessarily
 necessary
 need
 needs
 neither
 never
 nevertheless
 new
 next
 nine
 ninety
 no
 nobody
 non
 none
 nonetheless
 noone
 nor
 normally
 nos
 not
 noted
 nothing
 now
 nowhere
 o
 obtain
 obtained
 obviously
 of
 off
 often
 oh
 ok
 okay
 old
 omitted
 on
 once
 one
 ones
 only
 onto
 or
 ord
 other
 others
 otherwise
 ought
 our
 ours
 ourselves
 out
 outside
 over
 overall
 owing
 own
 p
 page
 pages
 part
 particular
 particularly
 past
 per
 perhaps
 placed
 please
 plus
 poorly
 possible
 possibly
 potentially
 pp
 predominantly
 present
 previously
 primarily
 probably
 promptly
 proud
 provides
 put
 q
 que
 quickly
 quite
 qv
 r
 ran
 rather
 rd
 re
 readily
 really
 recent
 recently
 ref
 refs
 regarding
 regardless
 regards
 related
 relatively
 research
 respectively
 resulted
 resulting
 results
 right
 run
 s
 said
 same
 saw
 say
 saying
 says
 sec
 section
 see
 seeing
 seem
 seemed
 seeming
 seems
 seen
 self
 selves
 sent
 seven
 several
 shall
 she
 shed
 she'll
 shes
 should
 shouldn't
 show
 showed
 shown
 showns
 shows
 significant
 significantly
 similar
 similarly
 since
 six
 slightly
 so
 some
 somebody
 somehow
 someone
 somethan
 something
 sometime
 sometimes
 somewhat
 somewhere
 soon
 sorry
 specifically
 specified
 specify
 specifying
 still
 stop
 strongly
 sub
 substantially
 successfully
 such
 sufficiently
 suggest
 sup
 sure
 t
 take
 taken
 taking
 tell
 tends
 th
 than
 thank
 thanks
 thanx
 that
 that'll
 thats
 that've
 the
 their
 theirs
 them
 themselves
 then
 thence
 there
 thereafter
 thereby
 thered
 therefore
 therein
 there'll
 thereof
 therere
 theres
 thereto
 thereupon
 there've
 these
 they
 theyd
 they'll
 theyre
 they've
 think
 this
 those
 thou
 though
 thoughh
 thousand
 throug
 through
 throughout
 thru
 thus
 til
 tip
 to
 together
 too
 took
 toward
 towards
 tried
 tries
 truly
 try
 trying
 ts
 twice
 two
 u
 un
 under
 unfortunately
 unless
 unlike
 unlikely
 until
 unto
 up
 upon
 ups
 us
 use
 used
 useful
 usefully
 usefulness
 uses
 using
 usually
 v
 value
 various
 've
 very
 via
 viz
 vol
 vols
 vs
 w
 want
 wants
 was
 wasn't
 way
 we
 wed
 welcome
 we'll
 went
 were
 weren't
 we've
 what
 whatever
 what'll
 whats
 when
 whence
 whenever
 where
 whereafter
 whereas
 whereby
 wherein
 wheres
 whereupon
 wherever
 whether
 which
 while
 whim
 whither
 who
 whod
 whoever
 whole
 who'll
 whom
 whomever
 whos
 whose
 why
 widely
 willing
 wish
 with
 within
 without
 won't
 words
 world
 would
 wouldn't
 www
 x
 y
 yes
 yet
 you
 youd
 you'll
 your
 youre
 yours
 yourself
 yourselves
 you've
 z
 zero
 .
 ?
 !
 http
 don
 people
 well
 will
 https
 time
 good
 thing
 twitter
 pretty
 it's
 i'm
 that's
 you're
 they're
 there's
 things
 yeah
 find
 going
 work
 point
 years
 guess
 bad
 problem
 real
 kind
 day
 better
 lot
 stuff
 i'd
 read
 thought
 idea
 case
 word
 hey
 person
 long
 Dear
 internet
 tweet
 he's
 feel
 wrong
 call
 hard
 phone
 ago
 literally
 remember
 reason
 called
 course
 bit
 question
 high
 today
 told
 man
 actual
 year
 three
 book
 assume
 life
 true
 best
 wow
 video
 times
 works
 fact
 completely
 totally
 imo
 open
 lol
 haha
 cool
 yep
 ooh
 great
 ugh
 tonight
 talk
 sounds
 hahaha
 whoa
 cool
 we're
 guys
 sweet
 fortunately
 hmm
 aren't
 sadly
 talking
 you'd
 place
 yup
 what's
 y'know
 basically
 god
 shit
 holy
 interesting
 news
 guy
 wait
 oooh
 gonna
 current
 let's
 tomorrow
 omg
 hate
 hope
 fuck
 oops
 night
 wear
 wanna
 fun
 finally
 whoops
 nevermind
 definitely
 context
 screen
 free
 exactly
 big
 house
 half
 working
 play
 heard
 hmmm
 damn
 woah
 tho
 set
 idk
 sort
 understand
 kinda
 seriously
 btw
 she's
 hah
 aww
 ffs
 it'd
 that'd
 hopefully
 non
 entirely
 lots
 entire
 tend
 hullo
 clearly
 surely
 weird
 start
 help
 nope
--- a/lib/twitter_ebooks.rb
+++ b/lib/twitter_ebooks.rb
@ -0,0 +1,20 @@
 gem 'minitest'
 # Writes one diagnostic line to standard error and flushes it immediately.
 # The arguments are converted with to_s and separated by single spaces.
 def log(*args)
  line = args.map { |arg| arg.to_s }.join(' ')
  STDERR.puts line
  STDERR.flush
 end
 module Ebooks
  # Root of the gem: the directory one level above this file's directory.
  GEM_PATH = File.expand_path('..', File.dirname(__FILE__))
  # Word lists read by the NLP helpers (stopwords, nouns, adjectives).
  DATA_PATH = File.join(GEM_PATH, 'data')
  # Template project that `ebooks new` copies.
  SKELETON_PATH = File.join(GEM_PATH, 'skeleton')
  # Test scripts and the sample corpus they read.
  TEST_PATH = File.join(GEM_PATH, 'test')
  TEST_CORPUS_PATH = File.join(TEST_PATH, 'corpus/0xabad1dea.tweets')
 end
 require 'twitter_ebooks/nlp'
 require 'twitter_ebooks/archiver'
 require 'twitter_ebooks/markov'
 require 'twitter_ebooks/model'
 require 'twitter_ebooks/bot'
--- a/lib/twitter_ebooks/archiver.rb
+++ b/lib/twitter_ebooks/archiver.rb
@ -0,0 +1,82 @@
 #!/usr/bin/env ruby
 # encoding: utf-8
 require 'twitter'
 module Ebooks
  # Downloads a user's tweets into a plain-text corpus file: one tweet per
  # line, preceded by a "# <id>" header holding the newest tweet id fetched.
  class Archiver
    # @param username [String] account whose timeline is archived
    # @param outpath [String] corpus file to create or extend
    def initialize(username, outpath)
      @username = username
      @outpath = outpath
      @client = Twitter::Client.new
    end
    # Read existing corpus into memory.
    #
    # @return [Array] a pair [lines, since_id]: every line of the file
    #   (header included) and the id taken from a leading "# <id>" header,
    #   or nil when the file is missing, empty or has no header
    def read_corpus
      lines = []
      since_id = nil
      if File.exist?(@outpath)
        lines = File.read(@outpath).split("\n")
        # An empty file has no first line to inspect.
        header = lines.first
        since_id = header.split('# ').last if header && header.start_with?('#')
      end
      [lines, since_id]
    end
    # Retrieve all available tweets for a given user since the last tweet id,
    # newest first, paging backwards through the timeline until a request
    # returns nothing.
    #
    # @param since_id [String, Integer, nil] only fetch tweets newer than this id
    # @return [Array] the tweets returned by the client
    def tweets_since(since_id)
      tweets = []
      max_id = nil
      opts = {
        count: 200,
        include_rts: false,
        trim_user: true
      }
      opts[:since_id] = since_id unless since_id.nil?
      loop do
        opts[:max_id] = max_id unless max_id.nil?
        page = @client.user_timeline(@username, opts)
        break if page.empty?
        puts "Received #{page.length} tweets"
        tweets += page
        # max_id is inclusive, so continue just below the oldest tweet received;
        # otherwise that tweet would be returned again on the next page.
        max_id = page.last.id - 1
      end
      tweets
    end
    # Fetches new tweets and prepends them to the corpus file, replacing the
    # header with the id of the newest tweet. Leaves the file untouched when
    # there is nothing new.
    def fetch_tweets
      lines, since_id = read_corpus
      if since_id.nil?
        puts "Retrieving tweets from @#{@username}"
      else
        puts "Retrieving tweets from @#{@username} since #{since_id}"
      end
      tweets = tweets_since(since_id)
      if tweets.empty?
        puts "No new tweets"
        return
      end
      # Keep the one-tweet-per-line format by flattening embedded newlines.
      new_lines = tweets.map { |tweet| tweet.text.gsub("\n", " ") }
      new_since_id = tweets[0].id.to_s
      # Drop the previous header so the file keeps exactly one "# <id>" line.
      old_lines = lines.first.to_s.start_with?('#') ? lines.drop(1) : lines
      lines = ["# " + new_since_id] + new_lines + old_lines
      File.open(@outpath, 'w') { |corpus| corpus.write(lines.join("\n")) }
    end
  end
 end
--- a/lib/twitter_ebooks/bot.rb
+++ b/lib/twitter_ebooks/bot.rb
@ -0,0 +1,164 @@
 #!/usr/bin/env ruby
 require 'twitter'
 require 'tweetstream'
 require 'rufus/scheduler'
 module Ebooks
  # One Twitter bot account: holds its credentials, registers itself in a
  # global list, and wires streaming events to user-supplied handlers.
  class Bot
    attr_accessor :consumer_key, :consumer_secret,
                  :oauth_token, :oauth_token_secret
    attr_accessor :username
    attr_reader :twitter, :stream
    @@all = [] # List of all defined bots
    def self.all; @@all; end
    # @param name [String] exact username to look up
    # @return [Bot, nil] the registered bot with that username
    def self.get(name)
      all.find { |bot| bot.username == name }
    end
    # @param username [String] the bot's account name
    # @yield [bot] optional configuration block, called with the new bot
    def initialize(username, &b)
      # Set defaults
      @username = username
      # Override with callback
      b.call(self) if b
      Bot.all.push(self)
    end
    # Logs to standard error, prefixed with this bot's username.
    def log(*args)
      STDERR.puts "@#{@username}: " + args.map(&:to_s).join(' ')
      STDERR.flush
    end
    # Applies this bot's credentials to the TweetStream and Twitter global
    # configuration, then creates the REST and streaming clients.
    def configure
      TweetStream.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end
      Twitter.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end
      @twitter = Twitter::Client.new
      @stream = TweetStream::Client.new
    end
    # Connects to tweetstream and opens event handlers for this bot.
    # Handlers that were never registered are simply skipped.
    def start
      configure
      @on_startup.call if @on_startup
      @stream.on_error do |msg|
        log "ERROR: #{msg}"
      end
      @stream.on_inited do
        log "Online!"
      end
      @stream.on_event(:follow) do |event|
        next if event[:source][:screen_name] == @username
        log "Followed by #{event[:source][:screen_name]}"
        @on_follow.call(event[:source]) if @on_follow
      end
      @stream.on_direct_message do |dm|
        next if dm[:sender][:screen_name] == @username # Don't reply to self
        log "DM from @#{dm[:sender][:screen_name]}: #{dm[:text]}"
        @on_message.call(dm) if @on_message
      end
      @stream.userstream do |ev|
        next unless ev[:text] # If it's not a text-containing tweet, ignore it
        next if ev[:user][:screen_name] == @username # Ignore our own tweets
        meta = {}
        mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }
        # Reply to the author first, then everyone else mentioned except us.
        reply_mentions = mentions.reject { |m| m.downcase == @username.downcase }
        reply_mentions = [ev[:user][:screen_name]] + reply_mentions
        meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
        meta[:limit] = 140 - meta[:reply_prefix].length
        mless = ev[:text]
        begin
          # Cut mentions out back to front so earlier indices stay valid.
          ev.attrs[:entities][:user_mentions].reverse.each do |entity|
            # ..-1 keeps everything after the mention, including the last character.
            mless = mless[0...entity[:indices][0]] + mless[entity[:indices][1]..-1]
          end
        rescue StandardError
          # Dump the offending data for debugging, then let the error propagate.
          p ev.attrs[:entities][:user_mentions]
          p ev[:text]
          raise
        end
        meta[:mentionless] = mless
        # To check if this is a mention, ensure:
        # - The tweet mentions list contains our username
        # - The tweet is not being retweeted by somebody else
        # - Or soft-retweeted by somebody else
        if mentions.map(&:downcase).include?(@username.downcase) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
          log "Mention from @#{ev[:user][:screen_name]}: #{ev[:text]}"
          @on_mention.call(ev, meta) if @on_mention
        else
          @on_timeline.call(ev, meta) if @on_timeline
        end
      end
    end
    # Wrapper for EM.add_timer
    # Delays add a greater sense of humanity to bot behaviour
    #
    # @param time [Numeric, Range] seconds to wait, or a range from which a
    #   value is picked at random
    def delay(time, &b)
      time = time.to_a.sample unless time.is_a? Numeric
      EM.add_timer(time, &b)
    end
    # Reply to a tweet or a DM.
    # The reply is sent immediately; wrap the call in #delay to postpone it.
    #
    # @param ev [Twitter::DirectMessage, Twitter::Tweet] message being answered
    # @param text [String] reply text
    # @param opts [Hash] extra options passed on to the Twitter client
    # @raise [ArgumentError] when +ev+ is neither a tweet nor a direct message
    def reply(ev, text, opts={})
      opts = opts.clone
      if ev.is_a? Twitter::DirectMessage
        log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
        @twitter.direct_message_create(ev[:sender][:screen_name], text, opts)
      elsif ev.is_a? Twitter::Tweet
        log "Replying to @#{ev[:user][:screen_name]} with: #{text}"
        @twitter.update(text, opts.merge(in_reply_to_status_id: ev[:id]))
      else
        raise ArgumentError, "Don't know how to reply to a #{ev.class}"
      end
    end
    # Lazily created scheduler for periodic jobs.
    def scheduler
      @scheduler ||= Rufus::Scheduler.new
    end
    def follow(*args)
      log "Following #{args}"
      @twitter.follow(*args)
    end
    def tweet(*args)
      log "Tweeting #{args.inspect}"
      @twitter.update(*args)
    end
    # Handler registration: each stores the given block for #start to call.
    def on_startup(&b); @on_startup = b; end
    def on_follow(&b); @on_follow = b; end
    def on_mention(&b); @on_mention = b; end
    def on_timeline(&b); @on_timeline = b; end
    def on_message(&b); @on_message = b; end
  end
 end
--- a/lib/twitter_ebooks/markov.rb
+++ b/lib/twitter_ebooks/markov.rb
@ -0,0 +1,81 @@
 module Ebooks
  # Special INTERIM token represents sentence boundaries
  # This is so we can include start and end of statements in model
  # Due to the way the sentence tokenizer works, can correspond
  # to multiple actual parts of text (such as ^, $, \n and .?!)
  INTERIM = :interim
  # This is an ngram-based Markov model optimized to build from a
  # tokenized sentence list without requiring too much transformation
  class MarkovModel
    # @param sentences [Array<Array<String>>] tokenized sentences
    # @return [MarkovModel] a model built from those sentences
    def self.build(sentences)
      MarkovModel.new.consume(sentences)
    end
    # (Re)builds the lookup tables from the given sentences.
    #
    # @param sentences [Array<Array<String>>] tokenized sentences
    # @return [MarkovModel] self
    def consume(sentences)
      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
      # We map by both bigrams and unigrams so we can fall back to the latter in
      # cases where an input bigram is unavailable, such as starting a sentence
      @sentences = sentences
      @unigrams = {}
      @bigrams = {}
      sentences.each_with_index do |tokens, i|
        last_token = INTERIM
        tokens.each_with_index do |token, j|
          (@unigrams[last_token] ||= []) << [i, j]
          followers = ((@bigrams[last_token] ||= {})[token] ||= [])
          if j == tokens.length - 1 # Mark sentence endings
            (@unigrams[token] ||= []) << INTERIM
            followers << INTERIM
          else
            followers << [i, j + 1]
          end
          last_token = token
        end
      end
      self
    end
    # Resolves a stored position back to its token.
    #
    # @param index [Array(Integer, Integer), Symbol] a [sentence, token]
    #   position, or INTERIM
    def find_token(index)
      if index == INTERIM
        INTERIM
      else
        @sentences[index[0]][index[1]]
      end
    end
    # Extends a token sequence with randomly chosen followers until a
    # sentence ending is picked. A single seed token is continued from the
    # unigram table, longer seeds from the bigram of their last two tokens.
    # Runs iteratively so long sentences cannot exhaust the call stack.
    #
    # @param tokens [Array] seed tokens; not modified
    # @return [Array] the seed followed by the generated tokens
    # @raise [ArgumentError] when the seed cannot be continued, e.g. because
    #   it contains a token the model has never seen
    def chain(tokens)
      tokens = tokens.dup
      loop do
        matches =
          if tokens.length == 1
            @unigrams[tokens[0]]
          else
            (@bigrams[tokens[-2]] || {})[tokens[-1]]
          end
        if matches.nil? || matches.empty?
          # This should never happen unless a strange token is
          # supplied from outside the dataset
          raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
        end
        next_token = find_token(matches.sample)
        return tokens if next_token == INTERIM # We chose to end the sentence
        tokens << next_token
      end
    end
    # @return [String] a generated sentence, assembled by NLP.reconstruct
    def generate
      NLP.reconstruct(chain([INTERIM]))
    end
  end
 end
--- a/lib/twitter_ebooks/model.rb
+++ b/lib/twitter_ebooks/model.rb
@ -0,0 +1,120 @@
 #!/usr/bin/env ruby
 # encoding: utf-8
 require 'json'
 require 'set'
 require 'digest/md5'
 module Ebooks
  # A corpus reduced to tokenized sentences and ranked keywords, which can be
  # saved to disk and used to generate statements and responses.
  class Model
    # NOTE(review): the :hash accessor replaces Object#hash on instances, so a
    # Model should not be used as a Hash key or Set member.
    attr_accessor :hash, :sentences, :markov, :keywords
    # @param txtpath [String] corpus file to build a model from
    # @return [Model]
    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end
    # Loads a model written by #save.
    # SECURITY: Marshal.load can instantiate arbitrary objects; only load
    # model files you created yourself.
    #
    # @param path [String] model file
    def self.load(path)
      # Marshal data is binary, so read it without newline/encoding translation.
      Marshal.load(File.binread(path))
    end
    # Builds the model from a corpus file with one tweet per line.
    # Lines starting with '#' or containing 'RT' are skipped, and words
    # containing '@' or 'http' are removed from the remaining lines.
    #
    # @param txtpath [String] corpus file
    # @return [Model] self
    def consume(txtpath)
      text = File.read(txtpath)
      # Record hash of source file so we know to update later
      @hash = Digest::MD5.hexdigest(text)
      log "Removing commented lines and mention tokens"
      kept = text.split("\n").reject { |l| l.start_with?('#') || l.include?('RT') }
      keeping = kept.map do |l|
        l.split.reject { |w| w.include?('@') || w.include?('http') }.join(' ')
      end
      text = NLP.normalize(keeping.join("\n"))
      log "Segmenting text into sentences"
      sentences = NLP.sentences(text)
      log "Tokenizing #{sentences.length} sentences"
      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
      log "Ranking keywords"
      @keywords = NLP.keywords(@sentences)
      self
    end
    # Serializes the model to +path+ with Marshal.
    #
    # @return [Model] self
    def save(path)
      File.open(path, 'wb') do |f|
        f.write(Marshal.dump(self))
      end
      self
    end
    # Final clean-up of a generated tweet: decodes HTML entities.
    def fix(tweet)
      # This seems to require an external api call
      #begin
      #  fixer = NLP.gingerice.parse(tweet)
      #  log fixer if fixer['corrections']
      #  tweet = fixer['result']
      #rescue Exception => e
      #  log e.message
      #  log e.backtrace
      #end
      NLP.htmlentities.decode tweet
    end
    # Generates a statement of at most +limit+ characters.
    # Candidates that are too long or have unmatched brackets/quotes are
    # discarded; a candidate is accepted when it is longer than 40% of the
    # limit, or otherwise with a 20% chance.
    # NOTE(review): the loop retries without bound, so it never ends if the
    # generator cannot produce a candidate that fits.
    #
    # @param limit [Integer] maximum length of the statement
    # @param markov [MarkovModel, nil] generator to use; defaults to one
    #   built from all sentences
    def markov_statement(limit=140, markov=nil)
      markov ||= MarkovModel.build(@sentences)
      while (tweet = markov.generate) do
        next if tweet.length > limit
        next if NLP.unmatched_enclosers?(tweet)
        break if tweet.length > limit*0.4 || rand > 0.8
      end
      fix tweet
    end
    # Finds all relevant tokenized sentences to given input by
    # comparing non-stopword token overlaps
    #
    # @return [Array] a pair [relevant, slightly_relevant]; a sentence is
    #   listed once per matching input token, stopword matches counting only
    #   towards slightly_relevant
    def relevant_sentences(input)
      relevant = []
      slightly_relevant = []
      tokenized = NLP.tokenize(input)
      @sentences.each do |sent|
        tokenized.each do |token|
          next unless sent.include?(token)
          relevant << sent unless NLP.stopword?(token)
          slightly_relevant << sent
        end
      end
      [relevant, slightly_relevant]
    end
    # Generates a response by looking for related sentences
    # in the corpus and building a smaller markov model from these
    def markov_response(input, limit=140)
      relevant, slightly_relevant = relevant_sentences(input)
      if relevant.length >= 3
        # Enough closely related material to generate from it alone
        markov_statement(limit, MarkovModel.new.consume(relevant))
      elsif slightly_relevant.length > 5
        markov_statement(limit, MarkovModel.new.consume(slightly_relevant))
      else
        # Nothing related enough: fall back to the whole corpus
        markov_statement(limit)
      end
    end
  end
 end
--- a/lib/twitter_ebooks/nlp.rb
+++ b/lib/twitter_ebooks/nlp.rb
@ -0,0 +1,154 @@
 # encoding: utf-8
 require 'fast-stemmer'
 require 'highscore'
 module Ebooks
  # Text-processing helpers shared by the model and the generator.
  module NLP
    # Only punctuation we can treat consistently is split into its own
    # token; anything else simply stays attached to a neighbouring word.
    PUNCTUATION = ".?!,"
    # Word lists and heavyweight libraries below are loaded on first use,
    # since not every run needs all of them.
    # Reads a whitespace-separated word list from the data directory.
    def self.read_wordlist(filename)
      File.read(File.join(DATA_PATH, filename)).split
    end
    private_class_method :read_wordlist
    def self.stopwords
      @stopwords ||= read_wordlist('stopwords.txt')
    end
    def self.nouns
      @nouns ||= read_wordlist('nouns.txt')
    end
    def self.adjectives
      @adjectives ||= read_wordlist('adjectives.txt')
    end
    # Part-of-speech tagger (one shared instance)
    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end
    # Gingerice text correction service; a fresh parser on every call
    def self.gingerice
      require 'gingerice'
      Gingerice::Parser.new
    end
    # HTML entity decoder (one shared instance)
    def self.htmlentities
      require 'htmlentities'
      @htmlentities ||= HTMLEntities.new
    end
    ### Utility functions
    # Replaces typographic quotes and ellipses with plain equivalents, then
    # decodes HTML entities.
    def self.normalize(text)
      substitutions = [['“', '"'], ['”', '"'], ['’', "'"], ['…', '...']]
      plain = substitutions.inject(text) do |acc, (fancy, simple)|
        acc.gsub(fancy, simple)
      end
      htmlentities.decode plain
    end
    # Splits text into sentences at newlines and at whitespace that follows
    # '.', '?' or '!'. Deliberately simple: tweet formatting confuses
    # full-blown sentence segmenters, and quoting is patched up at
    # generation time instead.
    def self.sentences(text)
      boundary = /\n+|(?<=[.?!])\s+/
      text.split(boundary)
    end
    # Splits a sentence into tokens at whitespace and at the border between
    # letters and PUNCTUATION characters. Also ad hoc, because tokenizer
    # libraries mishandle things like emoticons and timestamps.
    def self.tokenize(sentence)
      sentence.split(/\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/)
    end
    # Stem of the lowercased word
    def self.stem(word)
      Stemmer.stem_word(word.downcase)
    end
    # Ranks keywords over the given token lists with Highscore.
    def self.keywords(sentences)
      # Stopwords are filtered here because highscore's own blacklist is very slow
      meaningful = sentences.flatten.reject { |tok| stopword?(tok) }
      content = Highscore::Content.new(meaningful.join(' '))
      content.configure do
        # Every other Highscore option is left at its default.
        set :word_pattern, /(?<!@)(?<=\s)[\w']+/           # => default: /\w+/
      end
      content.keywords
    end
    # Joins tokens into readable text, skipping INTERIM markers and putting
    # spaces only where space_between? asks for them.
    def self.reconstruct(tokens)
      previous = nil
      tokens.reject { |tok| tok == INTERIM }.inject("") do |sentence, tok|
        gap = previous && space_between?(previous, tok) ? ' ' : ''
        previous = tok
        sentence + gap + tok
      end
    end
    # Determine if we need to insert a space between two tokens:
    # never directly before punctuation ("foo." / "foo?!"), otherwise always
    # ("foo. rah" / "foo rah").
    def self.space_between?(token1, token2)
      case [punctuation?(token1), punctuation?(token2)]
      when [true, true], [false, true] then false
      else true
      end
    end
    # True when the token consists solely of PUNCTUATION characters
    # (which includes the empty token).
    def self.punctuation?(token)
      token.chars.all? { |char| PUNCTUATION.include?(char) }
    end
    # Case-insensitive stopword test against a set built on first use.
    def self.stopword?(token)
      @stopword_set = stopwords.map(&:downcase).to_set unless @stopword_set
      @stopword_set.include?(token.downcase)
    end
    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the markov generator; we can just tell it to retry
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      tokens = tokenize(text)
      enclosers.any? do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')
        balance = 0
        # Stop as soon as a closer appears before its opener.
        underflow = tokens.any? do |tok|
          balance += 1 if tok.match(starter)
          balance -= 1 if tok.match(ender)
          balance < 0
        end
        underflow || balance != 0
      end
    end
  end
 end
--- a/lib/twitter_ebooks/version.rb
+++ b/lib/twitter_ebooks/version.rb
@ -0,0 +1,3 @@
 module Ebooks
  # Release number of the gem; the gemspec reads it for its version field.
  VERSION = '2.0.7'
 end
--- a/script/process_anc_data.rb
+++ b/script/process_anc_data.rb
@ -0,0 +1,19 @@
 #!/usr/bin/env ruby
 # encoding: utf-8
 require 'json'
 # Converts the tab-separated ANC word-count file into a JSON map from the
 # first column of each line to the integer value of its last column.
 raw = File.read("data/ANC-all-count.txt")
 # Re-encode byte by byte: every input byte becomes the code point of the same number.
 decoded = raw.unpack("C*").pack("U*")
 freqmap = decoded.lines.each_with_object({}) do |line, counts|
  fields = line.split("\t")
  counts[fields.first] = fields.last.to_i
 end
 File.write("data/wordfreq.json", JSON.dump(freqmap))
--- a/skeleton/.gitignore
+++ b/skeleton/.gitignore
@ -0,0 +1 @@
 corpus/*
--- a/skeleton/Procfile
+++ b/skeleton/Procfile
@ -0,0 +1 @@
 worker: ruby run.rb start
--- a/skeleton/bots.rb
+++ b/skeleton/bots.rb
@ -0,0 +1,41 @@
 #!/usr/bin/env ruby
 require 'twitter_ebooks'
 # This is an example bot definition with event handlers commented out
 # You can define as many of these as you like; they will run simultaneously
 # "{{BOT_NAME}}" is a placeholder that `ebooks new` replaces with the project name.
 # Every handler body below is commented out; uncomment the calls you want.
 Ebooks::Bot.new("{{BOT_NAME}}") do |bot|
  # Consumer details come from registering an app at https://dev.twitter.com/
  # OAuth details can be fetched with https://github.com/marcel/twurl
  bot.consumer_key = "" # Your app consumer key
  bot.consumer_secret = "" # Your app consumer secret
  bot.oauth_token = "" # Token connecting the app to this account
  bot.oauth_token_secret = "" # Secret connecting the app to this account
  # Called for each direct message received from another account
  bot.on_message do |dm|
    # Reply to a DM
    # bot.reply(dm, "secret secrets")
  end
  # Called when another account follows the bot
  bot.on_follow do |user|
    # Follow a user back
    # bot.follow(user[:screen_name])
  end
  # Called for tweets that mention the bot; meta[:reply_prefix] holds the
  # "@name " prefix addressing the author and the other mentioned users
  bot.on_mention do |tweet, meta|
    # Reply to a mention
    # bot.reply(tweet, meta[:reply_prefix] + "oh hullo")
  end
  # Called for every other tweet that arrives on the bot's user stream
  bot.on_timeline do |tweet, meta|
    # Reply to a tweet in the bot's timeline
    # bot.reply(tweet, meta[:reply_prefix] + "nice tweet")
  end
  bot.scheduler.every '24h' do
    # Tweet something every 24 hours
    # See https://github.com/jmettraux/rufus-scheduler
    # bot.tweet("hi")
  end
 end
--- a/skeleton/run.rb
+++ b/skeleton/run.rb
@ -0,0 +1,9 @@
 #!/usr/bin/env ruby
 require_relative 'bots'
 # Start every bot defined in bots.rb inside one EventMachine reactor.
 EM.run { Ebooks::Bot.all.each(&:start) }
--- a/test/corpus/0xabad1dea.tweets
+++ b/test/corpus/0xabad1dea.tweets
--- a/test/keywords.rb
+++ b/test/keywords.rb
@ -0,0 +1,18 @@
 #!/usr/bin/env ruby
 # encoding: utf-8
 require 'twitter_ebooks'
 require 'minitest/autorun'
 require 'benchmark'
 module Ebooks
  # Ranks the keywords of a corpus, prints the top 50 with their weights and
  # reports how long the ranking took.
  class TestKeywords < Minitest::Test
    # Corpus to analyse: the first command-line argument naming an existing
    # file, otherwise the bundled test corpus.
    def corpus_path
      ARGV.find { |arg| File.file?(arg) } || TEST_CORPUS_PATH
    end
    def test_keyword_ranking
      corpus = NLP.normalize(File.read(corpus_path))
      # NLP.keywords flattens a list of token lists, so it needs tokenized
      # sentences rather than the raw corpus string.
      sentences = NLP.sentences(corpus).map { |sent| NLP.tokenize(sent) }
      puts "Finding and ranking keywords"
      top = nil
      puts Benchmark.measure {
        top = NLP.keywords(sentences).top(50)
        top.each do |keyword|
          puts "#{keyword.text} #{keyword.weight}"
        end
      }
      refute_nil top
    end
  end
 end
--- a/test/tokenize.rb
+++ b/test/tokenize.rb
@ -0,0 +1,18 @@
 #!/usr/bin/env ruby
 # encoding: utf-8
 require 'twitter_ebooks'
 require 'minitest/autorun'
 module Ebooks
  # Prints the tokenization of ten random sentences from the test corpus and
  # checks that no token contains whitespace.
  class TestTokenize < Minitest::Test
    def test_tokenize_samples
      corpus = NLP.normalize(File.read(TEST_CORPUS_PATH))
      NLP.sentences(corpus).sample(10).each do |sent|
        tokens = NLP.tokenize(sent)
        p sent
        p tokens
        puts
        # The tokenizer splits on every run of whitespace.
        assert tokens.none? { |token| token =~ /\s/ }, "whitespace left in #{tokens.inspect}"
      end
    end
  end
 end
--- a/twitter_ebooks.gemspec
+++ b/twitter_ebooks.gemspec
@ -0,0 +1,28 @@
 # -*- encoding: utf-8 -*-
 require File.expand_path('../lib/twitter_ebooks/version', __FILE__)
 # Gem packaging metadata for twitter_ebooks.
 Gem::Specification.new do |spec|
  # Identity
  spec.name          = "twitter_ebooks"
  spec.version       = Ebooks::VERSION
  spec.authors       = ["Jaiden Mispy"]
  spec.email         = ["^_^@mispy.me"]
  spec.summary       = %q{Markov chains for all your friends~}
  spec.description   = %q{Markov chains for all your friends~}
  spec.homepage      = ""
  # Contents: everything tracked by git, with bin/ scripts as executables
  spec.files         = `git ls-files`.split($\)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]
  # Runtime dependencies, all without version constraints
  %w[
    minitest
    twitter
    tweetstream
    rufus-scheduler
    gingerice
    htmlentities
    engtagger
    fast-stemmer
    highscore
  ].each do |dependency|
    spec.add_runtime_dependency dependency
  end
 end
		`@ -0,0 +1,2 @@`
							`#!/usr/bin/env rake`
							`require "bundler/gem_tasks"`