Github time!
commit e87dc5862b
27 changed files with 20178 additions and 0 deletions

lib/twitter_ebooks/archiver.rb (new file, 82 lines)
@@ -0,0 +1,82 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter'

module Ebooks
  class Archiver
    def initialize(username, outpath)
      @username = username
      @outpath = outpath
      @client = Twitter::Client.new
    end

    # Read existing corpus into memory.
    # Return list of tweet lines and the last tweet id.
    def read_corpus
      lines = []
      since_id = nil

      if File.exists?(@outpath)
        lines = File.read(@outpath).split("\n")
        if lines[0].start_with?('#')
          since_id = lines[0].split('# ').last
        end
      end

      [lines, since_id]
    end

    # Retrieve all available tweets for a given user since the last tweet id
    def tweets_since(since_id)
      page = 1
      retries = 0
      tweets = []
      max_id = nil

      opts = {
        count: 200,
        include_rts: false,
        trim_user: true
      }

      opts[:since_id] = since_id unless since_id.nil?

      loop do
        opts[:max_id] = max_id unless max_id.nil?
        new = @client.user_timeline(@username, opts)
        break if new.length <= 1
        puts "Received #{new.length} tweets"
        tweets += new
        max_id = new.last.id
        break
      end

      tweets
    end

    def fetch_tweets
      lines, since_id = read_corpus

      if since_id.nil?
        puts "Retrieving tweets from @#{@username}"
      else
        puts "Retrieving tweets from @#{@username} since #{since_id}"
      end

      tweets = tweets_since(since_id)

      if tweets.length == 0
        puts "No new tweets"
        return
      end

      new_lines = tweets.map { |tweet| tweet.text.gsub("\n", " ") }
      new_since_id = tweets[0].id.to_s
      lines = ["# " + new_since_id] + new_lines + lines
      corpus = File.open(@outpath, 'w')
      corpus.write(lines.join("\n"))
      corpus.close
    end
  end
end
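
For a rough sense of how this class is meant to be driven, here's a minimal sketch. The credential setup, the corpus path, and the top-level require are assumptions on my part (the old twitter gem picks up global configuration for Twitter::Client.new), not something this file defines:

require 'twitter'
require 'twitter_ebooks'  # assumed gem entry point, not shown in this commit excerpt

# Assumed setup: Twitter::Client.new reads this global config, so the keys
# have to be in place before the archiver makes any API calls.
Twitter.configure do |config|
  config.consumer_key = ENV['CONSUMER_KEY']
  config.consumer_secret = ENV['CONSUMER_SECRET']
  config.oauth_token = ENV['OAUTH_TOKEN']
  config.oauth_token_secret = ENV['OAUTH_TOKEN_SECRET']
end

# Appends any new tweets to the corpus file, using the "# <since_id>" line
# written on a previous run to avoid re-fetching old tweets.
archiver = Ebooks::Archiver.new('some_user', 'corpus/some_user.txt')
archiver.fetch_tweets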

lib/twitter_ebooks/bot.rb (new file, 164 lines)
@@ -0,0 +1,164 @@
#!/usr/bin/env ruby
require 'twitter'
require 'tweetstream'
require 'rufus/scheduler'

module Ebooks
  class Bot
    attr_accessor :consumer_key, :consumer_secret,
                  :oauth_token, :oauth_token_secret

    attr_accessor :username

    attr_reader :twitter, :stream

    @@all = [] # List of all defined bots
    def self.all; @@all; end

    def self.get(name)
      all.find { |bot| bot.username == name }
    end

    def initialize(username, &b)
      # Set defaults
      @username = username

      # Override with callback
      b.call(self)

      Bot.all.push(self)
    end

    def log(*args)
      STDERR.puts "@#{@username}: " + args.map(&:to_s).join(' ')
      STDERR.flush
    end

    def configure
      TweetStream.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end

      Twitter.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end

      @twitter = Twitter::Client.new
      @stream = TweetStream::Client.new
    end

    # Connects to tweetstream and opens event handlers for this bot
    def start
      configure

      @on_startup.call if @on_startup

      @stream.on_error do |msg|
        log "ERROR: #{msg}"
      end

      @stream.on_inited do
        log "Online!"
      end

      @stream.on_event(:follow) do |event|
        next if event[:source][:screen_name] == @username
        log "Followed by #{event[:source][:screen_name]}"
        @on_follow.call(event[:source])
      end

      @stream.on_direct_message do |dm|
        next if dm[:sender][:screen_name] == @username # Don't reply to self
        log "DM from @#{dm[:sender][:screen_name]}: #{dm[:text]}"
        @on_message.call(dm)
      end

      @stream.userstream do |ev|
        next unless ev[:text] # If it's not a text-containing tweet, ignore it
        next if ev[:user][:screen_name] == @username # Ignore our own tweets

        meta = {}
        mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }

        reply_mentions = mentions.reject { |m| m.downcase == @username.downcase }
        reply_mentions = [ev[:user][:screen_name]] + reply_mentions

        meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
        meta[:limit] = 140 - meta[:reply_prefix].length

        mless = ev[:text]
        begin
          ev.attrs[:entities][:user_mentions].reverse.each do |entity|
            mless = mless[0...entity[:indices][0]] + mless[entity[:indices][1]...-1]
          end
        rescue Exception
          p ev.attrs[:entities][:user_mentions]
          p ev[:text]
          raise
        end
        meta[:mentionless] = mless

        # To check if this is a mention, ensure:
        # - The tweet mentions list contains our username
        # - The tweet is not being retweeted by somebody else
        # - Or soft-retweeted by somebody else
        if mentions.map(&:downcase).include?(@username.downcase) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
          log "Mention from @#{ev[:user][:screen_name]}: #{ev[:text]}"
          @on_mention.call(ev, meta)
        else
          @on_timeline.call(ev, meta)
        end
      end
    end

    # Wrapper for EM.add_timer
    # Delays add a greater sense of humanity to bot behaviour
    def delay(time, &b)
      time = time.to_a.sample unless time.is_a? Integer
      EM.add_timer(time, &b)
    end

    # Reply to a tweet or a DM.
    # Applies configurable @reply_delay range
    def reply(ev, text, opts={})
      opts = opts.clone
      delay = @reply_delay.to_a.sample

      if ev.is_a? Twitter::DirectMessage
        log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
        @twitter.direct_message_create(ev[:sender][:screen_name], text, opts)
      elsif ev.is_a? Twitter::Tweet
        log "Replying to @#{ev[:user][:screen_name]} with: #{text}"
        @twitter.update(text, in_reply_to_status_id: ev[:id])
      else
        raise Exception.new("Don't know how to reply to a #{ev.class}")
      end
    end

    def scheduler
      @scheduler ||= Rufus::Scheduler.new
    end

    def follow(*args)
      log "Following #{args}"
      @twitter.follow(*args)
    end

    def tweet(*args)
      log "Tweeting #{args.inspect}"
      @twitter.update(*args)
    end

    def on_startup(&b); @on_startup = b; end
    def on_follow(&b); @on_follow = b; end
    def on_mention(&b); @on_mention = b; end
    def on_timeline(&b); @on_timeline = b; end
    def on_message(&b); @on_message = b; end
  end
end
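
Judging from the accessors and the on_* hooks, a bot definition against this API would look roughly like the sketch below. The key names, the reply text, and how the EventMachine reactor gets started are my assumptions, not the library's:

require 'twitter_ebooks'

bot = Ebooks::Bot.new("my_ebooks_bot") do |b|
  b.consumer_key = ENV['CONSUMER_KEY']
  b.consumer_secret = ENV['CONSUMER_SECRET']
  b.oauth_token = ENV['OAUTH_TOKEN']
  b.oauth_token_secret = ENV['OAUTH_TOKEN_SECRET']

  b.on_mention do |tweet, meta|
    # meta[:reply_prefix] already holds the @names the reply should include
    b.reply(tweet, meta[:reply_prefix] + "hello!")
  end

  b.on_follow do |user|
    b.follow(user[:screen_name])
  end
end

# start blocks on the userstream; I'm assuming TweetStream manages the
# EventMachine reactor here, since nothing in this file starts one explicitly.
bot.start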

lib/twitter_ebooks/markov.rb (new file, 81 lines)
@@ -0,0 +1,81 @@
module Ebooks
  # The special INTERIM token represents sentence boundaries
  # This is so we can include start and end of statements in the model
  # Due to the way the sentence tokenizer works, it can correspond
  # to multiple actual parts of text (such as ^, $, \n and .?!)
  INTERIM = :interim

  # This is an ngram-based Markov model optimized to build from a
  # tokenized sentence list without requiring too much transformation
  class MarkovModel
    def self.build(sentences)
      MarkovModel.new.consume(sentences)
    end

    def consume(sentences)
      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
      # We map by both bigrams and unigrams so we can fall back to the latter in
      # cases where an input bigram is unavailable, such as starting a sentence
      @sentences = sentences
      @unigrams = {}
      @bigrams = {}

      sentences.each_with_index do |tokens, i|
        last_token = INTERIM
        tokens.each_with_index do |token, j|
          @unigrams[last_token] ||= []
          @unigrams[last_token] << [i, j]

          @bigrams[last_token] ||= {}
          @bigrams[last_token][token] ||= []

          if j == tokens.length-1 # Mark sentence endings
            @unigrams[token] ||= []
            @unigrams[token] << INTERIM
            @bigrams[last_token][token] << INTERIM
          else
            @bigrams[last_token][token] << [i, j+1]
          end

          last_token = token
        end
      end

      self
    end

    def find_token(index)
      if index == INTERIM
        INTERIM
      else
        @sentences[index[0]][index[1]]
      end
    end

    def chain(tokens)
      if tokens.length == 1
        matches = @unigrams[tokens[0]]
      else
        matches = @bigrams[tokens[-2]][tokens[-1]]
      end

      if matches.empty?
        # This should never happen unless a strange token is
        # supplied from outside the dataset
        raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
      end

      next_token = find_token(matches.sample)

      if next_token == INTERIM # We chose to end the sentence
        return tokens
      else
        return chain(tokens + [next_token])
      end
    end

    def generate
      NLP.reconstruct(chain([INTERIM]))
    end
  end
end
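
The model stands alone nicely, so here's a tiny sketch with hand-tokenized sentences standing in for NLP.tokenize output. The toy data and the example output are mine:

require 'twitter_ebooks'

sentences = [
  ["the", "cat", "sat", "on", "the", "mat", "."],
  ["the", "dog", "sat", "on", "the", "rug", "."]
]

model = Ebooks::MarkovModel.build(sentences)

# chain walks bigrams from a sentence boundary until it samples another
# boundary; generate does the same and then hands the tokens to NLP.reconstruct
tokens = model.chain([Ebooks::INTERIM])
p tokens
# => e.g. [:interim, "the", "cat", "sat", "on", "the", "rug", "."]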

lib/twitter_ebooks/model.rb (new file, 120 lines)
@@ -0,0 +1,120 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'json'
require 'set'
require 'digest/md5'

module Ebooks
  class Model
    attr_accessor :hash, :sentences, :markov, :keywords

    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end

    def self.load(path)
      Marshal.load(File.read(path))
    end

    def consume(txtpath)
      # Record hash of source file so we know to update later
      @hash = Digest::MD5.hexdigest(File.read(txtpath))

      text = File.read(txtpath)
      log "Removing commented lines and mention tokens"

      lines = text.split("\n")
      keeping = []
      lines.each do |l|
        next if l.start_with?('#') || l.include?('RT')
        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
        keeping << processed.join(' ')
      end
      text = NLP.normalize(keeping.join("\n"))

      log "Segmenting text into sentences"

      sentences = NLP.sentences(text)

      log "Tokenizing #{sentences.length} sentences"
      @sentences = sentences.map { |sent| NLP.tokenize(sent) }

      log "Ranking keywords"
      @keywords = NLP.keywords(@sentences)

      self
    end

    def save(path)
      File.open(path, 'w') do |f|
        f.write(Marshal.dump(self))
      end
      self
    end

    def fix(tweet)
      # This seems to require an external api call
      #begin
      #  fixer = NLP.gingerice.parse(tweet)
      #  log fixer if fixer['corrections']
      #  tweet = fixer['result']
      #rescue Exception => e
      #  log e.message
      #  log e.backtrace
      #end

      NLP.htmlentities.decode tweet
    end

    def markov_statement(limit=140, markov=nil)
      markov ||= MarkovModel.build(@sentences)
      tweet = ""

      while (tweet = markov.generate) do
        next if tweet.length > limit
        next if NLP.unmatched_enclosers?(tweet)
        break if tweet.length > limit*0.4 || rand > 0.8
      end

      fix tweet
    end

    # Finds all relevant tokenized sentences to given input by
    # comparing non-stopword token overlaps
    def relevant_sentences(input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input)

      @sentences.each do |sent|
        tokenized.each do |token|
          if sent.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller markov model from these
    def markov_response(input, limit=140)
      # First try
      relevant, slightly_relevant = relevant_sentences(input)

      if relevant.length >= 3
        markov = MarkovModel.new.consume(relevant)
        markov_statement(limit, markov)
      elsif slightly_relevant.length > 5
        markov = MarkovModel.new.consume(slightly_relevant)
        markov_statement(limit, markov)
      else
        markov_statement(limit)
      end
    end
  end
end
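
Putting model.rb together with the archiver output, the lifecycle looks roughly like this. The file paths and the 80-character reply limit are illustrative, not anything this commit prescribes:

require 'twitter_ebooks'

# Build a model from an archived corpus and persist it as a Marshal dump
model = Ebooks::Model.consume("corpus/some_user.txt")
model.save("model/some_user.model")

# Later, load it back and generate text
model = Ebooks::Model.load("model/some_user.model")
puts model.markov_statement(140)                 # free-form statement
puts model.markov_response("how are you?", 80)   # reply seeded by related sentences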

lib/twitter_ebooks/nlp.rb (new file, 154 lines)
@@ -0,0 +1,154 @@
# encoding: utf-8
require 'fast-stemmer'
require 'highscore'

module Ebooks
  module NLP
    # We deliberately limit our punctuation handling to stuff we can do consistently
    # It'll just be a part of another token if we don't split it out, and that's fine
    PUNCTUATION = ".?!,"

    # Lazy-load NLP libraries and resources
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time

    def self.stopwords
      @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
    end

    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    # POS tagger
    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

    # Gingerice text correction service
    def self.gingerice
      require 'gingerice'
      Gingerice::Parser.new # No caching for this one
    end

    # For decoding html entities
    def self.htmlentities
      require 'htmlentities'
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions

    # We don't really want to deal with all this weird unicode punctuation
    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end

    # Split text into sentences
    # We use an ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end

    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
      sentence.split(regex)
    end

    def self.stem(word)
      Stemmer::stem_word(word.downcase)
    end

    def self.keywords(sentences)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
      text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')

      text = Highscore::Content.new(text)

      text.configure do
        #set :multiplier, 2
        #set :upper_case, 3
        #set :long_words, 2
        #set :long_words_threshold, 15
        #set :vowels, 1                     # => default: 0 = not considered
        #set :consonants, 5                 # => default: 0 = not considered
        #set :ignore_case, true             # => default: false
        set :word_pattern, /(?<!@)(?<=\s)[\w']+/           # => default: /\w+/
        #set :stemming, true                # => default: false
      end

      text.keywords
    end

    # Takes a list of tokens and builds a nice-looking sentence
    def self.reconstruct(tokens)
      text = ""
      last_token = nil
      tokens.each do |token|
        next if token == INTERIM
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

    # Determine if we need to insert a space between two tokens
    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)
      if p1 && p2 # "foo?!"
        false
      elsif !p1 && p2 # "foo."
        false
      elsif p1 && !p2 # "foo. rah"
        true
      else # "foo rah"
        true
      end
    end

    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
    end

    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the markov generator; we can just tell it to retry
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0

        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)

          return true if opened < 0 # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end

      false
    end
  end
end
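
A quick sanity check of the ad hoc tokenizer and the reconstruction logic. Neither call touches the DATA_PATH word lists, so this runs without the data files; the sample string is mine:

require 'twitter_ebooks'

tokens = Ebooks::NLP.tokenize("this is a test, honestly!")
p tokens
# => ["this", "is", "a", "test", ",", "honestly", "!"]

puts Ebooks::NLP.reconstruct(tokens)
# => "this is a test, honestly!"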

lib/twitter_ebooks/version.rb (new file, 3 lines)
@@ -0,0 +1,3 @@
module Ebooks
  VERSION = "2.0.7"
end