Github time!
commit e87dc5862b
27 changed files with 20178 additions and 0 deletions

lib/twitter_ebooks/archiver.rb (new file, 82 lines)
@@ -0,0 +1,82 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'twitter'

module Ebooks
  class Archiver
    def initialize(username, outpath)
      @username = username
      @outpath = outpath
      @client = Twitter::Client.new
    end

    # Read existing corpus into memory.
    # Return list of tweet lines and the last tweet id.
    def read_corpus
      lines = []
      since_id = nil

      if File.exists?(@outpath)
        lines = File.read(@outpath).split("\n")
        if lines[0].start_with?('#')
          since_id = lines[0].split('# ').last
        end
      end

      [lines, since_id]
    end

    # Retrieve all available tweets for a given user since the last tweet id
    def tweets_since(since_id)
      page = 1
      retries = 0
      tweets = []
      max_id = nil

      opts = {
        count: 200,
        include_rts: false,
        trim_user: true
      }

      opts[:since_id] = since_id unless since_id.nil?

      loop do
        opts[:max_id] = max_id unless max_id.nil?
        new = @client.user_timeline(@username, opts)
        break if new.length <= 1
        puts "Received #{new.length} tweets"
        tweets += new
        max_id = new.last.id
        break
      end

      tweets
    end

    def fetch_tweets
      lines, since_id = read_corpus

      if since_id.nil?
        puts "Retrieving tweets from @#{@username}"
      else
        puts "Retrieving tweets from @#{@username} since #{since_id}"
      end

      tweets = tweets_since(since_id)

      if tweets.length == 0
        puts "No new tweets"
        return
      end

      new_lines = tweets.map { |tweet| tweet.text.gsub("\n", " ") }
      new_since_id = tweets[0].id.to_s
      lines = ["# " + new_since_id] + new_lines + lines
      corpus = File.open(@outpath, 'w')
      corpus.write(lines.join("\n"))
      corpus.close
    end
  end
end
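
For a rough sense of how this class is meant to be driven, here's a minimal sketch. The credential setup, the corpus path, and the top-level require are assumptions on my part (the old twitter gem picks up global configuration for Twitter::Client.new), not something this file defines:

require 'twitter'
require 'twitter_ebooks'  # assumed gem entry point, not shown in this commit excerpt

# Assumed setup: Twitter::Client.new reads this global config, so the keys
# have to be in place before the archiver makes any API calls.
Twitter.configure do |config|
  config.consumer_key = ENV['CONSUMER_KEY']
  config.consumer_secret = ENV['CONSUMER_SECRET']
  config.oauth_token = ENV['OAUTH_TOKEN']
  config.oauth_token_secret = ENV['OAUTH_TOKEN_SECRET']
end

# Appends any new tweets to the corpus file, using the "# <since_id>" line
# written on a previous run to avoid re-fetching old tweets.
archiver = Ebooks::Archiver.new('some_user', 'corpus/some_user.txt')
archiver.fetch_tweets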

lib/twitter_ebooks/bot.rb (new file, 164 lines)
@@ -0,0 +1,164 @@
#!/usr/bin/env ruby
require 'twitter'
require 'tweetstream'
require 'rufus/scheduler'

module Ebooks
  class Bot
    attr_accessor :consumer_key, :consumer_secret,
                  :oauth_token, :oauth_token_secret

    attr_accessor :username

    attr_reader :twitter, :stream

    @@all = [] # List of all defined bots
    def self.all; @@all; end

    def self.get(name)
      all.find { |bot| bot.username == name }
    end

    def initialize(username, &b)
      # Set defaults
      @username = username

      # Override with callback
      b.call(self)

      Bot.all.push(self)
    end

    def log(*args)
      STDERR.puts "@#{@username}: " + args.map(&:to_s).join(' ')
      STDERR.flush
    end

    def configure
      TweetStream.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end

      Twitter.configure do |config|
        config.consumer_key = @consumer_key
        config.consumer_secret = @consumer_secret
        config.oauth_token = @oauth_token
        config.oauth_token_secret = @oauth_token_secret
      end

      @twitter = Twitter::Client.new
      @stream = TweetStream::Client.new
    end

    # Connects to tweetstream and opens event handlers for this bot
    def start
      configure

      @on_startup.call if @on_startup

      @stream.on_error do |msg|
        log "ERROR: #{msg}"
      end

      @stream.on_inited do
        log "Online!"
      end

      @stream.on_event(:follow) do |event|
        next if event[:source][:screen_name] == @username
        log "Followed by #{event[:source][:screen_name]}"
        @on_follow.call(event[:source])
      end

      @stream.on_direct_message do |dm|
        next if dm[:sender][:screen_name] == @username # Don't reply to self
        log "DM from @#{dm[:sender][:screen_name]}: #{dm[:text]}"
        @on_message.call(dm)
      end

      @stream.userstream do |ev|
        next unless ev[:text] # If it's not a text-containing tweet, ignore it
        next if ev[:user][:screen_name] == @username # Ignore our own tweets

        meta = {}
        mentions = ev.attrs[:entities][:user_mentions].map { |x| x[:screen_name] }

        reply_mentions = mentions.reject { |m| m.downcase == @username.downcase }
        reply_mentions = [ev[:user][:screen_name]] + reply_mentions

        meta[:reply_prefix] = reply_mentions.uniq.map { |m| '@'+m }.join(' ') + ' '
        meta[:limit] = 140 - meta[:reply_prefix].length

        mless = ev[:text]
        begin
          ev.attrs[:entities][:user_mentions].reverse.each do |entity|
            mless = mless[0...entity[:indices][0]] + mless[entity[:indices][1]...-1]
          end
        rescue Exception
          p ev.attrs[:entities][:user_mentions]
          p ev[:text]
          raise
        end
        meta[:mentionless] = mless

        # To check if this is a mention, ensure:
        # - The tweet mentions list contains our username
        # - The tweet is not being retweeted by somebody else
        # - Or soft-retweeted by somebody else
        if mentions.map(&:downcase).include?(@username.downcase) && !ev[:retweeted_status] && !ev[:text].start_with?('RT ')
          log "Mention from @#{ev[:user][:screen_name]}: #{ev[:text]}"
          @on_mention.call(ev, meta)
        else
          @on_timeline.call(ev, meta)
        end
      end
    end

    # Wrapper for EM.add_timer
    # Delays add a greater sense of humanity to bot behaviour
    def delay(time, &b)
      time = time.to_a.sample unless time.is_a? Integer
      EM.add_timer(time, &b)
    end

    # Reply to a tweet or a DM.
    # Applies configurable @reply_delay range
    def reply(ev, text, opts={})
      opts = opts.clone
      delay = @reply_delay.to_a.sample

      if ev.is_a? Twitter::DirectMessage
        log "Sending DM to @#{ev[:sender][:screen_name]}: #{text}"
        @twitter.direct_message_create(ev[:sender][:screen_name], text, opts)
      elsif ev.is_a? Twitter::Tweet
        log "Replying to @#{ev[:user][:screen_name]} with: #{text}"
        @twitter.update(text, in_reply_to_status_id: ev[:id])
      else
        raise Exception.new("Don't know how to reply to a #{ev.class}")
      end
    end

    def scheduler
      @scheduler ||= Rufus::Scheduler.new
    end

    def follow(*args)
      log "Following #{args}"
      @twitter.follow(*args)
    end

    def tweet(*args)
      log "Tweeting #{args.inspect}"
      @twitter.update(*args)
    end

    def on_startup(&b); @on_startup = b; end
    def on_follow(&b); @on_follow = b; end
    def on_mention(&b); @on_mention = b; end
    def on_timeline(&b); @on_timeline = b; end
    def on_message(&b); @on_message = b; end
  end
end
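
Judging from the accessors and the on_* hooks, a bot definition against this API would look roughly like the sketch below. The key names, the reply text, and how the EventMachine reactor gets started are my assumptions, not the library's:

require 'twitter_ebooks'

bot = Ebooks::Bot.new("my_ebooks_bot") do |b|
  b.consumer_key = ENV['CONSUMER_KEY']
  b.consumer_secret = ENV['CONSUMER_SECRET']
  b.oauth_token = ENV['OAUTH_TOKEN']
  b.oauth_token_secret = ENV['OAUTH_TOKEN_SECRET']

  b.on_mention do |tweet, meta|
    # meta[:reply_prefix] already holds the @names the reply should include
    b.reply(tweet, meta[:reply_prefix] + "hello!")
  end

  b.on_follow do |user|
    b.follow(user[:screen_name])
  end
end

# start blocks on the userstream; I'm assuming TweetStream manages the
# EventMachine reactor here, since nothing in this file starts one explicitly.
bot.start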

lib/twitter_ebooks/markov.rb (new file, 81 lines)
@@ -0,0 +1,81 @@
module Ebooks
  # The special INTERIM token represents sentence boundaries
  # This is so we can include start and end of statements in the model
  # Due to the way the sentence tokenizer works, it can correspond
  # to multiple actual parts of text (such as ^, $, \n and .?!)
  INTERIM = :interim

  # This is an ngram-based Markov model optimized to build from a
  # tokenized sentence list without requiring too much transformation
  class MarkovModel
    def self.build(sentences)
      MarkovModel.new.consume(sentences)
    end

    def consume(sentences)
      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
      # We map by both bigrams and unigrams so we can fall back to the latter in
      # cases where an input bigram is unavailable, such as starting a sentence
      @sentences = sentences
      @unigrams = {}
      @bigrams = {}

      sentences.each_with_index do |tokens, i|
        last_token = INTERIM
        tokens.each_with_index do |token, j|
          @unigrams[last_token] ||= []
          @unigrams[last_token] << [i, j]

          @bigrams[last_token] ||= {}
          @bigrams[last_token][token] ||= []

          if j == tokens.length-1 # Mark sentence endings
            @unigrams[token] ||= []
            @unigrams[token] << INTERIM
            @bigrams[last_token][token] << INTERIM
          else
            @bigrams[last_token][token] << [i, j+1]
          end

          last_token = token
        end
      end

      self
    end

    def find_token(index)
      if index == INTERIM
        INTERIM
      else
        @sentences[index[0]][index[1]]
      end
    end

    def chain(tokens)
      if tokens.length == 1
        matches = @unigrams[tokens[0]]
      else
        matches = @bigrams[tokens[-2]][tokens[-1]]
      end

      if matches.empty?
        # This should never happen unless a strange token is
        # supplied from outside the dataset
        raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
      end

      next_token = find_token(matches.sample)

      if next_token == INTERIM # We chose to end the sentence
        return tokens
      else
        return chain(tokens + [next_token])
      end
    end

    def generate
      NLP.reconstruct(chain([INTERIM]))
    end
  end
end
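
The model stands alone nicely, so here's a tiny sketch with hand-tokenized sentences standing in for NLP.tokenize output. The toy data and the example output are mine:

require 'twitter_ebooks'

sentences = [
  ["the", "cat", "sat", "on", "the", "mat", "."],
  ["the", "dog", "sat", "on", "the", "rug", "."]
]

model = Ebooks::MarkovModel.build(sentences)

# chain walks bigrams from a sentence boundary until it samples another
# boundary; generate does the same and then hands the tokens to NLP.reconstruct
tokens = model.chain([Ebooks::INTERIM])
p tokens
# => e.g. [:interim, "the", "cat", "sat", "on", "the", "rug", "."]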

lib/twitter_ebooks/model.rb (new file, 120 lines)
@@ -0,0 +1,120 @@
#!/usr/bin/env ruby
# encoding: utf-8

require 'json'
require 'set'
require 'digest/md5'

module Ebooks
  class Model
    attr_accessor :hash, :sentences, :markov, :keywords

    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end

    def self.load(path)
      Marshal.load(File.read(path))
    end

    def consume(txtpath)
      # Record hash of source file so we know to update later
      @hash = Digest::MD5.hexdigest(File.read(txtpath))

      text = File.read(txtpath)
      log "Removing commented lines and mention tokens"

      lines = text.split("\n")
      keeping = []
      lines.each do |l|
        next if l.start_with?('#') || l.include?('RT')
        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
        keeping << processed.join(' ')
      end
      text = NLP.normalize(keeping.join("\n"))

      log "Segmenting text into sentences"

      sentences = NLP.sentences(text)

      log "Tokenizing #{sentences.length} sentences"
      @sentences = sentences.map { |sent| NLP.tokenize(sent) }

      log "Ranking keywords"
      @keywords = NLP.keywords(@sentences)

      self
    end

    def save(path)
      File.open(path, 'w') do |f|
        f.write(Marshal.dump(self))
      end
      self
    end

    def fix(tweet)
      # This seems to require an external api call
      #begin
      #  fixer = NLP.gingerice.parse(tweet)
      #  log fixer if fixer['corrections']
      #  tweet = fixer['result']
      #rescue Exception => e
      #  log e.message
      #  log e.backtrace
      #end

      NLP.htmlentities.decode tweet
    end

    def markov_statement(limit=140, markov=nil)
      markov ||= MarkovModel.build(@sentences)
      tweet = ""

      while (tweet = markov.generate) do
        next if tweet.length > limit
        next if NLP.unmatched_enclosers?(tweet)
        break if tweet.length > limit*0.4 || rand > 0.8
      end

      fix tweet
    end

    # Finds all relevant tokenized sentences to given input by
    # comparing non-stopword token overlaps
    def relevant_sentences(input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input)

      @sentences.each do |sent|
        tokenized.each do |token|
          if sent.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller markov model from these
    def markov_response(input, limit=140)
      # First try
      relevant, slightly_relevant = relevant_sentences(input)

      if relevant.length >= 3
        markov = MarkovModel.new.consume(relevant)
        markov_statement(limit, markov)
      elsif slightly_relevant.length > 5
        markov = MarkovModel.new.consume(slightly_relevant)
        markov_statement(limit, markov)
      else
        markov_statement(limit)
      end
    end
  end
end
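
Putting model.rb together with the archiver output, the lifecycle looks roughly like this. The file paths and the 80-character reply limit are illustrative, not anything this commit prescribes:

require 'twitter_ebooks'

# Build a model from an archived corpus and persist it as a Marshal dump
model = Ebooks::Model.consume("corpus/some_user.txt")
model.save("model/some_user.model")

# Later, load it back and generate text
model = Ebooks::Model.load("model/some_user.model")
puts model.markov_statement(140)                 # free-form statement
puts model.markov_response("how are you?", 80)   # reply seeded by related sentences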

lib/twitter_ebooks/nlp.rb (new file, 154 lines)
@@ -0,0 +1,154 @@
# encoding: utf-8
require 'fast-stemmer'
require 'highscore'

module Ebooks
  module NLP
    # We deliberately limit our punctuation handling to stuff we can do consistently
    # It'll just be a part of another token if we don't split it out, and that's fine
    PUNCTUATION = ".?!,"

    # Lazy-load NLP libraries and resources
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time

    def self.stopwords
      @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
    end

    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    # POS tagger
    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

    # Gingerice text correction service
    def self.gingerice
      require 'gingerice'
      Gingerice::Parser.new # No caching for this one
    end

    # For decoding html entities
    def self.htmlentities
      require 'htmlentities'
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions

    # We don't really want to deal with all this weird unicode punctuation
    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end

    # Split text into sentences
    # We use an ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end

    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
      sentence.split(regex)
    end

    def self.stem(word)
      Stemmer::stem_word(word.downcase)
    end

    def self.keywords(sentences)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
      text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')

      text = Highscore::Content.new(text)

      text.configure do
        #set :multiplier, 2
        #set :upper_case, 3
        #set :long_words, 2
        #set :long_words_threshold, 15
        #set :vowels, 1                     # => default: 0 = not considered
        #set :consonants, 5                 # => default: 0 = not considered
        #set :ignore_case, true             # => default: false
        set :word_pattern, /(?<!@)(?<=\s)[\w']+/           # => default: /\w+/
        #set :stemming, true                # => default: false
      end

      text.keywords
    end

    # Takes a list of tokens and builds a nice-looking sentence
    def self.reconstruct(tokens)
      text = ""
      last_token = nil
      tokens.each do |token|
        next if token == INTERIM
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

    # Determine if we need to insert a space between two tokens
    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)
      if p1 && p2 # "foo?!"
        false
      elsif !p1 && p2 # "foo."
        false
      elsif p1 && !p2 # "foo. rah"
        true
      else # "foo rah"
        true
      end
    end

    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
    end

    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the markov generator; we can just tell it to retry
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0

        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)

          return true if opened < 0 # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end

      false
    end
  end
end
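
A quick sanity check of the ad hoc tokenizer and the reconstruction logic. Neither call touches the DATA_PATH word lists, so this runs without the data files; the sample string is mine:

require 'twitter_ebooks'

tokens = Ebooks::NLP.tokenize("this is a test, honestly!")
p tokens
# => ["this", "is", "a", "test", ",", "honestly", "!"]

puts Ebooks::NLP.reconstruct(tokens)
# => "this is a test, honestly!"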

lib/twitter_ebooks/version.rb (new file, 3 lines)
@@ -0,0 +1,3 @@
module Ebooks
  VERSION = "2.0.7"
end