diff --git a/lib/twitter_ebooks.rb b/lib/twitter_ebooks.rb index 830fbcc..ea857f4 100644 --- a/lib/twitter_ebooks.rb +++ b/lib/twitter_ebooks.rb @@ -1,7 +1,7 @@ $debug = false def log(*args) - STDERR.puts args.map(&:to_s).join(' ') + STDERR.print args.map(&:to_s).join(' ') + "\n" STDERR.flush end diff --git a/lib/twitter_ebooks/bot.rb b/lib/twitter_ebooks/bot.rb index d2546ba..149968b 100755 --- a/lib/twitter_ebooks/bot.rb +++ b/lib/twitter_ebooks/bot.rb @@ -1,51 +1,8 @@ -#!/usr/bin/env ruby # encoding: utf-8 require 'twitter' require 'rufus/scheduler' -require 'eventmachine' module Ebooks - # Wrap SSLSocket so that readpartial yields the fiber instead of - # blocking when there is no data - # - # We hand this to the twitter library so we can select on the sockets - # and thus run multiple streams without them blocking - class FiberSSLSocket - def initialize(*args) - @socket = OpenSSL::SSL::SSLSocket.new(*args) - end - - def readpartial(maxlen) - data = "" - - loop do - begin - data = @socket.read_nonblock(maxlen) - rescue IO::WaitReadable - end - break if data.length > 0 - Fiber.yield(@socket) - end - - data - end - - def method_missing(m, *args) - @socket.send(m, *args) - end - end - - # An EventMachine handler which resumes a fiber on incoming data - class FiberSocketHandler < EventMachine::Connection - def initialize(fiber) - @fiber = fiber - end - - def notify_readable - @fiber.resume - end - end - class ConfigurationError < Exception end @@ -106,7 +63,7 @@ module Ebooks attr_accessor :consumer_key, :consumer_secret, :access_token, :access_token_secret - attr_reader :twitter, :stream + attr_reader :twitter, :stream, :thread # Configuration attr_accessor :username, :delay_range, :blacklist @@ -119,7 +76,7 @@ module Ebooks end def log(*args) - STDOUT.puts "@#{@username}: " + args.map(&:to_s).join(' ') + STDOUT.print "@#{@username}: " + args.map(&:to_s).join(' ') + "\n" STDOUT.flush end @@ -154,9 +111,7 @@ module Ebooks config.access_token_secret = @access_token_secret end - @stream = Twitter::Streaming::Client.new( - ssl_socket_class: FiberSSLSocket - ) do |config| + @stream = Twitter::Streaming::Client.new do |config| config.consumer_key = @consumer_key config.consumer_secret = @consumer_secret config.access_token = @access_token @@ -239,14 +194,13 @@ module Ebooks end def start_stream - log "starting stream for #@username" + log "starting tweet stream" @stream.user do |ev| receive_event ev end end - # Connects to tweetstream and opens event handlers for this bot - def start + def prepare # Sanity check if @username.nil? raise ConfigurationError, "bot.username cannot be nil" @@ -254,15 +208,11 @@ module Ebooks make_client fire(:startup) + end - fiber = Fiber.new do - start_stream - end - - socket = fiber.resume - - conn = EM.watch socket.io, FiberSocketHandler, fiber - conn.notify_readable = true + # Connects to tweetstream and opens event handlers for this bot + def start + start_stream end # Fire an event diff --git a/lib/twitter_ebooks/model.rb b/lib/twitter_ebooks/model.rb index 0f1bbad..fd5ff7d 100644 --- a/lib/twitter_ebooks/model.rb +++ b/lib/twitter_ebooks/model.rb @@ -4,14 +4,33 @@ require 'json' require 'set' require 'digest/md5' +require 'fileutils' require 'csv' module Ebooks class Model attr_accessor :hash, :tokens, :sentences, :mentions, :keywords - def self.consume(txtpath) - Model.new.consume(txtpath) + # Consume a corpus file to create a model + # @param corpus_path Path to a json, text or csv file to consume + # @param cache Optional path to a directory to store cached models + def self.consume(corpus_path, cache: nil) + if cache + FileUtils::mkdir_p cache + + cache_path = File.join(cache, Digest::MD5.file(corpus_path).to_s) + if File.exists?(cache_path) + log "Reading model from cache at #{cache_path}" + return Model.load(cache_path) + end + end + + model = Model.new.consume(corpus_path) + + if cache + log "Caching model at #{cache_path}" + model.save(cache_path) + end end def self.consume_all(paths)