2.1.3 - better archiver
This commit is contained in:
parent
c3053e5091
commit
acc2f42b38
7 changed files with 124 additions and 90 deletions
4
NOTES.md
4
NOTES.md
|
@ -1,4 +0,0 @@
|
|||
- Files in text/ are preprocessed by `rake consume` and serialized
|
||||
- e.g. text/foo.tweets becomes consumed/foo.corpus
|
||||
- `rake consume` looks at hashes to know which it needs to update
|
||||
- Preprocessed corpus files are loaded at runtime by Corpus.load('foo')
|
|
@ -1,4 +1,4 @@
|
|||
# twitter\_ebooks 2.1.2
|
||||
# twitter\_ebooks 2.1.3
|
||||
|
||||
Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.
|
||||
|
||||
|
|
29
bin/ebooks
29
bin/ebooks
|
@ -60,7 +60,7 @@ module Ebooks
|
|||
end
|
||||
|
||||
# Sync a local JSON archive of @username's tweets to outpath.
# NOTE: the diff scrape duplicated the pre-commit call to
# Archiver#fetch_tweets here; only the post-commit Archive#sync belongs.
def self.archive(username, outpath)
  Archive.new(username, outpath).sync
end
|
||||
|
||||
def self.tweet(modelpath, username)
|
||||
|
@ -73,6 +73,31 @@ module Ebooks
|
|||
bot.tweet(statement)
|
||||
end
|
||||
|
||||
# Convert a legacy plaintext corpus into a JSON array of tweet hashes.
# The old format is one tweet per line, optionally preceded by a
# "# <id>" marker line carrying that tweet's id.
#
# old_path - path to the plaintext corpus
# new_path - destination path; defaults to "<basename>.json" when nil
def self.jsonify(old_path, new_path)
  name = File.basename(old_path).split('.')[0]
  new_path ||= name + ".json"

  tweets = []
  pending_id = nil

  File.read(old_path).split("\n").each do |line|
    if line.start_with?('# ')
      # An id marker applies to the next text line we see.
      pending_id = line.split('# ')[-1]
      next
    end

    entry = { text: line }
    unless pending_id.nil?
      entry[:id] = pending_id
      pending_id = nil
    end
    tweets << entry
  end

  File.open(new_path, 'w') do |f|
    log "Writing #{tweets.length} tweets to #{new_path}"
    f.write(JSON.pretty_generate(tweets))
  end
end
|
||||
|
||||
def self.command(args)
|
||||
usage = """Usage:
|
||||
ebooks new <reponame>
|
||||
|
@ -81,6 +106,7 @@ module Ebooks
|
|||
ebooks score <model_path> <input>
|
||||
ebooks archive <@user> <outpath>
|
||||
ebooks tweet <model_path> <@bot>
|
||||
ebooks jsonify <old_corpus_path> [new_corpus_path]
|
||||
"""
|
||||
|
||||
if args.length == 0
|
||||
|
@ -95,6 +121,7 @@ module Ebooks
|
|||
when "score" then score(args[1], args[2..-1].join(' '))
|
||||
when "archive" then archive(args[1], args[2])
|
||||
when "tweet" then tweet(args[1], args[2])
|
||||
when "jsonify" then jsonify(args[1], args[2])
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -16,7 +16,7 @@ module Ebooks
|
|||
end
|
||||
|
||||
require 'twitter_ebooks/nlp'
|
||||
require 'twitter_ebooks/archiver'
|
||||
require 'twitter_ebooks/archive'
|
||||
require 'twitter_ebooks/markov'
|
||||
require 'twitter_ebooks/suffix'
|
||||
require 'twitter_ebooks/model'
|
||||
|
|
93
lib/twitter_ebooks/archive.rb
Normal file
93
lib/twitter_ebooks/archive.rb
Normal file
|
@ -0,0 +1,93 @@
|
|||
#!/usr/bin/env ruby
|
||||
# encoding: utf-8
|
||||
|
||||
require 'twitter'
|
||||
require 'json'
|
||||
|
||||
CONFIG_PATH = "/home/#{ENV['USER']}/.ebooksrc"
|
||||
|
||||
module Ebooks
  # Maintains a local JSON archive of a Twitter user's tweets and
  # incrementally tops it up with newer tweets from the REST API.
  class Archive
    # Array of tweet attribute hashes loaded from the archive file,
    # or nil when no archive exists yet.
    attr_reader :tweets

    # Build an authenticated Twitter client. Credentials are read from
    # CONFIG_PATH when present; otherwise the user is prompted and the
    # answers are cached there for subsequent runs.
    def make_client
      # File.exists? was removed in Ruby 3.2; File.exist? is the
      # long-standing canonical name.
      if File.exist?(CONFIG_PATH)
        @config = JSON.parse(File.read(CONFIG_PATH), symbolize_names: true)
      else
        @config = {}

        puts "As Twitter no longer allows anonymous API access, you'll need to enter the auth details of any account to use for archiving. These will be stored in #{CONFIG_PATH} if you need to change them later."
        print "Consumer key: "
        @config[:consumer_key] = STDIN.gets.chomp
        print "Consumer secret: "
        @config[:consumer_secret] = STDIN.gets.chomp
        print "Oauth token: "
        @config[:oauth_token] = STDIN.gets.chomp
        print "Oauth secret: "
        @config[:oauth_token_secret] = STDIN.gets.chomp

        # Persist the freshly entered credentials for next time.
        File.open(CONFIG_PATH, 'w') do |f|
          f.write(JSON.pretty_generate(@config))
        end
      end

      Twitter.configure do |config|
        config.consumer_key = @config[:consumer_key]
        config.consumer_secret = @config[:consumer_secret]
        config.oauth_token = @config[:oauth_token]
        config.oauth_token_secret = @config[:oauth_token_secret]
      end

      Twitter::Client.new
    end

    # username - handle to archive (without the leading @)
    # path     - archive file path; defaults to "<username>.json" when nil
    # client   - injectable Twitter client (useful for testing); built
    #            lazily via make_client when nil
    def initialize(username, path, client=nil)
      @username = username
      @path = path || "#{username}.json"
      @client = client || make_client

      if File.exist?(@path)
        @tweets = JSON.parse(File.read(@path), symbolize_names: true)
        log "Currently #{@tweets.length} tweets for #{@username}"
      else
        # BUGFIX: the original evaluated `@tweets.nil?` here — a no-op
        # predicate whose result was discarded. @tweets simply remains
        # nil for a brand-new archive.
        log "New archive for @#{username} at #{@path}"
      end
    end

    # Fetch tweets newer than the most recent one on record (all tweets
    # for a fresh archive), prepend them newest-first, and rewrite the
    # archive file.
    def sync
      tweets = []
      max_id = nil

      opts = {
        count: 200,
        #include_rts: false,
        trim_user: true
      }

      # Only request tweets newer than the newest one already archived.
      opts[:since_id] = @tweets[0][:id] unless @tweets.nil?

      # Page backwards through the timeline. The API re-returns the
      # tweet at max_id, so a page of <= 1 means we've run out.
      loop do
        opts[:max_id] = max_id unless max_id.nil?
        new = @client.user_timeline(@username, opts)
        break if new.length <= 1
        tweets += new
        puts "Received #{tweets.length} new tweets"
        max_id = new.last.id
      end

      if tweets.length == 0
        log "No new tweets"
      else
        @tweets ||= []
        # Drop the bulky :entities payload before persisting.
        @tweets = tweets.map(&:attrs).each { |tw|
          tw.delete(:entities)
        } + @tweets
        File.open(@path, 'w') do |f|
          f.write(JSON.pretty_generate(@tweets))
        end
      end
    end
  end
end
|
|
@ -1,82 +0,0 @@
|
|||
#!/usr/bin/env ruby
|
||||
# encoding: utf-8
|
||||
|
||||
require 'twitter'
|
||||
|
||||
module Ebooks
  # Legacy archiver: appends new tweets for a user to a plaintext corpus
  # file, recording the most recent tweet id in a leading "# <id>" line.
  class Archiver
    def initialize(username, outpath)
      @username = username
      @outpath = outpath
      @client = Twitter::Client.new
    end

    # Read existing corpus into memory.
    # Return list of tweet lines and the last tweet id.
    def read_corpus
      lines = []
      since_id = nil

      # File.exists? was removed in Ruby 3.2; use File.exist?.
      if File.exist?(@outpath)
        lines = File.read(@outpath).split("\n")
        # The first line holds the newest archived tweet id, e.g. "# 12345".
        if lines[0].start_with?('#')
          since_id = lines[0].split('# ').last
        end
      end

      [lines, since_id]
    end

    # Retrieve all available tweets for a given user since the last tweet id
    def tweets_since(since_id)
      tweets = []
      max_id = nil

      opts = {
        count: 200,
        include_rts: false,
        trim_user: true
      }

      opts[:since_id] = since_id unless since_id.nil?

      # Page backwards through the timeline. The API re-returns the tweet
      # at max_id, so a page of <= 1 means we've exhausted the history.
      # BUGFIX: the original ended this loop with an unconditional `break`,
      # so only the first page of (up to) 200 tweets was ever fetched.
      loop do
        opts[:max_id] = max_id unless max_id.nil?
        new = @client.user_timeline(@username, opts)
        break if new.length <= 1
        puts "Received #{new.length} tweets"
        tweets += new
        max_id = new.last.id
      end

      tweets
    end

    # Fetch new tweets and prepend them (newest first) to the corpus file.
    def fetch_tweets
      lines, since_id = read_corpus

      if since_id.nil?
        puts "Retrieving tweets from @#{@username}"
      else
        puts "Retrieving tweets from @#{@username} since #{since_id}"
      end

      tweets = tweets_since(since_id)

      if tweets.length == 0
        puts "No new tweets"
        return
      end

      new_lines = tweets.map { |tweet| tweet.text.gsub("\n", " ") }
      new_since_id = tweets[0].id.to_s
      lines = ["# " + new_since_id] + new_lines + lines
      # Block form guarantees the handle is closed even if write raises.
      File.open(@outpath, 'w') do |corpus|
        corpus.write(lines.join("\n"))
      end
    end
  end
end
|
|
@ -1,3 +1,3 @@
|
|||
module Ebooks
  # Gem version. The diff scrape duplicated both the old ("2.1.2") and
  # new assignments — a duplicate constant assignment that triggers an
  # "already initialized constant" warning; keep only the released value.
  VERSION = "2.1.3"
end
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue