From acc2f42b381c72fd7e289cf0a30204a7b579aa7e Mon Sep 17 00:00:00 2001 From: Mispy <^_^@mispy.me> Date: Sun, 24 Nov 2013 13:16:34 -0800 Subject: [PATCH] 2.1.3 - better archiver --- NOTES.md | 4 -- README.md | 2 +- bin/ebooks | 29 ++++++++++- lib/twitter_ebooks.rb | 2 +- lib/twitter_ebooks/archive.rb | 93 ++++++++++++++++++++++++++++++++++ lib/twitter_ebooks/archiver.rb | 82 ------------------------------ lib/twitter_ebooks/version.rb | 2 +- 7 files changed, 124 insertions(+), 90 deletions(-) delete mode 100755 NOTES.md create mode 100644 lib/twitter_ebooks/archive.rb delete mode 100755 lib/twitter_ebooks/archiver.rb diff --git a/NOTES.md b/NOTES.md deleted file mode 100755 index d6a50b3..0000000 --- a/NOTES.md +++ /dev/null @@ -1,4 +0,0 @@ -- Files in text/ are preprocessed by `rake consume` and serialized -- e.g. text/foo.tweets becomes consumed/foo.corpus -- `rake consume` looks at hashes to know which it needs to update -- Preprocessed corpus files are loaded at runtime by Corpus.load('foo') diff --git a/README.md b/README.md index ce702f8..5d79ea5 100755 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# twitter\_ebooks 2.1.2 +# twitter\_ebooks 2.1.3 Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting. diff --git a/bin/ebooks b/bin/ebooks index 3baee71..25b2237 100755 --- a/bin/ebooks +++ b/bin/ebooks @@ -60,7 +60,7 @@ module Ebooks end def self.archive(username, outpath) - Archiver.new(username, outpath).fetch_tweets + Archive.new(username, outpath).sync end def self.tweet(modelpath, username) @@ -73,6 +73,31 @@ module Ebooks bot.tweet(statement) end + def self.jsonify(old_path, new_path) + name = File.basename(old_path).split('.')[0] + new_path ||= name + ".json" + + tweets = [] + id = nil + File.read(old_path).split("\n").each do |l| + if l.start_with?('# ') + id = l.split('# ')[-1] + else + tweet = { text: l } + if id + tweet[:id] = id + id = nil + end + tweets << tweet + end + end + + File.open(new_path, 'w') do |f| + log "Writing #{tweets.length} tweets to #{new_path}" + f.write(JSON.pretty_generate(tweets)) + end + end + def self.command(args) usage = """Usage: ebooks new @@ -81,6 +106,7 @@ module Ebooks ebooks score ebooks archive <@user> ebooks tweet <@bot> + ebooks jsonify [new_corpus_path] """ if args.length == 0 @@ -95,6 +121,7 @@ module Ebooks when "score" then score(args[1], args[2..-1].join(' ')) when "archive" then archive(args[1], args[2]) when "tweet" then tweet(args[1], args[2]) + when "jsonify" then jsonify(args[1], args[2]) end end end diff --git a/lib/twitter_ebooks.rb b/lib/twitter_ebooks.rb index 7a47976..7334bac 100755 --- a/lib/twitter_ebooks.rb +++ b/lib/twitter_ebooks.rb @@ -16,7 +16,7 @@ module Ebooks end require 'twitter_ebooks/nlp' -require 'twitter_ebooks/archiver' +require 'twitter_ebooks/archive' require 'twitter_ebooks/markov' require 'twitter_ebooks/suffix' require 'twitter_ebooks/model' diff --git a/lib/twitter_ebooks/archive.rb b/lib/twitter_ebooks/archive.rb new file mode 100644 index 0000000..178be85 --- /dev/null +++ b/lib/twitter_ebooks/archive.rb @@ -0,0 +1,93 @@ +#!/usr/bin/env ruby +# encoding: utf-8 + +require 'twitter' +require 'json' + +CONFIG_PATH = "/home/#{ENV['USER']}/.ebooksrc" + +module Ebooks + class Archive + attr_reader :tweets + + def make_client + if File.exists?(CONFIG_PATH) + @config = JSON.parse(File.read(CONFIG_PATH), symbolize_names: true) + else + @config = {} + + puts "As Twitter no longer allows anonymous API access, you'll need to enter the auth details of any account to use for archiving. These will be stored in #{CONFIG_PATH} if you need to change them later." + print "Consumer key: " + @config[:consumer_key] = STDIN.gets.chomp + print "Consumer secret: " + @config[:consumer_secret] = STDIN.gets.chomp + print "Oauth token: " + @config[:oauth_token] = STDIN.gets.chomp + print "Oauth secret: " + @config[:oauth_token_secret] = STDIN.gets.chomp + + File.open(CONFIG_PATH, 'w') do |f| + f.write(JSON.pretty_generate(@config)) + end + end + + Twitter.configure do |config| + config.consumer_key = @config[:consumer_key] + config.consumer_secret = @config[:consumer_secret] + config.oauth_token = @config[:oauth_token] + config.oauth_token_secret = @config[:oauth_token_secret] + end + + Twitter::Client.new + end + + def initialize(username, path, client=nil) + @username = username + @path = path || "#{username}.json" + @client = client || make_client + + if File.exists?(@path) + @tweets = JSON.parse(File.read(@path), symbolize_names: true) + log "Currently #{@tweets.length} tweets for #{@username}" + else + @tweets.nil? + log "New archive for @#{username} at #{@path}" + end + end + + def sync + retries = 0 + tweets = [] + max_id = nil + + opts = { + count: 200, + #include_rts: false, + trim_user: true + } + + opts[:since_id] = @tweets[0][:id] unless @tweets.nil? + + loop do + opts[:max_id] = max_id unless max_id.nil? + new = @client.user_timeline(@username, opts) + break if new.length <= 1 + tweets += new + puts "Received #{tweets.length} new tweets" + max_id = new.last.id + end + + if tweets.length == 0 + log "No new tweets" + else + @tweets ||= [] + @tweets = tweets.map(&:attrs).each { |tw| + tw.delete(:entities) + } + @tweets + File.open(@path, 'w') do |f| + f.write(JSON.pretty_generate(@tweets)) + end + end + end + end +end diff --git a/lib/twitter_ebooks/archiver.rb b/lib/twitter_ebooks/archiver.rb deleted file mode 100755 index 93387fc..0000000 --- a/lib/twitter_ebooks/archiver.rb +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env ruby -# encoding: utf-8 - -require 'twitter' - -module Ebooks - class Archiver - def initialize(username, outpath) - @username = username - @outpath = outpath - @client = Twitter::Client.new - end - - # Read exiting corpus into memory. - # Return list of tweet lines and the last tweet id. - def read_corpus - lines = [] - since_id = nil - - if File.exists?(@outpath) - lines = File.read(@outpath).split("\n") - if lines[0].start_with?('#') - since_id = lines[0].split('# ').last - end - end - - [lines, since_id] - end - - # Retrieve all available tweets for a given user since the last tweet id - def tweets_since(since_id) - page = 1 - retries = 0 - tweets = [] - max_id = nil - - opts = { - count: 200, - include_rts: false, - trim_user: true - } - - opts[:since_id] = since_id unless since_id.nil? - - loop do - opts[:max_id] = max_id unless max_id.nil? - new = @client.user_timeline(@username, opts) - break if new.length <= 1 - puts "Received #{new.length} tweets" - tweets += new - max_id = new.last.id - break - end - - tweets - end - - def fetch_tweets - lines, since_id = read_corpus - - if since_id.nil? - puts "Retrieving tweets from @#{@username}" - else - puts "Retrieving tweets from @#{@username} since #{since_id}" - end - - tweets = tweets_since(since_id) - - if tweets.length == 0 - puts "No new tweets" - return - end - - new_lines = tweets.map { |tweet| tweet.text.gsub("\n", " ") } - new_since_id = tweets[0].id.to_s - lines = ["# " + new_since_id] + new_lines + lines - corpus = File.open(@outpath, 'w') - corpus.write(lines.join("\n")) - corpus.close - end - end -end diff --git a/lib/twitter_ebooks/version.rb b/lib/twitter_ebooks/version.rb index d2973b8..9668313 100755 --- a/lib/twitter_ebooks/version.rb +++ b/lib/twitter_ebooks/version.rb @@ -1,3 +1,3 @@ module Ebooks - VERSION = "2.1.2" + VERSION = "2.1.3" end