2.1.3 - better archiver

This commit is contained in:
Mispy 2013-11-24 13:16:34 -08:00
parent c3053e5091
commit acc2f42b38
7 changed files with 124 additions and 90 deletions

View file

@ -1,4 +0,0 @@
- Files in text/ are preprocessed by `rake consume` and serialized
- e.g. text/foo.tweets becomes consumed/foo.corpus
- `rake consume` looks at hashes to know which it needs to update
- Preprocessed corpus files are loaded at runtime by Corpus.load('foo')

View file

@ -1,4 +1,4 @@
# twitter\_ebooks 2.1.2 # twitter\_ebooks 2.1.3
Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting. Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.

View file

@ -60,7 +60,7 @@ module Ebooks
end end
def self.archive(username, outpath) def self.archive(username, outpath)
Archiver.new(username, outpath).fetch_tweets Archive.new(username, outpath).sync
end end
def self.tweet(modelpath, username) def self.tweet(modelpath, username)
@ -73,6 +73,31 @@ module Ebooks
bot.tweet(statement) bot.tweet(statement)
end end
# Convert an old plain-text corpus into a JSON array of tweet hashes.
#
# Every line of +old_path+ becomes a `{ text: ... }` entry. A line that
# starts with "# " is not a tweet: the text after the last "# " is kept as
# an id and attached (as :id) to the next tweet line only.
#
# old_path - path of the plain-text corpus to read.
# new_path - path of the JSON file to write; when nil, "<name>.json" is used,
#            where <name> is the basename of old_path up to its first dot.
def self.jsonify(old_path, new_path)
  stem = File.basename(old_path).split('.')[0]
  new_path ||= stem + ".json"

  pending_id = nil
  entries = []

  File.read(old_path).split("\n").each do |line|
    # Marker line: remember the id for the tweet that follows.
    if line.start_with?('# ')
      pending_id = line.split('# ')[-1]
      next
    end

    entry = { text: line }
    if pending_id
      entry[:id] = pending_id
      pending_id = nil # an id belongs to a single tweet
    end
    entries << entry
  end

  File.open(new_path, 'w') do |out|
    log "Writing #{entries.length} tweets to #{new_path}"
    out.write(JSON.pretty_generate(entries))
  end
end
def self.command(args) def self.command(args)
usage = """Usage: usage = """Usage:
ebooks new <reponame> ebooks new <reponame>
@ -81,6 +106,7 @@ module Ebooks
ebooks score <model_path> <input> ebooks score <model_path> <input>
ebooks archive <@user> <outpath> ebooks archive <@user> <outpath>
ebooks tweet <model_path> <@bot> ebooks tweet <model_path> <@bot>
ebooks jsonify <old_corpus_path> [new_corpus_path]
""" """
if args.length == 0 if args.length == 0
@ -95,6 +121,7 @@ module Ebooks
when "score" then score(args[1], args[2..-1].join(' ')) when "score" then score(args[1], args[2..-1].join(' '))
when "archive" then archive(args[1], args[2]) when "archive" then archive(args[1], args[2])
when "tweet" then tweet(args[1], args[2]) when "tweet" then tweet(args[1], args[2])
when "jsonify" then jsonify(args[1], args[2])
end end
end end
end end

View file

@ -16,7 +16,7 @@ module Ebooks
end end
require 'twitter_ebooks/nlp' require 'twitter_ebooks/nlp'
require 'twitter_ebooks/archiver' require 'twitter_ebooks/archive'
require 'twitter_ebooks/markov' require 'twitter_ebooks/markov'
require 'twitter_ebooks/suffix' require 'twitter_ebooks/suffix'
require 'twitter_ebooks/model' require 'twitter_ebooks/model'

View file

@ -0,0 +1,93 @@
#!/usr/bin/env ruby
# encoding: utf-8
require 'twitter'
require 'json'
# Location of the JSON file holding the Twitter API credentials used for
# archiving. Resolved through Dir.home rather than a hard-coded "/home/$USER",
# so it also works where home directories live elsewhere (e.g. macOS, root)
# and when USER is not set in the environment.
CONFIG_PATH = File.join(Dir.home, ".ebooksrc")
module Ebooks
  # A local JSON archive of one Twitter user's timeline.
  #
  # The archive file is a JSON array of tweet attribute hashes, newest first.
  # #sync asks the client for tweets newer than the first stored tweet and
  # prepends whatever it receives.
  class Archive
    # Tweets currently held by the archive (array of hashes with symbol
    # keys), or nil when no archive file existed and nothing was synced yet.
    attr_reader :tweets

    # Build a Twitter client from the credentials in CONFIG_PATH.
    #
    # When the config file is missing, the four credentials are read from
    # STDIN and saved to CONFIG_PATH for later runs.
    def make_client
      if File.exist?(CONFIG_PATH)
        @config = JSON.parse(File.read(CONFIG_PATH), symbolize_names: true)
      else
        @config = {}

        puts "As Twitter no longer allows anonymous API access, you'll need to enter the auth details of any account to use for archiving. These will be stored in #{CONFIG_PATH} if you need to change them later."
        print "Consumer key: "
        @config[:consumer_key] = STDIN.gets.chomp
        print "Consumer secret: "
        @config[:consumer_secret] = STDIN.gets.chomp
        print "Oauth token: "
        @config[:oauth_token] = STDIN.gets.chomp
        print "Oauth secret: "
        @config[:oauth_token_secret] = STDIN.gets.chomp

        # The file holds API secrets: create it readable by the owner only.
        File.open(CONFIG_PATH, 'w', 0o600) do |f|
          f.write(JSON.pretty_generate(@config))
        end
      end

      Twitter.configure do |config|
        config.consumer_key = @config[:consumer_key]
        config.consumer_secret = @config[:consumer_secret]
        config.oauth_token = @config[:oauth_token]
        config.oauth_token_secret = @config[:oauth_token_secret]
      end

      Twitter::Client.new
    end

    # username - account whose timeline is archived.
    # path     - archive file; defaults to "<username>.json" when nil.
    # client   - object responding to user_timeline(username, opts); when nil
    #            a client is built with #make_client.
    def initialize(username, path, client=nil)
      @username = username
      @path = path || "#{username}.json"
      @client = client || make_client

      if File.exist?(@path)
        @tweets = JSON.parse(File.read(@path), symbolize_names: true)
        log "Currently #{@tweets.length} tweets for #{@username}"
      else
        @tweets = nil
        log "New archive for @#{username} at #{@path}"
      end
    end

    # Fetch every tweet newer than the newest archived one, prepend them to
    # the archive and rewrite the archive file. Nothing is written when no
    # new tweets are received.
    def sync
      fetched = []
      max_id = nil

      opts = {
        count: 200,
        #include_rts: false,
        trim_user: true
      }
      # Only constrain the request when there is a stored tweet with an id;
      # an empty archive or an id-less first entry means "fetch everything".
      newest = @tweets && @tweets.first
      opts[:since_id] = newest[:id] if newest && newest[:id]

      loop do
        opts[:max_id] = max_id unless max_id.nil?
        page = @client.user_timeline(@username, opts)
        break if page.empty?
        fetched.concat(page)
        puts "Received #{fetched.length} new tweets"
        # max_id is inclusive, so step just below the oldest tweet received;
        # otherwise the next page would start with that same tweet again.
        max_id = page.last.id - 1
      end

      if fetched.empty?
        log "No new tweets"
      else
        @tweets ||= []
        # :entities is stripped from each tweet before it is stored.
        @tweets = fetched.map(&:attrs).each { |tw|
          tw.delete(:entities)
        } + @tweets
        File.open(@path, 'w') do |f|
          f.write(JSON.pretty_generate(@tweets))
        end
      end
    end
  end
end

View file

@ -1,82 +0,0 @@
#!/usr/bin/env ruby
# encoding: utf-8
require 'twitter'
module Ebooks
  # Plain-text tweet archiver.
  #
  # The corpus file holds one tweet per line. Each fetch prepends a
  # "# <id>" marker line (id of the newest tweet fetched) followed by the
  # newly fetched tweets, newest first.
  class Archiver
    # username - account whose timeline is archived.
    # outpath  - corpus file to read from and write to.
    def initialize(username, outpath)
      @username = username
      @outpath = outpath
      @client = Twitter::Client.new
    end

    # Read existing corpus into memory.
    # Return list of tweet lines and the last tweet id.
    # The id is nil when the file is missing, empty, or does not start with
    # a "#" marker line.
    def read_corpus
      lines = []
      since_id = nil

      if File.exist?(@outpath)
        lines = File.read(@outpath).split("\n")
        header = lines.first # nil for an empty file
        since_id = header.split('# ').last if header && header.start_with?('#')
      end

      [lines, since_id]
    end

    # Retrieve all available tweets for a given user since the last tweet id
    # (or without a lower bound when since_id is nil), newest first.
    def tweets_since(since_id)
      tweets = []
      max_id = nil

      opts = {
        count: 200,
        include_rts: false,
        trim_user: true
      }
      opts[:since_id] = since_id unless since_id.nil?

      loop do
        opts[:max_id] = max_id unless max_id.nil?
        batch = @client.user_timeline(@username, opts)
        break if batch.empty?
        puts "Received #{batch.length} tweets"
        tweets.concat(batch)
        # max_id is inclusive, so continue just below the oldest tweet
        # received to avoid fetching it a second time.
        max_id = batch.last.id - 1
      end

      tweets
    end

    # Fetch tweets newer than the corpus' last recorded id and rewrite the
    # corpus with a new marker line and the new tweets on top. Newlines
    # inside a tweet are replaced by spaces so each tweet stays on one line.
    def fetch_tweets
      lines, since_id = read_corpus

      if since_id.nil?
        puts "Retrieving tweets from @#{@username}"
      else
        puts "Retrieving tweets from @#{@username} since #{since_id}"
      end

      tweets = tweets_since(since_id)

      if tweets.empty?
        puts "No new tweets"
        return
      end

      new_lines = tweets.map { |tweet| tweet.text.gsub("\n", " ") }
      new_since_id = tweets.first.id.to_s
      lines = ["# " + new_since_id] + new_lines + lines

      # Block form closes the file even if the write raises.
      File.open(@outpath, 'w') do |corpus|
        corpus.write(lines.join("\n"))
      end
    end
  end
end

View file

@ -1,3 +1,3 @@
module Ebooks module Ebooks
VERSION = "2.1.2" VERSION = "2.1.3"
end end