2.1.3 - better archiver
This commit is contained in:
parent
c3053e5091
commit
acc2f42b38
7 changed files with 124 additions and 90 deletions
4
NOTES.md
4
NOTES.md
|
@ -1,4 +0,0 @@
|
||||||
- Files in text/ are preprocessed by `rake consume` and serialized
|
|
||||||
- e.g. text/foo.tweets becomes consumed/foo.corpus
|
|
||||||
- `rake consume` looks at hashes to know which it needs to update
|
|
||||||
- Preprocessed corpus files are loaded at runtime by Corpus.load('foo')
|
|
|
@ -1,4 +1,4 @@
|
||||||
# twitter\_ebooks 2.1.2
|
# twitter\_ebooks 2.1.3
|
||||||
|
|
||||||
Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.
|
Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.
|
||||||
|
|
||||||
|
|
29
bin/ebooks
29
bin/ebooks
|
@ -60,7 +60,7 @@ module Ebooks
|
||||||
end
|
end
|
||||||
|
|
||||||
# Bring the on-disk tweet archive for +username+ up to date.
#
# @param username [String] account whose timeline is archived
# @param outpath [String, nil] archive file path, handed to Archive.new
def self.archive(username, outpath)
  store = Archive.new(username, outpath)
  store.sync
end
|
||||||
|
|
||||||
def self.tweet(modelpath, username)
|
def self.tweet(modelpath, username)
|
||||||
|
@ -73,6 +73,31 @@ module Ebooks
|
||||||
bot.tweet(statement)
|
bot.tweet(statement)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Convert a legacy plain-text corpus into a JSON array of tweets.
#
# In the legacy format a line starting with "# " carries the id of the tweet
# on the following line; every other non-blank line is the text of one tweet.
# Ids are written to the JSON as strings, exactly as they appear in the file.
#
# @param old_path [String] path of the legacy corpus to read
# @param new_path [String, nil] destination path; when nil, the corpus
#   basename up to its first '.' plus ".json" is used
def self.jsonify(old_path, new_path)
  name = File.basename(old_path).split('.')[0]
  new_path ||= name + ".json"

  tweets = []
  id = nil
  File.read(old_path).each_line do |raw|
    # chomp also removes the "\r" of CRLF files, which split("\n") left behind
    l = raw.chomp
    # A blank line is not a tweet; do not emit an empty-text entry for it.
    next if l.empty?

    if l.start_with?('# ')
      # Everything after the marker is the id of the next tweet line.
      marker = l[2..-1]
      id = marker.empty? ? nil : marker
    else
      tweet = { text: l }
      if id
        tweet[:id] = id
        id = nil
      end
      tweets << tweet
    end
  end

  log "Writing #{tweets.length} tweets to #{new_path}"
  File.open(new_path, 'w') do |f|
    f.write(JSON.pretty_generate(tweets))
  end
end
|
||||||
|
|
||||||
def self.command(args)
|
def self.command(args)
|
||||||
usage = """Usage:
|
usage = """Usage:
|
||||||
ebooks new <reponame>
|
ebooks new <reponame>
|
||||||
|
@ -81,6 +106,7 @@ module Ebooks
|
||||||
ebooks score <model_path> <input>
|
ebooks score <model_path> <input>
|
||||||
ebooks archive <@user> <outpath>
|
ebooks archive <@user> <outpath>
|
||||||
ebooks tweet <model_path> <@bot>
|
ebooks tweet <model_path> <@bot>
|
||||||
|
ebooks jsonify <old_corpus_path> [new_corpus_path]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if args.length == 0
|
if args.length == 0
|
||||||
|
@ -95,6 +121,7 @@ module Ebooks
|
||||||
when "score" then score(args[1], args[2..-1].join(' '))
|
when "score" then score(args[1], args[2..-1].join(' '))
|
||||||
when "archive" then archive(args[1], args[2])
|
when "archive" then archive(args[1], args[2])
|
||||||
when "tweet" then tweet(args[1], args[2])
|
when "tweet" then tweet(args[1], args[2])
|
||||||
|
when "jsonify" then jsonify(args[1], args[2])
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -16,7 +16,7 @@ module Ebooks
|
||||||
end
|
end
|
||||||
|
|
||||||
require 'twitter_ebooks/nlp'
|
require 'twitter_ebooks/nlp'
|
||||||
require 'twitter_ebooks/archiver'
|
require 'twitter_ebooks/archive'
|
||||||
require 'twitter_ebooks/markov'
|
require 'twitter_ebooks/markov'
|
||||||
require 'twitter_ebooks/suffix'
|
require 'twitter_ebooks/suffix'
|
||||||
require 'twitter_ebooks/model'
|
require 'twitter_ebooks/model'
|
||||||
|
|
93
lib/twitter_ebooks/archive.rb
Normal file
93
lib/twitter_ebooks/archive.rb
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
#!/usr/bin/env ruby
|
||||||
|
# encoding: utf-8
|
||||||
|
|
||||||
|
require 'twitter'
|
||||||
|
require 'json'
|
||||||
|
|
||||||
|
# File in which the Twitter API credentials used for archiving are stored.
# Resolved from the current user's real home directory, so it also works
# where the home is not /home/<user> (root, macOS, relocated homes).
CONFIG_PATH = File.join(Dir.home, '.ebooksrc').freeze
|
||||||
|
|
||||||
|
module Ebooks
  # JSON-backed archive of one user's timeline.
  #
  # The archive file holds an array of tweet attribute hashes, newest first.
  # #sync downloads whatever is newer than the first stored tweet and
  # prepends it to the file.
  class Archive
    # @return [Array<Hash>, nil] archived tweets with symbol keys; nil when no
    #   archive file existed at construction and #sync has stored nothing yet
    attr_reader :tweets

    # Build a Twitter client from the credentials in CONFIG_PATH. When that
    # file is missing, the credentials are requested on STDIN and saved there.
    #
    # @return [Twitter::Client]
    def make_client
      @config = File.exist?(CONFIG_PATH) ? load_config : prompt_for_config

      settings = @config
      Twitter.configure do |config|
        config.consumer_key = settings[:consumer_key]
        config.consumer_secret = settings[:consumer_secret]
        config.oauth_token = settings[:oauth_token]
        config.oauth_token_secret = settings[:oauth_token_secret]
      end

      Twitter::Client.new
    end

    # @param username [String] account whose timeline is archived
    # @param path [String, nil] archive file; defaults to "<username>.json"
    # @param client [#user_timeline, nil] API client; built with #make_client
    #   when omitted
    def initialize(username, path, client=nil)
      @username = username
      @path = path || "#{username}.json"
      @client = client || make_client

      if File.exist?(@path)
        @tweets = JSON.parse(File.read(@path), symbolize_names: true)
        log "Currently #{@tweets.length} tweets for #{@username}"
      else
        @tweets = nil
        log "New archive for @#{username} at #{@path}"
      end
    end

    # Fetch every tweet newer than the newest archived one, page by page, and
    # rewrite the archive file with the new tweets in front of the old ones.
    # Nothing is written when no new tweets are found.
    def sync
      # include_rts is deliberately not sent; it was disabled in the options.
      opts = { count: 200, trim_user: true }

      # An empty archive, or a first entry without an id, gives no lower bound.
      newest = @tweets && @tweets.first
      opts[:since_id] = newest[:id] if newest && newest[:id]

      fetched = []
      loop do
        page = @client.user_timeline(@username, opts)
        break if page.empty?

        fetched += page
        puts "Received #{fetched.length} new tweets"
        # max_id is inclusive, so step below the oldest tweet of this page;
        # otherwise that tweet would come back again and be stored twice.
        opts[:max_id] = page.last.id - 1
      end

      if fetched.empty?
        log "No new tweets"
        return
      end

      fresh = fetched.map(&:attrs).each { |tw| tw.delete(:entities) }
      @tweets = fresh + (@tweets || [])
      File.open(@path, 'w') do |f|
        f.write(JSON.pretty_generate(@tweets))
      end
    end

    private

    # Read previously saved credentials from CONFIG_PATH.
    def load_config
      JSON.parse(File.read(CONFIG_PATH), symbolize_names: true)
    end

    # Ask for the four OAuth credentials on STDIN and save them to CONFIG_PATH.
    def prompt_for_config
      puts "As Twitter no longer allows anonymous API access, you'll need to enter the auth details of any account to use for archiving. These will be stored in #{CONFIG_PATH} if you need to change them later."

      prompts = [
        [:consumer_key, "Consumer key: "],
        [:consumer_secret, "Consumer secret: "],
        [:oauth_token, "Oauth token: "],
        [:oauth_token_secret, "Oauth secret: "]
      ]

      config = {}
      prompts.each do |key, label|
        print label
        config[key] = STDIN.gets.chomp
      end

      # The file holds API secrets, so create it readable by its owner only.
      File.open(CONFIG_PATH, 'w', 0600) do |f|
        f.write(JSON.pretty_generate(config))
      end

      config
    end
  end
end
|
|
@ -1,82 +0,0 @@
|
||||||
#!/usr/bin/env ruby
|
|
||||||
# encoding: utf-8
|
|
||||||
|
|
||||||
require 'twitter'
|
|
||||||
|
|
||||||
module Ebooks
  # Keeps a plain-text corpus of a user's tweets, newest first. A "# <id>"
  # line is written in front of each fetched batch, carrying the id of that
  # batch's newest tweet.
  class Archiver
    # @param username [String] account whose timeline is archived
    # @param outpath [String] corpus file to read and rewrite
    # @param client [#user_timeline, nil] API client; a Twitter::Client is
    #   created when omitted
    def initialize(username, outpath, client=nil)
      @username = username
      @outpath = outpath
      @client = client || Twitter::Client.new
    end

    # Read existing corpus into memory.
    # Return list of tweet lines and the last tweet id.
    def read_corpus
      return [[], nil] unless File.exist?(@outpath)

      lines = File.read(@outpath).split("\n")
      header = lines.first # nil for an empty file
      since_id = header.split('# ').last if header && header.start_with?('#')

      [lines, since_id]
    end

    # Retrieve all available tweets for a given user since the last tweet id
    #
    # @param since_id [String, nil] only tweets newer than this id are
    #   requested; nil requests the whole available timeline
    # @return [Array] fetched tweets, newest first
    def tweets_since(since_id)
      opts = {
        count: 200,
        include_rts: false,
        trim_user: true
      }
      opts[:since_id] = since_id unless since_id.nil?

      tweets = []
      loop do
        page = @client.user_timeline(@username, opts)
        break if page.empty?

        puts "Received #{page.length} tweets"
        tweets += page
        # max_id is inclusive: step below this page's oldest tweet so the
        # next request does not return it again.
        opts[:max_id] = page.last.id - 1
      end

      tweets
    end

    # Fetch new tweets and rewrite the corpus with them in front of the
    # existing lines. The file is left untouched when nothing new is found.
    def fetch_tweets
      lines, since_id = read_corpus

      if since_id.nil?
        puts "Retrieving tweets from @#{@username}"
      else
        puts "Retrieving tweets from @#{@username} since #{since_id}"
      end

      tweets = tweets_since(since_id)

      if tweets.empty?
        puts "No new tweets"
        return
      end

      # One tweet per line: embedded newlines would break the corpus format.
      new_lines = tweets.map { |tweet| tweet.text.gsub("\n", " ") }
      header = "# " + tweets.first.id.to_s

      File.open(@outpath, 'w') do |corpus|
        corpus.write(([header] + new_lines + lines).join("\n"))
      end
    end
  end
end
|
|
|
@ -1,3 +1,3 @@
|
||||||
module Ebooks
  # Release version of twitter_ebooks. Frozen so the shared constant cannot
  # be mutated in place.
  VERSION = "2.1.3".freeze
end
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue