Allow consumption of JSON archives

This commit is contained in:
Mispy 2013-11-27 05:12:54 -08:00
parent acc2f42b38
commit 306c9ab873
2 changed files with 35 additions and 25 deletions

View file

@ -73,28 +73,30 @@ module Ebooks
bot.tweet(statement) bot.tweet(statement)
end end
def self.jsonify(old_path, new_path) def self.jsonify(paths)
name = File.basename(old_path).split('.')[0] paths.each do |path|
new_path ||= name + ".json" name = File.basename(path).split('.')[0]
new_path = name + ".json"
tweets = [] tweets = []
id = nil id = nil
File.read(old_path).split("\n").each do |l| File.read(path).split("\n").each do |l|
if l.start_with?('# ') if l.start_with?('# ')
id = l.split('# ')[-1] id = l.split('# ')[-1]
else else
tweet = { text: l } tweet = { text: l }
if id if id
tweet[:id] = id tweet[:id] = id
id = nil id = nil
end
tweets << tweet
end end
tweets << tweet
end end
end
File.open(new_path, 'w') do |f| File.open(new_path, 'w') do |f|
log "Writing #{tweets.length} tweets to #{new_path}" log "Writing #{tweets.length} tweets to #{new_path}"
f.write(JSON.pretty_generate(tweets)) f.write(JSON.pretty_generate(tweets))
end
end end
end end
@ -106,7 +108,7 @@ module Ebooks
ebooks score <model_path> <input> ebooks score <model_path> <input>
ebooks archive <@user> <outpath> ebooks archive <@user> <outpath>
ebooks tweet <model_path> <@bot> ebooks tweet <model_path> <@bot>
ebooks jsonify <old_corpus_path> [new_corpus_path] ebooks jsonify <old_corpus_path> [...]
""" """
if args.length == 0 if args.length == 0
@ -121,7 +123,7 @@ module Ebooks
when "score" then score(args[1], args[2..-1].join(' ')) when "score" then score(args[1], args[2..-1].join(' '))
when "archive" then archive(args[1], args[2]) when "archive" then archive(args[1], args[2])
when "tweet" then tweet(args[1], args[2]) when "tweet" then tweet(args[1], args[2])
when "jsonify" then jsonify(args[1], args[2]) when "jsonify" then jsonify(args[1..-1])
end end
end end
end end

View file

@ -17,14 +17,22 @@ module Ebooks
Marshal.load(File.read(path)) Marshal.load(File.read(path))
end end
def consume(txtpath) def consume(path)
# Record hash of source file so we know to update later content = File.read(path)
@hash = Digest::MD5.hexdigest(File.read(txtpath)) @hash = Digest::MD5.hexdigest(content)
if path.split('.')[-1] == "json"
log "Reading json corpus from #{path}"
lines = JSON.parse(content, symbolize_names: true).map do |tweet|
tweet[:text]
end
else
log "Reading plaintext corpus from #{path}"
lines = content.split("\n")
end
text = File.read(txtpath)
log "Removing commented lines and sorting mentions" log "Removing commented lines and sorting mentions"
lines = text.split("\n")
keeping = [] keeping = []
mentions = [] mentions = []
lines.each do |l| lines.each do |l|