Allow consumption of JSON archives

This commit is contained in:
Mispy 2013-11-27 05:12:54 -08:00
parent acc2f42b38
commit 306c9ab873
2 changed files with 35 additions and 25 deletions

View file

@ -73,28 +73,30 @@ module Ebooks
bot.tweet(statement)
end
def self.jsonify(old_path, new_path)
name = File.basename(old_path).split('.')[0]
new_path ||= name + ".json"
def self.jsonify(paths)
paths.each do |path|
name = File.basename(path).split('.')[0]
new_path = name + ".json"
tweets = []
id = nil
File.read(old_path).split("\n").each do |l|
if l.start_with?('# ')
id = l.split('# ')[-1]
else
tweet = { text: l }
if id
tweet[:id] = id
id = nil
tweets = []
id = nil
File.read(path).split("\n").each do |l|
if l.start_with?('# ')
id = l.split('# ')[-1]
else
tweet = { text: l }
if id
tweet[:id] = id
id = nil
end
tweets << tweet
end
tweets << tweet
end
end
File.open(new_path, 'w') do |f|
log "Writing #{tweets.length} tweets to #{new_path}"
f.write(JSON.pretty_generate(tweets))
File.open(new_path, 'w') do |f|
log "Writing #{tweets.length} tweets to #{new_path}"
f.write(JSON.pretty_generate(tweets))
end
end
end
@ -106,7 +108,7 @@ module Ebooks
ebooks score <model_path> <input>
ebooks archive <@user> <outpath>
ebooks tweet <model_path> <@bot>
ebooks jsonify <old_corpus_path> [new_corpus_path]
ebooks jsonify <old_corpus_path> [...]
"""
if args.length == 0
@ -121,7 +123,7 @@ module Ebooks
when "score" then score(args[1], args[2..-1].join(' '))
when "archive" then archive(args[1], args[2])
when "tweet" then tweet(args[1], args[2])
when "jsonify" then jsonify(args[1], args[2])
when "jsonify" then jsonify(args[1..-1])
end
end
end

View file

@ -17,14 +17,22 @@ module Ebooks
Marshal.load(File.read(path))
end
def consume(txtpath)
# Record hash of source file so we know to update later
@hash = Digest::MD5.hexdigest(File.read(txtpath))
def consume(path)
content = File.read(path)
@hash = Digest::MD5.hexdigest(content)
if path.split('.')[-1] == "json"
log "Reading json corpus from #{path}"
lines = JSON.parse(content, symbolize_names: true).map do |tweet|
tweet[:text]
end
else
log "Reading plaintext corpus from #{path}"
lines = content.split("\n")
end
text = File.read(txtpath)
log "Removing commented lines and sorting mentions"
lines = text.split("\n")
keeping = []
mentions = []
lines.each do |l|