Allow consumption of json archives
This commit is contained in:
parent
acc2f42b38
commit
306c9ab873
2 changed files with 35 additions and 25 deletions
42
bin/ebooks
42
bin/ebooks
|
@ -73,28 +73,30 @@ module Ebooks
|
||||||
bot.tweet(statement)
|
bot.tweet(statement)
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.jsonify(old_path, new_path)
|
def self.jsonify(paths)
|
||||||
name = File.basename(old_path).split('.')[0]
|
paths.each do |path|
|
||||||
new_path ||= name + ".json"
|
name = File.basename(path).split('.')[0]
|
||||||
|
new_path = name + ".json"
|
||||||
|
|
||||||
tweets = []
|
tweets = []
|
||||||
id = nil
|
id = nil
|
||||||
File.read(old_path).split("\n").each do |l|
|
File.read(path).split("\n").each do |l|
|
||||||
if l.start_with?('# ')
|
if l.start_with?('# ')
|
||||||
id = l.split('# ')[-1]
|
id = l.split('# ')[-1]
|
||||||
else
|
else
|
||||||
tweet = { text: l }
|
tweet = { text: l }
|
||||||
if id
|
if id
|
||||||
tweet[:id] = id
|
tweet[:id] = id
|
||||||
id = nil
|
id = nil
|
||||||
|
end
|
||||||
|
tweets << tweet
|
||||||
end
|
end
|
||||||
tweets << tweet
|
|
||||||
end
|
end
|
||||||
end
|
|
||||||
|
|
||||||
File.open(new_path, 'w') do |f|
|
File.open(new_path, 'w') do |f|
|
||||||
log "Writing #{tweets.length} tweets to #{new_path}"
|
log "Writing #{tweets.length} tweets to #{new_path}"
|
||||||
f.write(JSON.pretty_generate(tweets))
|
f.write(JSON.pretty_generate(tweets))
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -106,7 +108,7 @@ module Ebooks
|
||||||
ebooks score <model_path> <input>
|
ebooks score <model_path> <input>
|
||||||
ebooks archive <@user> <outpath>
|
ebooks archive <@user> <outpath>
|
||||||
ebooks tweet <model_path> <@bot>
|
ebooks tweet <model_path> <@bot>
|
||||||
ebooks jsonify <old_corpus_path> [new_corpus_path]
|
ebooks jsonify <old_corpus_path> [...]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if args.length == 0
|
if args.length == 0
|
||||||
|
@ -121,7 +123,7 @@ module Ebooks
|
||||||
when "score" then score(args[1], args[2..-1].join(' '))
|
when "score" then score(args[1], args[2..-1].join(' '))
|
||||||
when "archive" then archive(args[1], args[2])
|
when "archive" then archive(args[1], args[2])
|
||||||
when "tweet" then tweet(args[1], args[2])
|
when "tweet" then tweet(args[1], args[2])
|
||||||
when "jsonify" then jsonify(args[1], args[2])
|
when "jsonify" then jsonify(args[1..-1])
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -17,14 +17,22 @@ module Ebooks
|
||||||
Marshal.load(File.read(path))
|
Marshal.load(File.read(path))
|
||||||
end
|
end
|
||||||
|
|
||||||
def consume(txtpath)
|
def consume(path)
|
||||||
# Record hash of source file so we know to update later
|
content = File.read(path)
|
||||||
@hash = Digest::MD5.hexdigest(File.read(txtpath))
|
@hash = Digest::MD5.hexdigest(content)
|
||||||
|
|
||||||
|
if path.split('.')[-1] == "json"
|
||||||
|
log "Reading json corpus from #{path}"
|
||||||
|
lines = JSON.parse(content, symbolize_names: true).map do |tweet|
|
||||||
|
tweet[:text]
|
||||||
|
end
|
||||||
|
else
|
||||||
|
log "Reading plaintext corpus from #{path}"
|
||||||
|
lines = content.split("\n")
|
||||||
|
end
|
||||||
|
|
||||||
text = File.read(txtpath)
|
|
||||||
log "Removing commented lines and sorting mentions"
|
log "Removing commented lines and sorting mentions"
|
||||||
|
|
||||||
lines = text.split("\n")
|
|
||||||
keeping = []
|
keeping = []
|
||||||
mentions = []
|
mentions = []
|
||||||
lines.each do |l|
|
lines.each do |l|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue