Allow consumption of JSON archives
This commit is contained in:
parent
acc2f42b38
commit
306c9ab873
2 changed files with 35 additions and 25 deletions
|
@ -17,14 +17,22 @@ module Ebooks
|
|||
Marshal.load(File.read(path))
|
||||
end
|
||||
|
||||
def consume(txtpath)
|
||||
# Record hash of source file so we know to update later
|
||||
@hash = Digest::MD5.hexdigest(File.read(txtpath))
|
||||
def consume(path)
|
||||
content = File.read(path)
|
||||
@hash = Digest::MD5.hexdigest(content)
|
||||
|
||||
if path.split('.')[-1] == "json"
|
||||
log "Reading json corpus from #{path}"
|
||||
lines = JSON.parse(content, symbolize_names: true).map do |tweet|
|
||||
tweet[:text]
|
||||
end
|
||||
else
|
||||
log "Reading plaintext corpus from #{path}"
|
||||
lines = content.split("\n")
|
||||
end
|
||||
|
||||
text = File.read(txtpath)
|
||||
log "Removing commented lines and sorting mentions"
|
||||
|
||||
lines = text.split("\n")
|
||||
keeping = []
|
||||
mentions = []
|
||||
lines.each do |l|
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue