consume multiple corpuses
This commit is contained in:
parent
9731575a3d
commit
2698963fb1
2 changed files with 66 additions and 0 deletions
|
@ -14,6 +14,10 @@ module Ebooks
|
|||
Model.new.consume(txtpath)
|
||||
end
|
||||
|
||||
# Class-level convenience: build a fresh Model and have it consume every
# corpus file listed in +paths+. Returns whatever the instance-level
# consume_all returns.
def self.consume_all(paths)
  model = Model.new
  model.consume_all(paths)
end
|
||||
|
||||
def self.load(path)
|
||||
model = Model.new
|
||||
model.instance_eval do
|
||||
|
@ -87,6 +91,10 @@ module Ebooks
|
|||
lines = content.split("\n")
|
||||
end
|
||||
|
||||
consume_lines(lines)
|
||||
end
|
||||
|
||||
def consume_lines(lines)
|
||||
log "Removing commented lines and sorting mentions"
|
||||
|
||||
statements = []
|
||||
|
@ -118,6 +126,36 @@ module Ebooks
|
|||
self
|
||||
end
|
||||
|
||||
# Read several corpus files, gather their text lines into one list and hand
# that list to consume_lines.
#
# The reader is chosen from the file extension (case-insensitive):
#   .json - a JSON array of objects; each object's 'text' value is one line
#   .csv  - a CSV file with a header row; the 'text' column supplies the lines
#   other - plaintext, split on newlines
#
# @hash is set to the MD5 hex digest of the contents of ALL files, in the
# order given, so a change to any of them changes the digest. For a single
# path this equals the MD5 of that file's content.
# NOTE(review): presumably @hash identifies the consumed corpus - confirm
# against the code that reads it.
#
# @param paths [Array<String>] corpus file paths
# @return the result of consume_lines
# @raise [ArgumentError] if a CSV corpus has a header without a 'text' column
def consume_all(paths)
  lines = []
  digest = Digest::MD5.new

  paths.each do |path|
    content = File.read(path, :encoding => 'utf-8')
    # Accumulate instead of overwriting, so every file contributes.
    digest.update(content)

    # File.extname only looks at the basename, so dots in directory names
    # and extension-less files are not mistaken for a known format.
    case File.extname(path).downcase
    when '.json'
      log "Reading json corpus from #{path}"
      lines.concat(JSON.parse(content).map { |tweet| tweet['text'] })
    when '.csv'
      log "Reading CSV corpus from #{path}"
      rows = CSV.parse(content)
      header = rows.shift
      # An empty CSV file has no header and contributes no lines.
      next unless header

      text_col = header.index('text')
      raise ArgumentError, "CSV corpus #{path} has no 'text' column" unless text_col

      lines.concat(rows.map { |row| row[text_col] })
    else
      log "Reading plaintext corpus from #{path}"
      lines.concat(content.split("\n"))
    end
  end

  @hash = digest.hexdigest
  consume_lines(lines)
end
|
||||
|
||||
def fix(tweet)
|
||||
# This seems to require an external api call
|
||||
#begin
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue