diff --git a/bin/ebooks b/bin/ebooks
index e1dbbad..a8cdbba 100755
--- a/bin/ebooks
+++ b/bin/ebooks
@@ -62,6 +62,32 @@ STR
     end
   end
 
+  def self.consume_all(name, paths)
+    usage = <<STR
+Usage: ebooks consume-all <model_name> <corpus_path> [corpus_path2] [...]
+
+Processes any number of text files or json tweet corpuses
+into one usable model. It will be output at model/<model_name>.model
+STR
+
+    if paths.empty?
+      log usage
+      exit
+    end
+
+    outpath = File.join(APP_PATH, 'model', "#{name}.model")
+    #paths.each do |path|
+    #  filename = File.basename(path)
+    #  shortname = filename.split('.')[0..-2].join('.')
+    #
+    #  outpath = File.join(APP_PATH, 'model', "#{shortname}.model")
+    #  Model.consume(path).save(outpath)
+    #  log "Corpus consumed to #{outpath}"
+    #end
+    Model.consume_all(paths).save(outpath)
+    log "Corpuses consumed to #{outpath}"
+  end
+
   def self.gen(model_path, input)
     usage = <<STR
 Usage: ebooks gen <model_path> [input]
@@ -187,6 +213,7 @@ STR
 Usage:
   ebooks new <reponame>
   ebooks consume <corpus_path> [corpus_path2] [...]
+  ebooks consume-all <model_name> <corpus_path> [corpus_path2] [...]
   ebooks gen <model_path> [input]
   ebooks score <model_path> <input>
   ebooks archive <@user>
@@ -202,6 +229,7 @@ STR
     case args[0]
     when "new" then new(args[1])
     when "consume" then consume(args[1..-1])
+    when "consume-all" then consume_all(args[1], args[2..-1])
     when "gen" then gen(args[1], args[2..-1].join(' '))
     when "score" then score(args[1], args[2..-1].join(' '))
     when "archive" then archive(args[1], args[2])
diff --git a/lib/twitter_ebooks/model.rb b/lib/twitter_ebooks/model.rb
index f3139e0..0f1bbad 100644
--- a/lib/twitter_ebooks/model.rb
+++ b/lib/twitter_ebooks/model.rb
@@ -14,6 +14,10 @@ module Ebooks
       Model.new.consume(txtpath)
     end
 
+    def self.consume_all(paths)
+      Model.new.consume_all(paths)
+    end
+
     def self.load(path)
       model = Model.new
       model.instance_eval do
@@ -87,6 +91,10 @@ module Ebooks
         lines = content.split("\n")
       end
 
+      consume_lines(lines)
+    end
+
+    def consume_lines(lines)
       log "Removing commented lines and sorting mentions"
       statements = []
@@ -118,6 +126,36 @@ module Ebooks
       self
     end
 
+    def consume_all(paths)
+      lines = []
+      paths.each do |path|
+        content = File.read(path, :encoding => 'utf-8')
+        @hash = Digest::MD5.hexdigest(content)
+
+        if path.split('.')[-1] == "json"
+          log "Reading json corpus from #{path}"
+          l = JSON.parse(content).map do |tweet|
+            tweet['text']
+          end
+          lines.concat(l)
+        elsif path.split('.')[-1] == "csv"
+          log "Reading CSV corpus from #{path}"
+          content = CSV.parse(content)
+          header = content.shift
+          text_col = header.index('text')
+          l = content.map do |tweet|
+            tweet[text_col]
+          end
+          lines.concat(l)
+        else
+          log "Reading plaintext corpus from #{path}"
+          l = content.split("\n")
+          lines.concat(l)
+        end
+      end
+      consume_lines(lines)
+    end
+
     def fix(tweet)
       # This seems to require an external api call
       #begin
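
For context, a minimal sketch of how the Model.consume_all API introduced by this patch could be exercised directly from Ruby. The corpus paths and the output filename below are illustrative assumptions, not part of the patch; only consume_all and save come from the code above.

    # Build one model from several corpora (json, csv and plaintext are all
    # accepted, per the patch) and save it where the CLI expects models to live.
    require 'twitter_ebooks'

    paths = ['corpus/alice.json', 'corpus/bob.csv', 'corpus/notes.txt']

    model = Ebooks::Model.consume_all(paths)   # class method added by this patch
    model.save('model/combined.model')         # same model/<name>.model layout the CLI uses

The CLI equivalent added in bin/ebooks would be `ebooks consume-all combined corpus/alice.json corpus/bob.csv corpus/notes.txt`, which writes the merged model to model/combined.model.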