consume multiple corpuses

2014-10-29 18:56:37 +01:00 · 2014-10-29 18:56:37 +01:00 · 2698963fb1
commit 2698963fb1
parent 9731575a3d
2 changed files with 66 additions and 0 deletions
--- a/bin/ebooks
+++ b/bin/ebooks
@ -62,6 +62,32 @@ STR
    end
  end

+  # CLI handler for `ebooks consume-all`: merges every corpus in +paths+ into
+  # a single model saved at model/<name>.model under APP_PATH.
+  # Prints the usage text and exits when the name or the corpus list is missing.
+  #   name  - base name of the output model file
+  #   paths - corpus file paths given on the command line
+  def self.consume_all(name, paths)
+    usage = <<STR
+Usage: ebooks consume-all <name> <corpus_path> [corpus_path2] [...]
+
+Processes some number of text files or json tweet corpuses
+into one usable model. It will be output at model/<name>.model
+STR
+
+    # With no arguments at all the dispatcher's args[2..-1] is nil, not [],
+    # so guard against nil before asking for empty?.
+    if name.nil? || paths.nil? || paths.empty?
+      log usage
+      exit
+    end
+
+    outpath = File.join(APP_PATH, 'model', "#{name}.model")
+    # Model.consume_all pools the lines of all corpuses before building the model.
+    Model.consume_all(paths).save(outpath)
+    log "Corpuses consumed to #{outpath}"
+  end
+
  def self.gen(model_path, input)
    usage = <<STR
 Usage: ebooks gen <model_path> [input]
@ -187,6 +213,7 @@ STR
 Usage:
     ebooks new <reponame>
     ebooks consume <corpus_path> [corpus_path2] [...]
+     ebooks consume-all <name> <corpus_path> [corpus_path2] [...]
     ebooks gen <model_path> [input]
     ebooks score <model_path> <input>
     ebooks archive <@user> <outpath>
@ -202,6 +229,7 @@ STR
    case args[0]
    when "new" then new(args[1])
    when "consume" then consume(args[1..-1])
+    when "consume-all" then consume_all(args[1], args[2..-1])
    when "gen" then gen(args[1], args[2..-1].join(' '))
    when "score" then score(args[1], args[2..-1].join(' '))
    when "archive" then archive(args[1], args[2])
--- a/lib/twitter_ebooks/model.rb
+++ b/lib/twitter_ebooks/model.rb
@ -14,6 +14,10 @@ module Ebooks
      Model.new.consume(txtpath)
    end

+    def self.consume_all(paths) # class-level shortcut: consume all corpus files in paths into a new Model
+      Model.new.consume_all(paths) # delegates to the instance method of the same name and returns its result
+    end
+
    def self.load(path)
      model = Model.new
      model.instance_eval do
@ -87,6 +91,10 @@ module Ebooks
        lines = content.split("\n")
      end

+      consume_lines(lines)
+    end
+
+    def consume_lines(lines)
      log "Removing commented lines and sorting mentions"

      statements = []
@ -118,6 +126,36 @@ module Ebooks
      self
    end

+    # Reads every corpus in +paths+ (.json, .csv or plain text, chosen by file
+    # extension), pools their lines and feeds them to consume_lines in one go.
+    # @hash becomes the MD5 of all file contents in order, so it changes when
+    # any corpus changes (for a single path it equals that file's MD5).
+    # Returns whatever consume_lines returns.
+    def consume_all(paths)
+      digest = Digest::MD5.new
+      lines = paths.flat_map do |path|
+        content = File.read(path, :encoding => 'utf-8')
+        digest << content
+
+        case File.extname(path).downcase
+        when '.json'
+          log "Reading json corpus from #{path}"
+          JSON.parse(content).map { |tweet| tweet['text'] }
+        when '.csv'
+          log "Reading CSV corpus from #{path}"
+          header, *rows = CSV.parse(content)
+          text_col = header && header.index('text')
+          raise ArgumentError, "No 'text' column in #{path}" unless text_col
+          rows.map { |tweet| tweet[text_col] }
+        else
+          log "Reading plaintext corpus from #{path}"
+          content.split("\n")
+        end
+      end
+      @hash = digest.hexdigest
+      consume_lines(lines)
+    end
+
    def fix(tweet)
      # This seems to require an external api call
      #begin