consume multiple corpuses
commit 2698963fb1
parent 9731575a3d
2 changed files with 66 additions and 0 deletions

bin/ebooks  +28

@@ -62,6 +62,32 @@ STR
     end
   end
 
+  def self.consume_all(name, paths)
+    usage = <<STR
+Usage: ebooks consume-all <name> <corpus_path> [corpus_path2] [...]
+
+Processes some number of text files or json tweet corpuses
+into one usable model. It will be output at model/<name>.model
+STR
+
+    if paths.empty?
+      log usage
+      exit
+    end
+
+    outpath = File.join(APP_PATH, 'model', "#{name}.model")
+    #pathes.each do |path|
+    #  filename = File.basename(path)
+    #  shortname = filename.split('.')[0..-2].join('.')
+    #
+    #  outpath = File.join(APP_PATH, 'model', "#{shortname}.model")
+    #  Model.consume(path).save(outpath)
+    #  log "Corpus consumed to #{outpath}"
+    #end
+    Model.consume_all(paths).save(outpath)
+    log "Corpuses consumed to #{outpath}"
+  end
+
   def self.gen(model_path, input)
     usage = <<STR
 Usage: ebooks gen <model_path> [input]
@@ -187,6 +213,7 @@ STR
 Usage:
   ebooks new <reponame>
   ebooks consume <corpus_path> [corpus_path2] [...]
+  ebooks consume-all <corpus_path> [corpus_path2] [...]
   ebooks gen <model_path> [input]
   ebooks score <model_path> <input>
   ebooks archive <@user> <outpath>
@@ -202,6 +229,7 @@ STR
     case args[0]
     when "new" then new(args[1])
     when "consume" then consume(args[1..-1])
+    when "consume-all" then consume_all(args[1], args[2..-1])
     when "gen" then gen(args[1], args[2..-1].join(' '))
     when "score" then score(args[1], args[2..-1].join(' '))
     when "archive" then archive(args[1], args[2])
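
For reference, the dispatch above reads the model name from args[1] and treats the remaining arguments as corpus paths (note that the summary usage line added to the help text omits the <name> argument that the consume-all handler expects). A rough sketch of the equivalent direct call, with a hypothetical model name and file names, assuming APP_PATH as defined elsewhere in bin/ebooks:

    # Equivalent of: ebooks consume-all mybot corpus1.json corpus2.txt
    # ("mybot", "corpus1.json" and "corpus2.txt" are hypothetical examples)
    name  = "mybot"
    paths = ["corpus1.json", "corpus2.txt"]
    outpath = File.join(APP_PATH, 'model', "#{name}.model")
    Ebooks::Model.consume_all(paths).save(outpath)   # writes model/mybot.model
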
Second changed file (the Ebooks::Model source):

@@ -14,6 +14,10 @@ module Ebooks
       Model.new.consume(txtpath)
     end
 
+    def self.consume_all(paths)
+      Model.new.consume_all(paths)
+    end
+
     def self.load(path)
       model = Model.new
       model.instance_eval do
@@ -87,6 +91,10 @@ module Ebooks
         lines = content.split("\n")
       end
 
+      consume_lines(lines)
+    end
+
+    def consume_lines(lines)
       log "Removing commented lines and sorting mentions"
 
       statements = []
@@ -118,6 +126,36 @@ module Ebooks
       self
     end
 
+    def consume_all(paths)
+      lines = []
+      paths.each do |path|
+        content = File.read(path, :encoding => 'utf-8')
+        @hash = Digest::MD5.hexdigest(content)
+
+        if path.split('.')[-1] == "json"
+          log "Reading json corpus from #{path}"
+          l = JSON.parse(content).map do |tweet|
+            tweet['text']
+          end
+          lines.concat(l)
+        elsif path.split('.')[-1] == "csv"
+          log "Reading CSV corpus from #{path}"
+          content = CSV.parse(content)
+          header = content.shift
+          text_col = header.index('text')
+          l = content.map do |tweet|
+            tweet[text_col]
+          end
+          lines.concat(l)
+        else
+          log "Reading plaintext corpus from #{path}"
+          l = content.split("\n")
+          lines.concat(l)
+        end
+      end
+      consume_lines(lines)
+    end
+
     def fix(tweet)
       # This seems to require an external api call
       #begin
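
As a usage sketch outside the commit itself, the new consume_all path can be driven directly from Ruby; the file extension decides whether each corpus is parsed as a JSON tweet archive, a CSV export with a 'text' column, or plain text, and everything is merged through consume_lines. The require name and file paths below are assumptions:

    require 'twitter_ebooks'   # assumed gem require; adjust to your setup

    # Hypothetical corpus files; the extension picks the parser in consume_all
    paths = ['archives/user.json', 'exports/tweets.csv', 'corpus/notes.txt']

    model = Ebooks::Model.consume_all(paths)   # parses and merges every corpus
    model.save('model/combined.model')         # same .model format as `consume`

Because consume_lines returns self (see the context lines above), the class-level wrapper chains with save exactly as bin/ebooks does.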