consume multiple corpuses
This commit is contained in:
parent 9731575a3d
commit 2698963fb1

2 changed files with 66 additions and 0 deletions

bin/ebooks: 28 additions
@@ -62,6 +62,32 @@ STR
    end
  end

  def self.consume_all(name, paths)
    usage = <<STR
Usage: ebooks consume-all <name> <corpus_path> [corpus_path2] [...]

Processes some number of text files or json tweet corpuses
into one usable model. It will be output at model/<name>.model
STR

    if paths.empty?
      log usage
      exit
    end

    outpath = File.join(APP_PATH, 'model', "#{name}.model")
    #pathes.each do |path|
    #  filename = File.basename(path)
    #  shortname = filename.split('.')[0..-2].join('.')
    #
    #  outpath = File.join(APP_PATH, 'model', "#{shortname}.model")
    #  Model.consume(path).save(outpath)
    #  log "Corpus consumed to #{outpath}"
    #end
    Model.consume_all(paths).save(outpath)
    log "Corpuses consumed to #{outpath}"
  end

  def self.gen(model_path, input)
    usage = <<STR
Usage: ebooks gen <model_path> [input]
@@ -187,6 +213,7 @@ STR
Usage:
     ebooks new <reponame>
     ebooks consume <corpus_path> [corpus_path2] [...]
     ebooks consume-all <name> <corpus_path> [corpus_path2] [...]
     ebooks gen <model_path> [input]
     ebooks score <model_path> <input>
     ebooks archive <@user> <outpath>
@@ -202,6 +229,7 @@ STR
    case args[0]
    when "new" then new(args[1])
    when "consume" then consume(args[1..-1])
    when "consume-all" then consume_all(args[1], args[2..-1])
    when "gen" then gen(args[1], args[2..-1].join(' '))
    when "score" then score(args[1], args[2..-1].join(' '))
    when "archive" then archive(args[1], args[2])
Second changed file:
@@ -14,6 +14,10 @@ module Ebooks
      Model.new.consume(txtpath)
    end

    def self.consume_all(paths)
      Model.new.consume_all(paths)
    end

    def self.load(path)
      model = Model.new
      model.instance_eval do
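The new class-level wrapper mirrors the existing Model.consume shorthand, so library callers never have to build the Model instance themselves. A small usage sketch, with hypothetical corpus paths and assuming the gem is loaded via require 'twitter_ebooks':

require 'twitter_ebooks'

corpora = ['archive/example.json', 'notes.txt']   # hypothetical files

model = Ebooks::Model.consume_all(corpora)        # == Ebooks::Model.new.consume_all(corpora)
model.save('model/combined.model')                # Model#save, as bin/ebooks does above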
@@ -87,6 +91,10 @@ module Ebooks
        lines = content.split("\n")
      end

      consume_lines(lines)
    end

    def consume_lines(lines)
      log "Removing commented lines and sorting mentions"

      statements = []
@@ -118,6 +126,36 @@ module Ebooks
      self
    end

    def consume_all(paths)
      lines = []
      paths.each do |path|
        content = File.read(path, :encoding => 'utf-8')
        @hash = Digest::MD5.hexdigest(content)

        if path.split('.')[-1] == "json"
          log "Reading json corpus from #{path}"
          l = JSON.parse(content).map do |tweet|
            tweet['text']
          end
          lines.concat(l)
        elsif path.split('.')[-1] == "csv"
          log "Reading CSV corpus from #{path}"
          content = CSV.parse(content)
          header = content.shift
          text_col = header.index('text')
          l = content.map do |tweet|
            tweet[text_col]
          end
          lines.concat(l)
        else
          log "Reading plaintext corpus from #{path}"
          l = content.split("\n")
          lines.concat(l)
        end
      end
      consume_lines(lines)
    end

    def fix(tweet)
      # This seems to require an external api call
      #begin
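The format detection above keys on the raw file extension via path.split('.')[-1]. Purely as an illustration of the same rules, not as part of this commit, the per-file parsing could be pulled into a helper keyed on File.extname; the helper name is hypothetical.

require 'json'
require 'csv'

# Hypothetical helper: returns the lines of one corpus file, using the same
# json / csv / plaintext branching as Model#consume_all above.
def lines_from_corpus(path)
  content = File.read(path, :encoding => 'utf-8')
  case File.extname(path)
  when '.json'
    JSON.parse(content).map { |tweet| tweet['text'] }
  when '.csv'
    rows = CSV.parse(content)
    text_col = rows.shift.index('text')
    rows.map { |row| row[text_col] }
  else
    content.split("\n")
  end
end

# consume_all would then reduce to:
#   consume_lines(paths.flat_map { |path| lines_from_corpus(path) })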