consume multiple corpuses
This commit is contained in:
		
							parent
							
								
									9731575a3d
								
							
						
					
					
						commit
						2698963fb1
					
				
					 2 changed files with 66 additions and 0 deletions
				
			
		| 
						 | 
				
			
			@ -14,6 +14,10 @@ module Ebooks
 | 
			
		|||
      Model.new.consume(txtpath)
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    # Class-level convenience wrapper: instantiates a new Model and hands the
    # given corpus +paths+ to its instance-level #consume_all.
    #
    # Returns whatever the instance method returns.
    def self.consume_all(paths)
      model = Model.new
      model.consume_all(paths)
    end
 | 
			
		||||
 | 
			
		||||
    def self.load(path)
 | 
			
		||||
      model = Model.new
 | 
			
		||||
      model.instance_eval do
 | 
			
		||||
| 
						 | 
				
			
			@ -87,6 +91,10 @@ module Ebooks
 | 
			
		|||
        lines = content.split("\n")
 | 
			
		||||
      end
 | 
			
		||||
 | 
			
		||||
      consume_lines(lines)
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    def consume_lines(lines)
 | 
			
		||||
      log "Removing commented lines and sorting mentions"
 | 
			
		||||
 | 
			
		||||
      statements = []
 | 
			
		||||
| 
						 | 
				
			
			@ -118,6 +126,36 @@ module Ebooks
 | 
			
		|||
      self
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    # Reads several corpus files and consumes their combined lines as a
    # single corpus.
    #
    # The format of each file is chosen from its extension (case-insensitive):
    #   .json - parsed with JSON.parse; the 'text' value of every entry is used
    #           (assumes the document is an array of tweet-like objects)
    #   .csv  - parsed with CSV.parse; the first row is the header and the
    #           column named 'text' is used
    #   other - treated as plaintext, one line per "\n"
    #
    # @hash is set to the MD5 hex digest of the contents of *all* files, in
    # the order given, so a change to any of them changes the fingerprint.
    # For a single path this equals the digest of that one file.
    #
    # paths - an enumerable of file path strings.
    #
    # Returns the result of consume_lines on the collected lines.
    # Raises ArgumentError if a CSV file has no 'text' header column.
    def consume_all(paths)
      digest = Digest::MD5.new

      lines = paths.flat_map do |path|
        content = File.read(path, :encoding => 'utf-8')
        digest << content

        # File.extname returns "" for extensionless names, so a file literally
        # called "json" or "csv" is read as plaintext rather than parsed.
        case File.extname(path).downcase
        when '.json'
          log "Reading json corpus from #{path}"
          JSON.parse(content).map { |tweet| tweet['text'] }
        when '.csv'
          log "Reading CSV corpus from #{path}"
          header, *rows = CSV.parse(content)
          text_col = header && header.index('text')
          # Fail with a clear message instead of an opaque TypeError from row[nil].
          raise ArgumentError, "CSV corpus #{path} has no 'text' column" unless text_col
          rows.map { |row| row[text_col] }
        else
          log "Reading plaintext corpus from #{path}"
          content.split("\n")
        end
      end

      @hash = digest.hexdigest
      consume_lines(lines)
    end
 | 
			
		||||
 | 
			
		||||
    def fix(tweet)
 | 
			
		||||
      # This seems to require an external api call
 | 
			
		||||
      #begin
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue