Ran into an `Encoding::CompatibilityError` while trying to consume my corpus (tweets.csv) on Windows 7, though this likely affects other environments as well. Fix: force reading the corpus file contents as UTF-8. Also a quick clean-up of the CSV flow so the content is parsed only once instead of double-dipping.
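The gist of the change, as a minimal sketch (variable names mirror the file below; the `:encoding` option is the actual fix, and the note about the old read is my paraphrase):

```ruby
content = File.read(path, :encoding => 'utf-8') # previously read with the platform default encoding

# CSV clean-up: parse the string we already read, once, instead of re-reading the file
content = CSV.parse(content)
header = content.shift
text_col = header.index('text')
lines = content.map { |tweet| tweet[text_col] }
```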
#!/usr/bin/env ruby
# encoding: utf-8

require 'json'
require 'set'
require 'digest/md5'
require 'csv'

module Ebooks
  class Model
    attr_accessor :hash, :sentences, :mentions, :keywords

    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end

    def self.load(path)
      Marshal.load(File.open(path, 'rb') { |f| f.read })
    end

    def consume(path)
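      # Force UTF-8 regardless of the platform's default external encoding;
      # this is the fix for the Encoding::CompatibilityError hit on Windows 7.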
      content = File.read(path, :encoding => 'utf-8')
      @hash = Digest::MD5.hexdigest(content)

      if path.split('.')[-1] == "json"
        log "Reading json corpus from #{path}"
        lines = JSON.parse(content, symbolize_names: true).map do |tweet|
          tweet[:text]
        end
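      # The CSV branch below reuses the `content` string already read above,
      # so the corpus file is read and parsed just once (no more double-dipping).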
      elsif path.split('.')[-1] == "csv"
        log "Reading CSV corpus from #{path}"
        content = CSV.parse(content)
        header = content.shift
        text_col = header.index('text')
        lines = content.map do |tweet|
          tweet[text_col]
        end
      else
        log "Reading plaintext corpus from #{path}"
        lines = content.split("\n")
      end

      log "Removing commented lines and sorting mentions"

      keeping = []
      mentions = []
      lines.each do |l|
        next if l.start_with?('#') # Remove commented lines
        next if l.include?('RT') || l.include?('MT') # Remove soft retweets

        if l.include?('@')
          mentions << l
        else
          keeping << l
        end
      end
      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
      mention_text = NLP.normalize(mentions.join("\n"))

      log "Segmenting text into sentences"

      statements = NLP.sentences(text)
      mentions = NLP.sentences(mention_text)

      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
      @sentences = []
      @mentions = []

      statements.each do |s|
        @sentences << NLP.tokenize(s).reject do |t|
          t.include?('@') || t.include?('http')
        end
      end

      mentions.each do |s|
        @mentions << NLP.tokenize(s).reject do |t|
          t.include?('@') || t.include?('http')
        end
      end

      log "Ranking keywords"
      @keywords = NLP.keywords(@sentences)

      self
    end

    def save(path)
      File.open(path, 'wb') do |f|
        f.write(Marshal.dump(self))
      end
      self
    end

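    # Illustrative round-trip using save/load above (file names are examples):
    #   Ebooks::Model.consume('tweets.csv').save('tweets.model')
    #   model = Ebooks::Model.load('tweets.model')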
    def fix(tweet)
      # This seems to require an external api call
      #begin
      #  fixer = NLP.gingerice.parse(tweet)
      #  log fixer if fixer['corrections']
      #  tweet = fixer['result']
      #rescue Exception => e
      #  log e.message
      #  log e.backtrace
      #end

      NLP.htmlentities.decode tweet
    end

    def valid_tweet?(tokens, limit)
      tweet = NLP.reconstruct(tokens)
      tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
    end

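    # Generates via bigram walks first, falling back to unigram walks if the
    # result accidentally reproduces a corpus sentence verbatim.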
    def make_statement(limit=140, generator=nil, retry_limit=10)
      responding = !generator.nil?
      generator ||= SuffixGenerator.build(@sentences)

      retries = 0
      tweet = ""

      while (tokens = generator.generate(3, :bigrams)) do
        next if tokens.length <= 3 && !responding
        break if valid_tweet?(tokens, limit)

        retries += 1
        break if retries >= retry_limit
      end

      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
        while (tokens = generator.generate(3, :unigrams)) do
          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)

          retries += 1
          break if retries >= retry_limit
        end
      end

      tweet = NLP.reconstruct(tokens)

      if retries >= retry_limit
        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
      end

      fix tweet
    end

    # Test if a sentence has been copied verbatim from original
    def verbatim?(tokens)
      @sentences.include?(tokens) || @mentions.include?(tokens)
    end

    # Finds all relevant tokenized sentences to given input by
    # comparing non-stopword token overlaps
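    # (e.g. for input "ruby is great", a sentence containing "ruby" lands in
    # relevant, while one matching only the stopword "is" lands in slightly_relevant)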
    def find_relevant(sentences, input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input).map(&:downcase)

      sentences.each do |sent|
        tokenized.each do |token|
          if sent.map(&:downcase).include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller generator from these
    def make_response(input, limit=140, sentences=@mentions)
      # Prefer mentions
      relevant, slightly_relevant = find_relevant(sentences, input)

      if relevant.length >= 3
        generator = SuffixGenerator.build(relevant)
        make_statement(limit, generator)
      elsif slightly_relevant.length >= 5
        generator = SuffixGenerator.build(slightly_relevant)
        make_statement(limit, generator)
      elsif sentences.equal?(@mentions)
        make_response(input, limit, @sentences)
      else
        make_statement(limit)
      end
    end
  end
end
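For anyone who wants to verify the fix end to end, a minimal sketch (assumes a `tweets.csv` archive in the working directory; `NLP`, `SuffixGenerator`, and `log` come from elsewhere in the gem):

```ruby
model = Ebooks::Model.consume('tweets.csv') # no longer raises Encoding::CompatibilityError
puts model.make_statement            # random statement, at most 140 chars by default
puts model.make_response('hi there') # reply assembled from sentences relevant to the input
```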