Retry limit and mention separation
parent c1d91d1693
commit 61c5caee4d

5 changed files with 65 additions and 21 deletions

@@ -7,7 +7,7 @@ require 'digest/md5'
 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :generator, :keywords
+    attr_accessor :hash, :sentences, :mentions, :keywords
 
     def self.consume(txtpath)
       Model.new.consume(txtpath)
@@ -22,23 +22,44 @@ module Ebooks
       @hash = Digest::MD5.hexdigest(File.read(txtpath))
 
       text = File.read(txtpath)
-      log "Removing commented lines and mention tokens"
+      log "Removing commented lines and sorting mentions"
 
       lines = text.split("\n")
       keeping = []
+      mentions = []
       lines.each do |l|
-        next if l.start_with?('#') || l.include?('RT')
-        processed = l.split.reject { |w| w.include?('@') || w.include?('http') }
-        keeping << processed.join(' ')
+        next if l.start_with?('#') # Remove commented lines
+        next if l.include?('RT') || l.include?('MT') # Remove soft retweets
+
+        if l.include?('@')
+          mentions << l
+        else
+          keeping << l
+        end
       end
-      text = NLP.normalize(keeping.join("\n"))
+      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
+      mention_text = NLP.normalize(mentions.join("\n"))
 
       log "Segmenting text into sentences"
 
-      sentences = NLP.sentences(text)
+      statements = NLP.sentences(text)
+      mentions = NLP.sentences(mention_text)
 
-      log "Tokenizing #{sentences.length} sentences"
-      @sentences = sentences.map { |sent| NLP.tokenize(sent) }
+      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
+      @sentences = []
+      @mentions = []
+
+      statements.each do |s|
+        @sentences << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
+
+      mentions.each do |s|
+        @mentions << NLP.tokenize(s).reject do |t|
+          t.start_with?('@') || t.start_with?('http')
+        end
+      end
 
       log "Ranking keywords"
       @keywords = NLP.keywords(@sentences)
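
A quick sketch of how the reworked consume step behaves after this change: tweets containing '@' are tokenized into @mentions, while everything else feeds @sentences. The require path and corpus filename are illustrative assumptions, not part of this commit.

    require 'twitter_ebooks' # assumed entry point for the Ebooks module

    # "corpus.txt" is a hypothetical file with one tweet per line;
    # '#'-prefixed lines and RT/MT soft retweets are skipped during consume.
    model = Ebooks::Model.consume("corpus.txt")

    puts model.sentences.length # tokenized statements, mention/link tokens stripped
    puts model.mentions.length  # tokenized tweets that contained an '@'
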
@@ -72,38 +93,55 @@ module Ebooks
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end
 
-    def make_statement(limit=140, generator=nil)
+    def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
 
+      retries = 0
       tweet = ""
+
       while (tokens = generator.generate(3, :bigrams)) do
         next if tokens.length <= 3 && !responding
         break if valid_tweet?(tokens, limit)
+
+        retries += 1
+        break if retries >= retry_limit
       end
 
-      if @sentences.include?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
+      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
         while (tokens = generator.generate(3, :unigrams)) do
-          break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens)
+          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)
+
+          retries += 1
+          break if retries >= retry_limit
         end
       end
 
       tweet = NLP.reconstruct(tokens)
 
+      if retries >= retry_limit
+        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
+      end
+
       fix tweet
     end
 
+    # Test if a sentence has been copied verbatim from original
+    def verbatim?(tokens)
+      @sentences.include?(tokens) || @mentions.include?(tokens)
+    end
+
     # Finds all relevant tokenized sentences to given input by
     # comparing non-stopword token overlaps
-    def relevant_sentences(input)
+    def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []
 
-      tokenized = NLP.tokenize(input)
+      tokenized = NLP.tokenize(input).map(&:downcase)
 
-      @sentences.each do |sent|
+      sentences.each do |sent|
         tokenized.each do |token|
-          if sent.include?(token)
+          if sent.map(&:downcase).include?(token)
             relevant << sent unless NLP.stopword?(token)
             slightly_relevant << sent
           end
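
A sketch of the new retry budget in make_statement, assuming the setup from the earlier sketch: generation now gives up after retry_limit failed attempts (10 by default) and logs when it has to fall back to a possibly verbatim tweet.

    model = Ebooks::Model.consume("corpus.txt") # hypothetical corpus

    puts model.make_statement               # defaults: limit=140, retry_limit=10
    puts model.make_statement(100, nil, 20) # tighter length, larger retry budget
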
@@ -115,9 +153,9 @@ module Ebooks
 
     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
-    def make_response(input, limit=140)
-      # First try
-      relevant, slightly_relevant = relevant_sentences(input)
+    def make_response(input, limit=140, sentences=@mentions)
+      # Prefer mentions
+      relevant, slightly_relevant = find_relevant(sentences, input)
 
       if relevant.length >= 3
         generator = SuffixGenerator.build(relevant)
@@ -125,6 +163,8 @@ module Ebooks
       elsif slightly_relevant.length >= 5
         generator = SuffixGenerator.build(slightly_relevant)
         make_statement(limit, generator)
+      elsif sentences.equal?(@mentions)
+        make_response(input, limit, @sentences)
       else
         make_statement(limit)
       end
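
With these two hunks, make_response searches the mention corpus first and, when it finds too few relevant sentences there, recurses once over @sentences before falling back to a plain make_statement. A usage sketch; the input string is invented for illustration:

    model = Ebooks::Model.consume("corpus.txt") # hypothetical corpus

    # Tries @mentions first, then @sentences, then an unseeded statement.
    puts model.make_response("@bot what do you think about ruby?", 140)
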
@@ -44,7 +44,7 @@ module Ebooks
       verbatim = [tokens] # Verbatim sentences to avoid reproducing
 
       0.upto(passes-1) do
-        puts NLP.reconstruct(tokens)
+        log NLP.reconstruct(tokens) if $debug
         varsites = {} # Map bigram start site => next token alternatives
 
         tokens.each_with_index do |token, i|
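
The per-pass reconstruction output in the generator is now gated behind a global $debug flag rather than printed unconditionally; setting that flag before generating should restore the old verbosity. A sketch, assuming the same hypothetical corpus:

    $debug = true # re-enable per-pass logging inside SuffixGenerator

    model = Ebooks::Model.consume("corpus.txt")
    generator = Ebooks::SuffixGenerator.build(model.sentences)
    puts model.make_statement(140, generator)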