Handle edge-case corpuses with short sentences
This commit is contained in:
		
							parent
							
								
									dbae6d3499
								
							
						
					
					
						commit
						a272bd69ca
					
				
					 3 changed files with 18 additions and 4 deletions
				
			
		|  | @@ -183,7 +183,7 @@ module Ebooks | |||
| 
 | ||||
|       lines = nil; statements = nil; mentions = nil # Allow garbage collection | ||||
| 
 | ||||
|-      log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions" | ||||
|+      log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions" | ||||
| 
 | ||||
|       @sentences = mass_tikify(text) | ||||
|       @mentions = mass_tikify(mention_text) | ||||
|  | @@ -254,7 +254,7 @@ module Ebooks | |||
|       tweet = "" | ||||
| 
 | ||||
|       while (tikis = generator.generate(3, :bigrams)) do | ||||
|-        log "Attempting to produce tweet try #{retries+1}/#{retry_limit}" | ||||
|+        #log "Attempting to produce tweet try #{retries+1}/#{retry_limit}" | ||||
|         break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit) | ||||
| 
 | ||||
|         retries += 1 | ||||
|  | @@ -262,7 +262,7 @@ module Ebooks | |||
|       end | ||||
| 
 | ||||
|       if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident | ||||
|-        log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}" | ||||
|+        #log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}" | ||||
|         while (tikis = generator.generate(3, :unigrams)) do | ||||
|           break if valid_tweet?(tikis, limit) && !verbatim?(tikis) | ||||
| 
 | ||||
|  |  | |||
|  | @@ -14,7 +14,7 @@ module Ebooks | |||
|     end | ||||
| 
 | ||||
|     def initialize(sentences) | ||||
|-      @sentences = sentences.reject { |s| s.length < 2 } | ||||
|+      @sentences = sentences | ||||
|       @unigrams = {} | ||||
|       @bigrams = {} | ||||
| 
 | ||||
|  |  | |||
|  | @@ -70,5 +70,19 @@ describe Ebooks::Model do | |||
| 
 | ||||
|       file.unlink | ||||
|     end | ||||
| 
 | ||||
|     it 'handles strange unicode edge-cases' do | ||||
|       file = Tempfile.new('unicode') | ||||
|       file.write("💞\n💞") | ||||
|       file.close | ||||
| 
 | ||||
|       model = Ebooks::Model.consume(file.path) | ||||
|       expect(model.mentions.count).to eq 0 | ||||
|       expect(model.sentences.count).to eq 2 | ||||
| 
 | ||||
|       file.unlink | ||||
| 
 | ||||
|       p model.make_statement | ||||
|     end | ||||
|   end | ||||
| end | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Jaiden Mispy
						Jaiden Mispy