Handle edge-case corpuses with short sentences
This commit is contained in:
		
							parent
							
								
									dbae6d3499
								
							
						
					
					
						commit
						a272bd69ca
					
				
					 3 changed files with 18 additions and 4 deletions
				
			
		|  | @ -183,7 +183,7 @@ module Ebooks | ||||||
| 
 | 
 | ||||||
|       lines = nil; statements = nil; mentions = nil # Allow garbage collection |       lines = nil; statements = nil; mentions = nil # Allow garbage collection | ||||||
| 
 | 
 | ||||||
|       log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions" |       log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions" | ||||||
| 
 | 
 | ||||||
|       @sentences = mass_tikify(text) |       @sentences = mass_tikify(text) | ||||||
|       @mentions = mass_tikify(mention_text) |       @mentions = mass_tikify(mention_text) | ||||||
|  | @ -254,7 +254,7 @@ module Ebooks | ||||||
|       tweet = "" |       tweet = "" | ||||||
| 
 | 
 | ||||||
|       while (tikis = generator.generate(3, :bigrams)) do |       while (tikis = generator.generate(3, :bigrams)) do | ||||||
|         log "Attempting to produce tweet try #{retries+1}/#{retry_limit}" |         #log "Attempting to produce tweet try #{retries+1}/#{retry_limit}" | ||||||
|         break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit) |         break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit) | ||||||
| 
 | 
 | ||||||
|         retries += 1 |         retries += 1 | ||||||
|  | @ -262,7 +262,7 @@ module Ebooks | ||||||
|       end |       end | ||||||
| 
 | 
 | ||||||
|       if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident |       if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident | ||||||
|         log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}" |         #log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}" | ||||||
|         while (tikis = generator.generate(3, :unigrams)) do |         while (tikis = generator.generate(3, :unigrams)) do | ||||||
|           break if valid_tweet?(tikis, limit) && !verbatim?(tikis) |           break if valid_tweet?(tikis, limit) && !verbatim?(tikis) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -14,7 +14,7 @@ module Ebooks | ||||||
|     end |     end | ||||||
| 
 | 
 | ||||||
|     def initialize(sentences) |     def initialize(sentences) | ||||||
|       @sentences = sentences.reject { |s| s.length < 2 } |       @sentences = sentences | ||||||
|       @unigrams = {} |       @unigrams = {} | ||||||
|       @bigrams = {} |       @bigrams = {} | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -70,5 +70,19 @@ describe Ebooks::Model do | ||||||
| 
 | 
 | ||||||
|       file.unlink |       file.unlink | ||||||
|     end |     end | ||||||
|  | 
 | ||||||
|  |     it 'handles strange unicode edge-cases' do | ||||||
|  |       file = Tempfile.new('unicode') | ||||||
|  |       file.write("💞\n💞") | ||||||
|  |       file.close | ||||||
|  | 
 | ||||||
|  |       model = Ebooks::Model.consume(file.path) | ||||||
|  |       expect(model.mentions.count).to eq 0 | ||||||
|  |       expect(model.sentences.count).to eq 2 | ||||||
|  | 
 | ||||||
|  |       file.unlink | ||||||
|  | 
 | ||||||
|  |       p model.make_statement | ||||||
|  |     end | ||||||
|   end |   end | ||||||
| end | end | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Jaiden Mispy
						Jaiden Mispy