Handle edge-case corpuses with short sentences

This commit is contained in:
Jaiden Mispy 2016-01-21 12:51:33 -08:00
parent dbae6d3499
commit a272bd69ca
3 changed files with 18 additions and 4 deletions

View file

@ -183,7 +183,7 @@ module Ebooks
lines = nil; statements = nil; mentions = nil # Allow garbage collection
log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"
@sentences = mass_tikify(text)
@mentions = mass_tikify(mention_text)
@ -254,7 +254,7 @@ module Ebooks
tweet = ""
while (tikis = generator.generate(3, :bigrams)) do
log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
#log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit)
retries += 1
@ -262,7 +262,7 @@ module Ebooks
end
if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
#log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
while (tikis = generator.generate(3, :unigrams)) do
break if valid_tweet?(tikis, limit) && !verbatim?(tikis)

View file

@ -14,7 +14,7 @@ module Ebooks
end
def initialize(sentences)
@sentences = sentences.reject { |s| s.length < 2 }
@sentences = sentences
@unigrams = {}
@bigrams = {}

View file

@ -70,5 +70,19 @@ describe Ebooks::Model do
file.unlink
end
# Regression spec: a corpus made up solely of two one-emoji lines should be
# consumed without error and still be usable for statement generation.
it 'handles strange unicode edge-cases' do
# Build a throwaway corpus file with two lines, each a single emoji.
file = Tempfile.new('unicode')
file.write("💞\n💞")
file.close
model = Ebooks::Model.consume(file.path)
# Neither line is expected to be classified as a mention; both are expected
# to be kept as sentences.
expect(model.mentions.count).to eq 0
expect(model.sentences.count).to eq 2
file.unlink
# No assertion is made on the generated text; it is only printed.
# NOTE(review): presumably this is a smoke check that make_statement returns
# on such a tiny corpus rather than raising — confirm intent.
p model.make_statement
end
end
end