Handle edge-case corpuses with short sentences

This commit is contained in:
Jaiden Mispy 2016-01-21 12:51:33 -08:00
parent dbae6d3499
commit a272bd69ca
3 changed files with 18 additions and 4 deletions

View file

@@ -183,7 +183,7 @@ module Ebooks
 lines = nil; statements = nil; mentions = nil # Allow garbage collection
-log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
+log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"
 @sentences = mass_tikify(text)
 @mentions = mass_tikify(mention_text)
@@ -254,7 +254,7 @@ module Ebooks
 tweet = ""
 while (tikis = generator.generate(3, :bigrams)) do
-log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
+#log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
 break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit)
 retries += 1
@@ -262,7 +262,7 @@ module Ebooks
 end
 if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
-log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
+#log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
 while (tikis = generator.generate(3, :unigrams)) do
 break if valid_tweet?(tikis, limit) && !verbatim?(tikis)

View file

@@ -14,7 +14,7 @@ module Ebooks
 end
 def initialize(sentences)
-@sentences = sentences.reject { |s| s.length < 2 }
+@sentences = sentences
 @unigrams = {}
 @bigrams = {}

View file

@@ -70,5 +70,19 @@ describe Ebooks::Model do
 file.unlink
 end
+it 'handles strange unicode edge-cases' do
+file = Tempfile.new('unicode')
+file.write("💞\n💞")
+file.close
+model = Ebooks::Model.consume(file.path)
+expect(model.mentions.count).to eq 0
+expect(model.sentences.count).to eq 2
+file.unlink
+p model.make_statement
+end
 end
 end