Switch to using token indexes instead of strings
This commit is contained in:
parent
6ae1dd5dac
commit
3b1d6f856d
4 changed files with 79 additions and 49 deletions
|
@ -69,9 +69,9 @@ module Ebooks
|
|||
Stemmer::stem_word(word.downcase)
|
||||
end
|
||||
|
||||
def self.keywords(sentences)
|
||||
def self.keywords(text)
|
||||
# Preprocess to remove stopwords (highscore's blacklist is very slow)
|
||||
text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')
|
||||
text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
|
||||
|
||||
text = Highscore::Content.new(text)
|
||||
|
||||
|
@ -91,11 +91,12 @@ module Ebooks
|
|||
end
|
||||
|
||||
# Takes a list of token indexes (tikis) plus the tokens they index into, and builds a nice-looking sentence
|
||||
def self.reconstruct(tokens)
|
||||
def self.reconstruct(tikis, tokens)
|
||||
text = ""
|
||||
last_token = nil
|
||||
tokens.each do |token|
|
||||
next if token == INTERIM
|
||||
tikis.each do |tiki|
|
||||
next if tiki == INTERIM
|
||||
token = tokens[tiki]
|
||||
text += ' ' if last_token && space_between?(last_token, token)
|
||||
text += token
|
||||
last_token = token
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue