# encoding: utf-8
require 'set' # for Enumerable#to_set, used by punctuation? and stopword?
require 'fast-stemmer'
require 'highscore'
require 'htmlentities'

module Ebooks
  module NLP
    # We deliberately limit our punctuation handling to stuff we can do consistently
    # It'll just be a part of another token if we don't split it out, and that's fine
    PUNCTUATION = ".?!,"

    # Lazy-load NLP libraries and resources
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time

    # Lazily loads an array of stopwords
    # Stopwords are common words that should often be ignored
    # @return [Array<String>]
    def self.stopwords
      @stopwords ||= File.exist?('stopwords.txt') ? File.read('stopwords.txt').split : []
    end

    # Lazily loads an array of known English nouns
    # @return [Array<String>]
    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    # Lazily loads an array of known English adjectives
    # @return [Array<String>]
    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    # Lazily load part-of-speech tagging library
    # This can determine whether a word is being used as a noun/adjective/verb
    # @return [EngTagger]
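    # @example (illustrative, using engtagger's add_tags/get_nouns API)
    #   tagged = NLP.tagger.add_tags("the cat sat on the mat")
    #   NLP.tagger.get_nouns(tagged).keys # => ["cat", "mat"] (roughly)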
    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

    # Lazily load HTML entity decoder
    # @return [HTMLEntities]
    def self.htmlentities
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions

    # Normalize some strange unicode punctuation variants
    # @param text [String]
    # @return [String]
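    # @example (illustrative)
    #   NLP.normalize("“Hi” &amp; bye…") # => "\"Hi\" & bye..."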
    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end

    # Split text into sentences
    # We use an ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    # @param text [String]
    # @return [Array<String>]
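    # @example (illustrative)
    #   NLP.sentences("Hi there. How are you?\nFine.")
    #   # => ["Hi there.", "How are you?", "Fine."]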
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end

    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    # @param sentence [String]
    # @return [Array<String>]
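    # @example (illustrative; note that sentence-final punctuation with no
    #   trailing space stays attached to its word, which is fine for us)
    #   NLP.tokenize("i like cats! do you?")
    #   # => ["i", "like", "cats", "!", "do", "you?"]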
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
      sentence.split(regex)
    end

    # Get the 'stem' form of a word e.g. 'cats' -> 'cat'
    # @param word [String]
    # @return [String]
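    # @example (illustrative)
    #   NLP.stem("Cats") # => "cat"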
    def self.stem(word)
      Stemmer::stem_word(word.downcase)
    end

    # Use highscore gem to find interesting keywords in a corpus
    # @param text [String]
    # @return [Highscore::Keywords]
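    # @example (illustrative; corpus_text stands in for any String, and
    #   Highscore's Keywords#top returns the highest-weighted keywords)
    #   NLP.keywords(corpus_text).top(5).map(&:text)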
    def self.keywords(text)
      # Preprocess to remove stopwords (highscore's blacklist is very slow)
      text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')

      text = Highscore::Content.new(text)

      text.configure do
        #set :multiplier, 2
        #set :upper_case, 3
        #set :long_words, 2
        #set :long_words_threshold, 15
        #set :vowels, 1 # => default: 0 = not considered
        #set :consonants, 5 # => default: 0 = not considered
        #set :ignore_case, true # => default: false
        set :word_pattern, /(?<!@)(?<=\s)[\p{Word}']+/ # => default: /\w+/
        #set :stemming, true # => default: false
      end

      text.keywords
    end

    # Builds a proper sentence from a list of tikis (token indexes)
    # @param tikis [Array<Integer>]
    # @param tokens [Array<String>]
    # @return [String]
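    # @example (illustrative; INTERIM is the gem's sentence-boundary marker)
    #   NLP.reconstruct([0, 2, 1], ["foo", "!", "bar"]) # => "foo bar!"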
    def self.reconstruct(tikis, tokens)
      text = ""
      last_token = nil
      tikis.each do |tiki|
        next if tiki == INTERIM
        token = tokens[tiki]
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

    # Determine if we need to insert a space between two tokens
    # @param token1 [String]
    # @param token2 [String]
    # @return [Boolean]
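    # @example (illustrative)
    #   NLP.space_between?("foo", ".") # => false
    #   NLP.space_between?(".", "foo") # => true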
    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)
      if p1 && p2 # "foo?!"
        false
      elsif !p1 && p2 # "foo."
        false
      elsif p1 && !p2 # "foo. rah"
        true
      else # "foo rah"
        true
      end
    end

    # Is this token composed entirely of punctuation?
    # @param token [String]
    # @return [Boolean]
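    # @example (illustrative)
    #   NLP.punctuation?("?!")   # => true
    #   NLP.punctuation?("foo.") # => false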
    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    # Is this token a stopword?
    # @param token [String]
    # @return [Boolean]
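    # @example (illustrative; assumes "the" appears in stopwords.txt)
    #   NLP.stopword?("The") # => true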
    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
    end

    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the generator; we can just tell it to retry
    # @param text [String]
    # @return [Boolean]
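    # @example (illustrative)
    #   NLP.unmatched_enclosers?("a (lonely paren") # => true
    #   NLP.unmatched_enclosers?("all (good) here") # => false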
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0

        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)

          return true if opened < 0 # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end

      false
    end

    # Determine if a2 appears as a contiguous subsequence of a1
    # @param a1 [Array]
    # @param a2 [Array]
    # @return [Boolean]
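    # @example (illustrative)
    #   NLP.subseq?([1, 2, 3, 4], [2, 3]) # => true
    #   NLP.subseq?([1, 2, 3, 4], [3, 2]) # => false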
    def self.subseq?(a1, a2)
      !a1.each_index.find do |i|
        a1[i...i+a2.length] == a2
      end.nil?
    end
  end
end