Lots of documentation and cleanup
commit 1977445b1c
parent efde0fd16f
11 changed files with 237 additions and 178 deletions
@@ -12,31 +12,35 @@ module Ebooks
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time

    # Lazily loads an array of stopwords
    # Stopwords are common English words that should often be ignored
    # @return [Array<String>]
    def self.stopwords
      @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
    end

    # Lazily loads an array of known English nouns
    # @return [Array<String>]
    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    # Lazily loads an array of known English adjectives
    # @return [Array<String>]
    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    # POS tagger
    # Lazily load part-of-speech tagging library
    # This can determine whether a word is being used as a noun/adjective/verb
    # @return [EngTagger]
    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

    # Gingerice text correction service
    def self.gingerice
      require 'gingerice'
      Gingerice::Parser.new # No caching for this one
    end

    # For decoding html entities
    # Lazily load HTML entity decoder
    # @return [HTMLEntities]
    def self.htmlentities
      require 'htmlentities'
      @htmlentities ||= HTMLEntities.new
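Not part of the diff: a quick sketch of how these memoized accessors behave once the gem's data files and the optional engtagger/htmlentities dependencies are installed. The gem name and the return values shown are illustrative assumptions.

require 'twitter_ebooks' # assumed gem name for this codebase

# First call reads data/stopwords.txt; later calls reuse the memoized array
Ebooks::NLP.stopwords.first(3)              # => e.g. ["a", "about", "above"]

# engtagger is only loaded when the tagger is first requested
Ebooks::NLP.tagger.add_tags("the cat sat")  # => "<det>the</det> <nn>cat</nn> <vbd>sat</vbd>" (roughly)

# Entity decoding through the memoized HTMLEntities instance
Ebooks::NLP.htmlentities.decode("cats &amp; dogs")  # => "cats & dogs"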
@@ -44,7 +48,9 @@ module Ebooks

    ### Utility functions

    # We don't really want to deal with all this weird unicode punctuation
    # Normalize some strange unicode punctuation variants
    # @param text [String]
    # @return [String]
    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end
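A usage sketch (not in the commit) of the normalizer, combining the smart-quote and ellipsis substitutions with entity decoding:

Ebooks::NLP.normalize("“Don’t stop… &amp; go”")
# => "\"Don't stop... & go\""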
@@ -53,6 +59,8 @@ module Ebooks
    # We use ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    # @param text [String]
    # @return [Array<String>]
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end
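For illustration (not in the diff), the sentence splitter fires on newlines and on whitespace that follows '.', '?' or '!':

Ebooks::NLP.sentences("One fish. Two fish!\nRed fish? Blue fish")
# => ["One fish.", "Two fish!", "Red fish?", "Blue fish"]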
@@ -60,15 +68,23 @@ module Ebooks
    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    # @param sentence [String]
    # @return [Array<String>]
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
      sentence.split(regex)
    end

    # Get the 'stem' form of a word e.g. 'cats' -> 'cat'
    # @param word [String]
    # @return [String]
    def self.stem(word)
      Stemmer::stem_word(word.downcase)
    end

    # Use highscore gem to find interesting keywords in a corpus
    # @param text [String]
    # @return [Highscore::Keywords]
    def self.keywords(text)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
      text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
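A sketch (not in the diff) of the tokenizer and stemmer in use; the handling of '!' assumes that character is part of the PUNCTUATION constant defined elsewhere in the module:

Ebooks::NLP.tokenize("i love my cat! so much :)")
# => ["i", "love", "my", "cat", "!", "so", "much", ":)"]

Ebooks::NLP.stem("Cats")  # => "cat"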
@@ -90,7 +106,10 @@ module Ebooks
      text.keywords
    end

    # Takes a list of tokens and builds a nice-looking sentence
    # Builds a proper sentence from a list of tikis
    # @param tikis [Array<Integer>]
    # @param tokens [Array<String>]
    # @return [String]
    def self.reconstruct(tikis, tokens)
      text = ""
      last_token = nil
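Not from the commit: one way the keyword extraction might be consumed, assuming the elided middle of keywords wraps the filtered text in a Highscore::Content before calling text.keywords:

kw = Ebooks::NLP.keywords("ruby bots love writing strange ruby tweets")
kw.top(3).map(&:text)  # => e.g. ["ruby", "bots", "tweets"]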
@@ -105,6 +124,9 @@ module Ebooks
    end

    # Determine if we need to insert a space between two tokens
    # @param token1 [String]
    # @param token2 [String]
    # @return [Boolean]
    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)
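An illustrative sketch (not in the diff): reconstruct walks the tiki indexes, looks each token up, and asks space_between? before joining, so a run like this would plausibly render with no space before the punctuation. The output is an assumption, since the joining logic is elided in this hunk:

tokens = ["hello", ",", "world", "!"]
tikis  = [0, 1, 2, 3]
Ebooks::NLP.reconstruct(tikis, tokens)  # => "hello, world!" (assumed)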
@@ -119,10 +141,16 @@ module Ebooks
      end
    end

    # Is this token comprised of punctuation?
    # @param token [String]
    # @return [Boolean]
    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    # Is this token a stopword?
    # @param token [String]
    # @return [Boolean]
    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
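Quick checks (not part of the commit); they assume '?' and '!' appear in PUNCTUATION and that 'the' is listed in data/stopwords.txt:

Ebooks::NLP.punctuation?("?!")     # => true, every character is punctuation
Ebooks::NLP.punctuation?("well?")  # => false, contains letters
Ebooks::NLP.stopword?("The")       # => true, the lookup is case-insensitive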
@@ -130,7 +158,9 @@ module Ebooks

    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the markov generator; we can just tell it to retry
    # the generator; we can just tell it to retry
    # @param text [String]
    # @return [Boolean]
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
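A hedged sketch of the intended behaviour (the balancing loop itself is elided in this hunk, so the results are assumptions):

Ebooks::NLP.unmatched_enclosers?('she said "hello')      # => true, dangling quote
Ebooks::NLP.unmatched_enclosers?('(all balanced) "ok"')  # => false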
@@ -153,10 +183,13 @@ module Ebooks
    end

    # Determine if a2 is a subsequence of a1
    # @param a1 [Array]
    # @param a2 [Array]
    # @return [Boolean]
    def self.subseq?(a1, a2)
      a1.each_index.find do |i|
      !a1.each_index.find do |i|
        a1[i...i+a2.length] == a2
      end
      end.nil?
    end
  end
end
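Not part of the commit: subseq? treats a2 as a contiguous run inside a1, so for example:

Ebooks::NLP.subseq?([1, 2, 3, 4], [2, 3])  # => true
Ebooks::NLP.subseq?([1, 2, 3, 4], [3, 2])  # => false
Ebooks::NLP.subseq?(%w[a b c], [])         # => true, the empty array matches at index 0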