Lots of documentation and cleanup

Jaiden Mispy 2014-12-05 21:12:39 +11:00
parent efde0fd16f
commit 1977445b1c
11 changed files with 237 additions and 178 deletions


@@ -12,31 +12,35 @@ module Ebooks
# Some of this stuff is pretty heavy and we don't necessarily need
# to be using it all of the time
# Lazily loads an array of stopwords
# Stopwords are common English words that should often be ignored
# @return [Array<String>]
def self.stopwords
@stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
end
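
A usage sketch (not part of this commit); membership depends on the bundled stopwords.txt:

    Ebooks::NLP.stopwords.include?('the')                 # => true, assuming 'the' is listed
    Ebooks::NLP.stopwords.equal?(Ebooks::NLP.stopwords)   # => true: ||= caches the array after the first read
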
# Lazily loads an array of known English nouns
# @return [Array<String>]
def self.nouns
@nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
end
# Lazily loads an array of known English adjectives
# @return [Array<String>]
def self.adjectives
@adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
end
# POS tagger
# Lazily load part-of-speech tagging library
# This can determine whether a word is being used as a noun/adjective/verb
# @return [EngTagger]
def self.tagger
require 'engtagger'
@tagger ||= EngTagger.new
end
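
For illustration, assuming the engtagger gem's add_tags interface (exact tag names vary by version):

    tagged = Ebooks::NLP.tagger.add_tags('the cat sat')
    # => something like "<det>the</det> <nn>cat</nn> <vbd>sat</vbd>"
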
# Gingerice text correction service
def self.gingerice
require 'gingerice'
Gingerice::Parser.new # No caching for this one
end
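
A hypothetical session; gingerice wraps Ginger's remote proofreading API, which is presumably why each call builds a fresh Parser instead of memoizing:

    Ebooks::NLP.gingerice.parse('Edwards will be sck yesterday')
    # => a Hash along the lines of { "result" => "Edwards will be sick yesterday", ... }
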
# For decoding html entities
# Lazily load HTML entity decoder
# @return [HTMLEntities]
def self.htmlentities
require 'htmlentities'
@htmlentities ||= HTMLEntities.new
@@ -44,7 +48,9 @@ module Ebooks
### Utility functions
# We don't really want to deal with all this weird unicode punctuation
# Normalize some strange unicode punctuation variants
# @param text [String]
# @return [String]
def self.normalize(text)
htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
end
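
A quick check of what normalize does (entity decoding via the htmlentities loader above):

    Ebooks::NLP.normalize('“Hi” … you &amp; me')
    # => '"Hi" ... you & me'
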
@@ -53,6 +59,8 @@ module Ebooks
# We use an ad hoc approach because fancy libraries do not deal
# especially well with tweet formatting, and we can fake solving
# the quote problem during generation
# @param text [String]
# @return [Array<String>]
def self.sentences(text)
text.split(/\n+|(?<=[.?!])\s+/)
end
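
Example of the splitting behavior; note the terminal punctuation stays attached to each sentence:

    Ebooks::NLP.sentences("Hi there. How are you?\nFine!")
    # => ["Hi there.", "How are you?", "Fine!"]
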
@@ -60,15 +68,23 @@ module Ebooks
# Split a sentence into word-level tokens
# As above, this is ad hoc because tokenization libraries
# do not behave well wrt. things like emoticons and timestamps
# @param sentence [String]
# @return [Array<String>]
def self.tokenize(sentence)
regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
sentence.split(regex)
end
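
A sketch of the resulting tokens, assuming ',' is included in the module's PUNCTUATION constant (defined elsewhere):

    Ebooks::NLP.tokenize('hello, world')
    # => ["hello", ",", "world"]
    Ebooks::NLP.tokenize('done.')
    # => ["done."]  -- punctuation with no following space stays attached
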
# Get the 'stem' form of a word e.g. 'cats' -> 'cat'
# @param word [String]
# @return [String]
def self.stem(word)
Stemmer::stem_word(word.downcase)
end
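
E.g. (note the word is downcased before stemming):

    Ebooks::NLP.stem('Cats')  # => "cat"
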
# Use highscore gem to find interesting keywords in a corpus
# @param text [String]
# @return [Highscore::Keywords]
def self.keywords(text)
# Preprocess to remove stopwords (highscore's blacklist is v. slow)
text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
@@ -90,7 +106,10 @@ module Ebooks
text.keywords
end
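
Downstream use might look like this, assuming highscore's top/text accessors; actual rankings depend on its scoring:

    kw = Ebooks::NLP.keywords('ruby bots write strange tweets about ruby')
    kw.top(2).map(&:text)  # => e.g. ["ruby", "tweets"]
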
# Takes a list of tokens and builds a nice-looking sentence
# Builds a proper sentence from a list of tikis
# @param tikis [Array<Integer>]
# @param tokens [Array<String>]
# @return [String]
def self.reconstruct(tikis, tokens)
text = ""
last_token = nil
@@ -105,6 +124,9 @@ module Ebooks
end
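
The body is elided in this hunk, but given the space_between? rules below, usage is roughly (tikis being integer indices into the token list):

    tokens = ['hello', ',', 'world']
    Ebooks::NLP.reconstruct([0, 1, 2], tokens)  # => "hello, world"
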
# Determine if we need to insert a space between two tokens
# @param token1 [String]
# @param token2 [String]
# @return [Boolean]
def self.space_between?(token1, token2)
p1 = self.punctuation?(token1)
p2 = self.punctuation?(token2)
@@ -119,10 +141,16 @@ module Ebooks
end
end
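
Assuming the elided branches keep words spaced and glue punctuation to the preceding token:

    Ebooks::NLP.space_between?('hello', ',')  # => false: no space before punctuation
    Ebooks::NLP.space_between?(',', 'world')  # => true: space after punctuation
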
# Is this token made up entirely of punctuation?
# @param token [String]
# @return [Boolean]
def self.punctuation?(token)
(token.chars.to_set - PUNCTUATION.chars.to_set).empty?
end
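
Concretely, assuming '?' and '!' are in PUNCTUATION:

    Ebooks::NLP.punctuation?('?!')    # => true: every character is punctuation
    Ebooks::NLP.punctuation?('huh?')  # => false: contains letters
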
# Is this token a stopword?
# @param token [String]
# @return [Boolean]
def self.stopword?(token)
@stopword_set ||= stopwords.map(&:downcase).to_set
@stopword_set.include?(token.downcase)
@@ -130,7 +158,9 @@ module Ebooks
# Determine if a sample of text contains unmatched brackets or quotes
# This is one of the more frequent and noticeable failure modes for
# the markov generator; we can just tell it to retry
# the generator; we can just tell it to retry
# @param text [String]
# @return [Boolean]
def self.unmatched_enclosers?(text)
enclosers = ['**', '""', '()', '[]', '``', "''"]
enclosers.each do |pair|
@ -153,10 +183,13 @@ module Ebooks
end
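
The counting loop is elided in this hunk, but the intended behavior is along these lines (a sketch, not verified against the full body):

    Ebooks::NLP.unmatched_enclosers?('he said "never mind')  # => true: dangling quote
    Ebooks::NLP.unmatched_enclosers?('"matched" (pairs)')    # => false
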
# Determine if a2 is a contiguous subsequence of a1
# @param a1 [Array]
# @param a2 [Array]
# @return [Boolean]
def self.subseq?(a1, a2)
a1.each_index.find do |i|
!a1.each_index.find do |i|
a1[i...i+a2.length] == a2
end
end.nil?
end
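
Note this looks for a contiguous run, and the new !...nil? wrapper makes the method return a real boolean rather than the matching index:

    Ebooks::NLP.subseq?([1, 2, 3, 4], [2, 3])  # => true: found at index 1
    Ebooks::NLP.subseq?([1, 2, 3, 4], [2, 4])  # => false: 2 and 4 are not adjacent
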
end
end