Lots of documentation and cleanup
commit 1977445b1c
parent efde0fd16f
11 changed files with 237 additions and 178 deletions
@@ -12,31 +12,35 @@ module Ebooks
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time

    # Lazily loads an array of stopwords
    # Stopwords are common English words that should often be ignored
    # @return [Array<String>]
    def self.stopwords
      @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
    end

    # Lazily loads an array of known English nouns
    # @return [Array<String>]
    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    # Lazily loads an array of known English adjectives
    # @return [Array<String>]
    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    # POS tagger
    # Lazily load part-of-speech tagging library
    # This can determine whether a word is being used as a noun/adjective/verb
    # @return [EngTagger]
    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

    # Gingerice text correction service
    def self.gingerice
      require 'gingerice'
      Gingerice::Parser.new # No caching for this one
    end

    # For decoding html entities
    # Lazily load HTML entity decoder
    # @return [HTMLEntities]
    def self.htmlentities
      require 'htmlentities'
      @htmlentities ||= HTMLEntities.new
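Not part of the diff: a quick sketch of how these memoized accessors behave once the gem's data files and the optional engtagger/htmlentities dependencies are installed. The gem name and the return values shown are illustrative assumptions.

require 'twitter_ebooks' # assumed gem name for this codebase

# First call reads data/stopwords.txt; later calls reuse the memoized array
Ebooks::NLP.stopwords.first(3)              # => e.g. ["a", "about", "above"]

# engtagger is only loaded when the tagger is first requested
Ebooks::NLP.tagger.add_tags("the cat sat")  # => "<det>the</det> <nn>cat</nn> <vbd>sat</vbd>" (roughly)

# Entity decoding through the memoized HTMLEntities instance
Ebooks::NLP.htmlentities.decode("cats &amp; dogs")  # => "cats & dogs"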
@@ -44,7 +48,9 @@ module Ebooks

    ### Utility functions

    # We don't really want to deal with all this weird unicode punctuation
    # Normalize some strange unicode punctuation variants
    # @param text [String]
    # @return [String]
    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end
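A usage sketch (not in the commit) of the normalizer, combining the smart-quote and ellipsis substitutions with entity decoding:

Ebooks::NLP.normalize("“Don’t stop… &amp; go”")
# => "\"Don't stop... & go\""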
@@ -53,6 +59,8 @@ module Ebooks
    # We use ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    # @param text [String]
    # @return [Array<String>]
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end
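For illustration (not in the diff), the sentence splitter fires on newlines and on whitespace that follows '.', '?' or '!':

Ebooks::NLP.sentences("One fish. Two fish!\nRed fish? Blue fish")
# => ["One fish.", "Two fish!", "Red fish?", "Blue fish"]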
@@ -60,15 +68,23 @@ module Ebooks
    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    # @param sentence [String]
    # @return [Array<String>]
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
      sentence.split(regex)
    end

    # Get the 'stem' form of a word e.g. 'cats' -> 'cat'
    # @param word [String]
    # @return [String]
    def self.stem(word)
      Stemmer::stem_word(word.downcase)
    end

    # Use highscore gem to find interesting keywords in a corpus
    # @param text [String]
    # @return [Highscore::Keywords]
    def self.keywords(text)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
      text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
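A sketch (not in the diff) of the tokenizer and stemmer in use; the handling of '!' assumes that character is part of the PUNCTUATION constant defined elsewhere in the module:

Ebooks::NLP.tokenize("i love my cat! so much :)")
# => ["i", "love", "my", "cat", "!", "so", "much", ":)"]

Ebooks::NLP.stem("Cats")  # => "cat"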
@@ -90,7 +106,10 @@ module Ebooks
      text.keywords
    end

    # Takes a list of tokens and builds a nice-looking sentence
    # Builds a proper sentence from a list of tikis
    # @param tikis [Array<Integer>]
    # @param tokens [Array<String>]
    # @return [String]
    def self.reconstruct(tikis, tokens)
      text = ""
      last_token = nil
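Not from the commit: one way the keyword extraction might be consumed, assuming the elided middle of keywords wraps the filtered text in a Highscore::Content before calling text.keywords:

kw = Ebooks::NLP.keywords("ruby bots love writing strange ruby tweets")
kw.top(3).map(&:text)  # => e.g. ["ruby", "bots", "tweets"]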
@@ -105,6 +124,9 @@ module Ebooks
    end

    # Determine if we need to insert a space between two tokens
    # @param token1 [String]
    # @param token2 [String]
    # @return [Boolean]
    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)
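An illustrative sketch (not in the diff): reconstruct walks the tiki indexes, looks each token up, and asks space_between? before joining, so a run like this would plausibly render with no space before the punctuation. The output is an assumption, since the joining logic is elided in this hunk:

tokens = ["hello", ",", "world", "!"]
tikis  = [0, 1, 2, 3]
Ebooks::NLP.reconstruct(tikis, tokens)  # => "hello, world!" (assumed)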
@@ -119,10 +141,16 @@ module Ebooks
      end
    end

    # Is this token comprised of punctuation?
    # @param token [String]
    # @return [Boolean]
    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    # Is this token a stopword?
    # @param token [String]
    # @return [Boolean]
    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
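Quick checks (not part of the commit); they assume '?' and '!' appear in PUNCTUATION and that 'the' is listed in data/stopwords.txt:

Ebooks::NLP.punctuation?("?!")     # => true, every character is punctuation
Ebooks::NLP.punctuation?("well?")  # => false, contains letters
Ebooks::NLP.stopword?("The")       # => true, the lookup is case-insensitive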
@@ -130,7 +158,9 @@ module Ebooks

    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the markov generator; we can just tell it to retry
    # the generator; we can just tell it to retry
    # @param text [String]
    # @return [Boolean]
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
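A hedged sketch of the intended behaviour (the balancing loop itself is elided in this hunk, so the results are assumptions):

Ebooks::NLP.unmatched_enclosers?('she said "hello')      # => true, dangling quote
Ebooks::NLP.unmatched_enclosers?('(all balanced) "ok"')  # => false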
@@ -153,10 +183,13 @@ module Ebooks
    end

    # Determine if a2 is a subsequence of a1
    # @param a1 [Array]
    # @param a2 [Array]
    # @return [Boolean]
    def self.subseq?(a1, a2)
      a1.each_index.find do |i|
      !a1.each_index.find do |i|
        a1[i...i+a2.length] == a2
      end
      end.nil?
    end
  end
end
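Not part of the commit: subseq? treats a2 as a contiguous run inside a1, so for example:

Ebooks::NLP.subseq?([1, 2, 3, 4], [2, 3])  # => true
Ebooks::NLP.subseq?([1, 2, 3, 4], [3, 2])  # => false
Ebooks::NLP.subseq?(%w[a b c], [])         # => true, the empty array matches at index 0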