Github time!
commit e87dc5862b
27 changed files with 20178 additions and 0 deletions
lib/twitter_ebooks/nlp.rb (new file, 154 lines)
@@ -0,0 +1,154 @@
# encoding: utf-8
require 'set' # needed for the to_set calls in punctuation?/stopword? below
require 'fast-stemmer'
require 'highscore'

module Ebooks
  module NLP
    # We deliberately limit our punctuation handling to stuff we can do consistently
    # It'll just be a part of another token if we don't split it out, and that's fine
    PUNCTUATION = ".?!,"

    # Lazy-load NLP libraries and resources
    # Some of this stuff is pretty heavy and we don't necessarily need
    # to be using it all of the time

    def self.stopwords
      @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
    end

    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    # POS tagger
    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

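    # Example (a sketch; assumes engtagger's add_tags API and its tag format):
    #   NLP.tagger.add_tags("the dog runs")
    #   # => "<det>the</det> <nn>dog</nn> <vbz>runs</vbz>"
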
    # Gingerice text correction service
    def self.gingerice
      require 'gingerice'
      Gingerice::Parser.new # No caching for this one
    end

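    # Example (a sketch; assumes gingerice's Parser#parse API):
    #   NLP.gingerice.parse("Edwards will be sck yesterday")
    #   # => a result hash including the corrected text
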
    # For decoding html entities
    def self.htmlentities
      require 'htmlentities'
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions

    # We don't really want to deal with all this weird unicode punctuation
    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end

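    # Example:
    #   NLP.normalize("“Hi” &amp; bye…") # => "\"Hi\" & bye..."
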
    # Split text into sentences
    # We use an ad hoc approach because fancy libraries do not deal
    # especially well with tweet formatting, and we can fake solving
    # the quote problem during generation
    def self.sentences(text)
      text.split(/\n+|(?<=[.?!])\s+/)
    end

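    # Example:
    #   NLP.sentences("Hi there. How are you?\nGood.")
    #   # => ["Hi there.", "How are you?", "Good."]
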
    # Split a sentence into word-level tokens
    # As above, this is ad hoc because tokenization libraries
    # do not behave well wrt. things like emoticons and timestamps
    def self.tokenize(sentence)
      regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
      sentence.split(regex)
    end

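    # Example:
    #   NLP.tokenize("hi there, bob!")
    #   # => ["hi", "there", ",", "bob", "!"]
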
    def self.stem(word)
      Stemmer::stem_word(word.downcase)
    end

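    # Example (Porter stemming via fast-stemmer):
    #   NLP.stem("Running") # => "run"
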
    def self.keywords(sentences)
      # Preprocess to remove stopwords (highscore's blacklist is v. slow)
      text = sentences.flatten.reject { |t| stopword?(t) }.join(' ')

      text = Highscore::Content.new(text)

      text.configure do
        #set :multiplier, 2
        #set :upper_case, 3
        #set :long_words, 2
        #set :long_words_threshold, 15
        #set :vowels, 1 # => default: 0 = not considered
        #set :consonants, 5 # => default: 0 = not considered
        #set :ignore_case, true # => default: false
        set :word_pattern, /(?<!@)(?<=\s)[\w']+/ # => default: /\w+/
        #set :stemming, true # => default: false
      end

      text.keywords
    end

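    # Example (a sketch; assumes highscore's Keywords#top API):
    #   NLP.keywords(["some sentences here"]).top(5).map(&:text)
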
    # Takes a list of tokens and builds a nice-looking sentence
    def self.reconstruct(tokens)
      text = ""
      last_token = nil
      tokens.each do |token|
        next if token == INTERIM
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

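    # Example (INTERIM is a marker token defined elsewhere in the gem):
    #   NLP.reconstruct(["hi", "there", ",", "bob", "!"])
    #   # => "hi there, bob!"
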
    # Determine if we need to insert a space between two tokens
    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)
      if p1 && p2 # "foo?!"
        false
      elsif !p1 && p2 # "foo."
        false
      elsif p1 && !p2 # "foo. rah"
        true
      else # "foo rah"
        true
      end
    end

    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

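    # Example:
    #   NLP.punctuation?("?!")   # => true
    #   NLP.punctuation?("foo.") # => false
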
    def self.stopword?(token)
      @stopword_set ||= stopwords.map(&:downcase).to_set
      @stopword_set.include?(token.downcase)
    end

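    # Example (assuming "the" appears in the gem's stopwords.txt):
    #   NLP.stopword?("The") # => true
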
    # Determine if a sample of text contains unmatched brackets or quotes
    # This is one of the more frequent and noticeable failure modes for
    # the markov generator; we can just tell it to retry
    def self.unmatched_enclosers?(text)
      enclosers = ['**', '""', '()', '[]', '``', "''"]
      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0

        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)

          return true if opened < 0 # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end

      false
    end

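    # Example:
    #   NLP.unmatched_enclosers?("an (unclosed paren") # => true
    #   NLP.unmatched_enclosers?("a (closed) paren")   # => false
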
  end
end
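
Taken together, the pure-string helpers round-trip cleanly. A minimal sketch, assuming the file is loaded as part of the gem (DATA_PATH and INTERIM are defined elsewhere in it):

  text = Ebooks::NLP.normalize("Hello there. How are you?")
  sentences = Ebooks::NLP.sentences(text)          # => ["Hello there.", "How are you?"]
  tokens = Ebooks::NLP.tokenize(sentences.first)   # => ["Hello", "there", "."]
  Ebooks::NLP.reconstruct(tokens)                  # => "Hello there."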