#!/usr/bin/env ruby
# encoding: utf-8
require 'json'
require 'set'
require 'digest/md5'
require 'csv'

module Ebooks
  class Model
    attr_accessor :hash, :tokens, :sentences, :mentions, :keywords

    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end

    def self.load(path)
      model = Model.new
      model.instance_eval do
        props = Marshal.load(File.open(path, 'rb') { |f| f.read })
        @tokens = props[:tokens]
        @sentences = props[:sentences]
        @mentions = props[:mentions]
        @keywords = props[:keywords]
      end
      model
    end

    def save(path)
      File.open(path, 'wb') do |f|
        f.write(Marshal.dump({
          tokens: @tokens,
          sentences: @sentences,
          mentions: @mentions,
          keywords: @keywords
        }))
      end
      self
    end
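
    # A minimal usage sketch (file names are hypothetical; .consume, #save
    # and .load are the methods defined above):
    #
    #   model = Ebooks::Model.consume("corpus.txt") # build from a text corpus
    #   model.save("corpus.model")                  # marshal the model to disk
    #   model = Ebooks::Model.load("corpus.model")  # restore it later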

    def initialize
      # This is the only source of actual strings in the model. It is
      # an array of unique tokens. Manipulation of a token is mostly done
      # using its index in this array, which we call a "tiki"
      @tokens = []

      # Reverse lookup tiki by token, for faster generation
      @tikis = {}
    end

    def tikify(token)
      @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
    end
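
    # For example (indices assume a fresh model):
    #
    #   tikify("hello") # => 0 (appended to @tokens)
    #   tikify("world") # => 1
    #   tikify("hello") # => 0 (found via the @tikis reverse lookup)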

    def mass_tikify(text)
      sentences = NLP.sentences(text)

      sentences.map do |s|
        tokens = NLP.tokenize(s).reject do |t|
          # Don't include usernames/urls as tokens
          t.include?('@') || t.include?('http')
        end

        tokens.map { |t| tikify(t) }
      end
    end
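
    # The result is one array of tikis per sentence; hypothetical indices,
    # assuming NLP.tokenize splits on words and punctuation:
    #
    #   mass_tikify("I like cats. I like dogs.")
    #   # => [[0, 1, 2, 3], [0, 1, 4, 3]]
    #   # ("I", "like" and "." repeat, so their tikis repeat too)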

    def consume(path)
      content = File.read(path, :encoding => 'utf-8')
      @hash = Digest::MD5.hexdigest(content)

      if path.split('.')[-1] == "json"
        log "Reading json corpus from #{path}"
        lines = JSON.parse(content).map do |tweet|
          tweet['text']
        end
      elsif path.split('.')[-1] == "csv"
        log "Reading CSV corpus from #{path}"
        content = CSV.parse(content)
        header = content.shift
        text_col = header.index('text')
        lines = content.map do |tweet|
          tweet[text_col]
        end
      else
        log "Reading plaintext corpus from #{path}"
        lines = content.split("\n")
      end
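
      # Supported corpora, per the branches above: a JSON array of tweet
      # objects with a "text" field, a CSV whose header row has a "text"
      # column, or plain text with one statement per line.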

      log "Removing commented lines and sorting mentions"

      statements = []
      mentions = []

      lines.each do |l|
        next if l.start_with?('#') # Remove commented lines
        next if l.include?('RT') || l.include?('MT') # Remove soft retweets

        if l.include?('@')
          mentions << NLP.normalize(l)
        else
          statements << NLP.normalize(l)
        end
      end

      text = statements.join("\n")
      mention_text = mentions.join("\n")

      lines = nil; statements = nil; mentions = nil # Allow garbage collection

      log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"
      @sentences = mass_tikify(text)
      @mentions = mass_tikify(mention_text)

      log "Ranking keywords"
      @keywords = NLP.keywords(text).top(200).map(&:to_s)

      self
    end

    def fix(tweet)
      # This seems to require an external api call
      # begin
      #   fixer = NLP.gingerice.parse(tweet)
      #   log fixer if fixer['corrections']
      #   tweet = fixer['result']
      # rescue Exception => e
      #   log e.message
      #   log e.backtrace
      # end

      NLP.htmlentities.decode tweet
    end

    def valid_tweet?(tikis, limit)
      tweet = NLP.reconstruct(tikis, @tokens)
      tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
    end

    def make_statement(limit=140, generator=nil, retry_limit=10)
      responding = !generator.nil?
      generator ||= SuffixGenerator.build(@sentences)

      retries = 0
      tweet = ""

      while (tikis = generator.generate(3, :bigrams)) do
        next if tikis.length <= 3 && !responding
        break if valid_tweet?(tikis, limit)

        retries += 1
        break if retries >= retry_limit
      end

      if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
        while (tikis = generator.generate(3, :unigrams)) do
          break if valid_tweet?(tikis, limit) && !verbatim?(tikis)

          retries += 1
          break if retries >= retry_limit
        end
      end

      tweet = NLP.reconstruct(tikis, @tokens)

      if retries >= retry_limit
        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
      end

      fix tweet
    end
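
    # Usage sketch (the 140 default matches the signature above):
    #
    #   model.make_statement       # tweet-length statement from the corpus
    #   model.make_statement(280)  # same, with a longer length limit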

    # Test if a sentence has been copied verbatim from original
    def verbatim?(tikis)
      @sentences.include?(tikis) || @mentions.include?(tikis)
    end

    # Finds all relevant tokenized sentences to given input by
    # comparing non-stopword token overlaps
    def find_relevant(sentences, input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input).map(&:downcase)

      sentences.each do |sent|
        tokenized.each do |token|
          if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end
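
    # Returns a pair: sentences sharing a non-stopword token with the input,
    # and the looser set sharing any token at all. A sentence may appear in
    # both lists, and more than once if several tokens match.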

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller generator from these
    def make_response(input, limit=140, sentences=@mentions)
      # Prefer mentions
      relevant, slightly_relevant = find_relevant(sentences, input)

      if relevant.length >= 3
        generator = SuffixGenerator.build(relevant)
        make_statement(limit, generator)
      elsif slightly_relevant.length >= 5
        generator = SuffixGenerator.build(slightly_relevant)
        make_statement(limit, generator)
      elsif sentences.equal?(@mentions)
        make_response(input, limit, @sentences)
      else
        make_statement(limit)
      end
    end
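
    # For example (hypothetical input):
    #
    #   model.make_response("what do you think about cats?")
    #
    # tries related mentions first, then related statements from the whole
    # corpus, and finally falls back to a random statement.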
  end
end