Memory optimization

2014-10-16 03:02:39 -07:00 · 2014-10-16 03:02:39 -07:00 · b7f67ec0a6
commit b7f67ec0a6
parent d09d968915
8 changed files with 203990 additions and 38 deletions
--- a/.rspec
+++ b/.rspec
@ -0,0 +1 @@
 --color
--- a/lib/twitter_ebooks.rb
+++ b/lib/twitter_ebooks.rb
@ -1,5 +1,3 @@
 gem 'minitest'
 $debug = false
 def log(*args)
--- a/lib/twitter_ebooks/model.rb
+++ b/lib/twitter_ebooks/model.rb
@ -18,14 +18,28 @@ module Ebooks
      Marshal.load(File.open(path, 'rb') { |f| f.read })
    end
    # Tokenize a block of text sentence by sentence.
    #
    # The text is passed to NLP.sentences, and each resulting sentence is
    # passed to NLP.tokenize. Returns an array with one token list per
    # sentence, in the order NLP.sentences produced them.
    def mass_tokenize(text)
      NLP.sentences(text).map do |sentence|
        # Usernames and URLs are not kept as tokens: drop anything
        # containing '@' or 'http'.
        NLP.tokenize(sentence).reject do |token|
          token.include?('@') || token.include?('http')
        end
      end
    end
    def consume(path)
      content = File.read(path, :encoding => 'utf-8')
      @hash = Digest::MD5.hexdigest(content)
      if path.split('.')[-1] == "json"
        log "Reading json corpus from #{path}"
-        lines = JSON.parse(content, symbolize_names: true).map do |tweet|
+        lines = JSON.parse(content).map do |tweet|
-          tweet[:text]
+          tweet['text']
        end
      elsif path.split('.')[-1] == "csv"
        log "Reading CSV corpus from #{path}"
@ -42,44 +56,31 @@ module Ebooks
      log "Removing commented lines and sorting mentions"
-      keeping = []
+      statements = []
      mentions = []
      lines.each do |l|
        next if l.start_with?('#') # Remove commented lines
        next if l.include?('RT') || l.include?('MT') # Remove soft retweets
        if l.include?('@')
-          mentions << l
+          statements << NLP.normalize(l)
        else
-          keeping << l
+          mentions << NLP.normalize(l)
        end
      end
      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
      mention_text = NLP.normalize(mentions.join("\n"))
      log "Segmenting text into sentences"
      statements = NLP.sentences(text)
      mentions = NLP.sentences(mention_text)
      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
      @sentences = []
      @mentions = []
      statements.each do |s|
        @sentences << NLP.tokenize(s).reject do |t|
          t.include?('@') || t.include?('http')
        end
      end
-      mentions.each do |s|
+      text = statements.join("\n")
-        @mentions << NLP.tokenize(s).reject do |t|
+      mention_text = mentions.join("\n")
          t.include?('@') || t.include?('http')
        end
      end
-      log "Ranking keywords"
+      lines = nil; statements = nil; mentions = nil # Allow garbage collection
-      @keywords = NLP.keywords(@sentences)
+
      log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
      @sentences = mass_tokenize(text)
      @mentions = mass_tokenize(mention_text)
      #log "Ranking keywords"
      #@keywords = NLP.keywords(@sentences)
      self
    end
--- a/lib/twitter_ebooks/nlp.rb
+++ b/lib/twitter_ebooks/nlp.rb
@ -43,7 +43,7 @@ module Ebooks
    end
    ### Utility functions
-    
+
    # We don't really want to deal with all this weird unicode punctuation
    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
--- a/spec/data/0xabad1dea.json
+++ b/spec/data/0xabad1dea.json
--- a/spec/data/0xabad1dea.model
+++ b/spec/data/0xabad1dea.model
--- a/spec/model_spec.rb
+++ b/spec/model_spec.rb
@ -1,8 +1,14 @@
 require 'spec_helper'
-require 'objspace'
+require 'memory_profiler'
 describe Ebooks::Model do
-  it "does stuff" do
+  it "does not use a ridiculous amount of memory" do
-    model = Ebooks::Model.load(path("data/0xabad1dea.model"))
+    RubyProf.start
    Ebooks::Model.consume(path("data/0xabad1dea.json"))
    result = RubyProf.stop
    require 'pry'; binding.pry
    expect(report.total_retained).to be < 100000
  end
 end
--- a/twitter_ebooks.gemspec
+++ b/twitter_ebooks.gemspec
@ -16,11 +16,12 @@ Gem::Specification.new do |gem|
  gem.version       = Ebooks::VERSION
  gem.add_development_dependency 'rspec'
-  gem.add_development_dependency 'memory_profiler'
+  gem.add_development_dependency 'ruby-prof'
  gem.add_development_dependency 'pry-byebug'
-  gem.add_runtime_dependency 'twitter', '~> 4.5'
+  gem.add_runtime_dependency 'twitter', '~> 5.1'
-  gem.add_runtime_dependency 'tweetstream', '= 2.5'
+  gem.add_runtime_dependency 'tweetstream'
  gem.add_runtime_dependency 'rufus-scheduler'
  gem.add_runtime_dependency 'gingerice'
  gem.add_runtime_dependency 'htmlentities'