Memory optimization

2014-10-16 03:02:39 -07:00 · 2014-10-16 03:02:39 -07:00 · b7f67ec0a6
commit b7f67ec0a6
parent d09d968915
8 changed files with 203990 additions and 38 deletions
--- a/.rspec
+++ b/.rspec
@@ -0,0 +1 @@
+--color
--- a/lib/twitter_ebooks.rb
+++ b/lib/twitter_ebooks.rb
@@ -1,5 +1,3 @@
-gem 'minitest'
-
 $debug = false

 def log(*args)
--- a/lib/twitter_ebooks/model.rb
+++ b/lib/twitter_ebooks/model.rb
@@ -18,14 +18,28 @@ module Ebooks
      Marshal.load(File.open(path, 'rb') { |f| f.read })
    end

+    def mass_tokenize(text)
+      sentences = NLP.sentences(text)
+      tokens = []
+
+      sentences.each do |s|
+        tokens << NLP.tokenize(s).reject do |t|
+          # Don't include usernames/urls as tokens
+          t.include?('@') || t.include?('http')
+        end
+      end
+
+      tokens
+    end
+
    def consume(path)
      content = File.read(path, :encoding => 'utf-8')
      @hash = Digest::MD5.hexdigest(content)

      if path.split('.')[-1] == "json"
        log "Reading json corpus from #{path}"
-        lines = JSON.parse(content, symbolize_names: true).map do |tweet|
-          tweet[:text]
+        lines = JSON.parse(content).map do |tweet|
+          tweet['text']
        end
      elsif path.split('.')[-1] == "csv"
        log "Reading CSV corpus from #{path}"
@@ -42,44 +56,31 @@ module Ebooks

      log "Removing commented lines and sorting mentions"

-      keeping = []
+      statements = []
      mentions = []
      lines.each do |l|
        next if l.start_with?('#') # Remove commented lines
        next if l.include?('RT') || l.include?('MT') # Remove soft retweets

        if l.include?('@')
-          mentions << l
+          statements << NLP.normalize(l)
        else
-          keeping << l
-        end
-      end
-      text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
-      mention_text = NLP.normalize(mentions.join("\n"))
-
-      log "Segmenting text into sentences"
-
-      statements = NLP.sentences(text)
-      mentions = NLP.sentences(mention_text)
-
-      log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
-      @sentences = []
-      @mentions = []
-
-      statements.each do |s|
-        @sentences << NLP.tokenize(s).reject do |t|
-          t.include?('@') || t.include?('http')
+          mentions << NLP.normalize(l)
        end
      end

-      mentions.each do |s|
-        @mentions << NLP.tokenize(s).reject do |t|
-          t.include?('@') || t.include?('http')
-        end
-      end
+      text = statements.join("\n")
+      mention_text = mentions.join("\n")

-      log "Ranking keywords"
-      @keywords = NLP.keywords(@sentences)
+      lines = nil; statements = nil; mentions = nil # Allow garbage collection
+
+      log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
+
+      @sentences = mass_tokenize(text)
+      @mentions = mass_tokenize(mention_text)
+
+      #log "Ranking keywords"
+      #@keywords = NLP.keywords(@sentences)

      self
    end
--- a/spec/data/0xabad1dea.json
+++ b/spec/data/0xabad1dea.json
--- a/spec/data/0xabad1dea.model
+++ b/spec/data/0xabad1dea.model
--- a/spec/model_spec.rb
+++ b/spec/model_spec.rb
@@ -1,8 +1,14 @@
 require 'spec_helper'
-require 'objspace'
+require 'memory_profiler'

 describe Ebooks::Model do
-  it "does stuff" do
-    model = Ebooks::Model.load(path("data/0xabad1dea.model"))
+  it "does not use a ridiculous amount of memory" do
+    RubyProf.start
+    Ebooks::Model.consume(path("data/0xabad1dea.json"))
+    result = RubyProf.stop
+
+    require 'pry'; binding.pry
+
+    expect(report.total_retained).to be < 100000
  end
 end
--- a/twitter_ebooks.gemspec
+++ b/twitter_ebooks.gemspec
@@ -16,11 +16,12 @@ Gem::Specification.new do |gem|
  gem.version       = Ebooks::VERSION

  gem.add_development_dependency 'rspec'
-  gem.add_development_dependency 'memory_profiler'
+  gem.add_development_dependency 'ruby-prof'
+  gem.add_development_dependency 'pry-byebug'


-  gem.add_runtime_dependency 'twitter', '~> 4.5'
-  gem.add_runtime_dependency 'tweetstream', '= 2.5'
+  gem.add_runtime_dependency 'twitter', '~> 5.1'
+  gem.add_runtime_dependency 'tweetstream'
  gem.add_runtime_dependency 'rufus-scheduler'
  gem.add_runtime_dependency 'gingerice'
  gem.add_runtime_dependency 'htmlentities'