Memory optimization

This commit is contained in:
Jaiden Mispy 2014-10-16 03:02:39 -07:00
parent d09d968915
commit b7f67ec0a6
8 changed files with 203990 additions and 38 deletions

1
.rspec Normal file
View file

@ -0,0 +1 @@
--color

View file

@ -1,5 +1,3 @@
gem 'minitest'
$debug = false
def log(*args)

View file

@ -18,14 +18,28 @@ module Ebooks
Marshal.load(File.open(path, 'rb') { |f| f.read })
end
# Break +text+ into sentences and tokenize each sentence separately.
# Tokens containing '@' or 'http' (usernames/urls) are left out.
# Returns an array holding one array of tokens per sentence.
def mass_tokenize(text)
  NLP.sentences(text).map do |sentence|
    NLP.tokenize(sentence).reject do |token|
      token.include?('@') || token.include?('http')
    end
  end
end
def consume(path)
content = File.read(path, :encoding => 'utf-8')
@hash = Digest::MD5.hexdigest(content)
if path.split('.')[-1] == "json"
log "Reading json corpus from #{path}"
lines = JSON.parse(content, symbolize_names: true).map do |tweet|
tweet[:text]
lines = JSON.parse(content).map do |tweet|
tweet['text']
end
elsif path.split('.')[-1] == "csv"
log "Reading CSV corpus from #{path}"
@ -42,44 +56,31 @@ module Ebooks
log "Removing commented lines and sorting mentions"
keeping = []
statements = []
mentions = []
lines.each do |l|
next if l.start_with?('#') # Remove commented lines
next if l.include?('RT') || l.include?('MT') # Remove soft retweets
if l.include?('@')
mentions << l
statements << NLP.normalize(l)
else
keeping << l
end
end
text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
mention_text = NLP.normalize(mentions.join("\n"))
log "Segmenting text into sentences"
statements = NLP.sentences(text)
mentions = NLP.sentences(mention_text)
log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
@sentences = []
@mentions = []
statements.each do |s|
@sentences << NLP.tokenize(s).reject do |t|
t.include?('@') || t.include?('http')
mentions << NLP.normalize(l)
end
end
mentions.each do |s|
@mentions << NLP.tokenize(s).reject do |t|
t.include?('@') || t.include?('http')
end
end
text = statements.join("\n")
mention_text = mentions.join("\n")
log "Ranking keywords"
@keywords = NLP.keywords(@sentences)
lines = nil; statements = nil; mentions = nil # Allow garbage collection
log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
@sentences = mass_tokenize(text)
@mentions = mass_tokenize(mention_text)
#log "Ranking keywords"
#@keywords = NLP.keywords(@sentences)
self
end

203945
spec/data/0xabad1dea.json Normal file

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -1,8 +1,14 @@
require 'spec_helper'
require 'objspace'
require 'memory_profiler'
describe Ebooks::Model do
it "does stuff" do
model = Ebooks::Model.load(path("data/0xabad1dea.model"))
it "does not use a ridiculous amount of memory" do
RubyProf.start
Ebooks::Model.consume(path("data/0xabad1dea.json"))
result = RubyProf.stop
require 'pry'; binding.pry
expect(report.total_retained).to be < 100000
end
end

View file

@ -16,11 +16,12 @@ Gem::Specification.new do |gem|
gem.version = Ebooks::VERSION
gem.add_development_dependency 'rspec'
gem.add_development_dependency 'memory_profiler'
gem.add_development_dependency 'ruby-prof'
gem.add_development_dependency 'pry-byebug'
gem.add_runtime_dependency 'twitter', '~> 4.5'
gem.add_runtime_dependency 'tweetstream', '= 2.5'
gem.add_runtime_dependency 'twitter', '~> 5.1'
gem.add_runtime_dependency 'tweetstream'
gem.add_runtime_dependency 'rufus-scheduler'
gem.add_runtime_dependency 'gingerice'
gem.add_runtime_dependency 'htmlentities'