Memory optimization

This commit is contained in:
Jaiden Mispy 2014-10-16 03:02:39 -07:00
parent d09d968915
commit b7f67ec0a6
8 changed files with 203990 additions and 38 deletions

1
.rspec Normal file
View file

@ -0,0 +1 @@
--color

View file

@ -1,5 +1,3 @@
gem 'minitest'
$debug = false $debug = false
def log(*args) def log(*args)

View file

@ -18,14 +18,28 @@ module Ebooks
Marshal.load(File.open(path, 'rb') { |f| f.read }) Marshal.load(File.open(path, 'rb') { |f| f.read })
end end
# Split +text+ into sentences and tokenize each sentence.
#
# Returns an array with one entry per sentence reported by NLP.sentences;
# each entry is the array of tokens from NLP.tokenize for that sentence,
# minus any token that contains '@' or 'http'.
def mass_tokenize(text)
  NLP.sentences(text).map do |sentence|
    # Don't include usernames/urls as tokens
    NLP.tokenize(sentence).reject do |token|
      token.include?('@') || token.include?('http')
    end
  end
end
def consume(path) def consume(path)
content = File.read(path, :encoding => 'utf-8') content = File.read(path, :encoding => 'utf-8')
@hash = Digest::MD5.hexdigest(content) @hash = Digest::MD5.hexdigest(content)
if path.split('.')[-1] == "json" if path.split('.')[-1] == "json"
log "Reading json corpus from #{path}" log "Reading json corpus from #{path}"
lines = JSON.parse(content, symbolize_names: true).map do |tweet| lines = JSON.parse(content).map do |tweet|
tweet[:text] tweet['text']
end end
elsif path.split('.')[-1] == "csv" elsif path.split('.')[-1] == "csv"
log "Reading CSV corpus from #{path}" log "Reading CSV corpus from #{path}"
@ -42,44 +56,31 @@ module Ebooks
log "Removing commented lines and sorting mentions" log "Removing commented lines and sorting mentions"
keeping = [] statements = []
mentions = [] mentions = []
lines.each do |l| lines.each do |l|
next if l.start_with?('#') # Remove commented lines next if l.start_with?('#') # Remove commented lines
next if l.include?('RT') || l.include?('MT') # Remove soft retweets next if l.include?('RT') || l.include?('MT') # Remove soft retweets
if l.include?('@') if l.include?('@')
mentions << l statements << NLP.normalize(l)
else else
keeping << l mentions << NLP.normalize(l)
end
end
text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
mention_text = NLP.normalize(mentions.join("\n"))
log "Segmenting text into sentences"
statements = NLP.sentences(text)
mentions = NLP.sentences(mention_text)
log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
@sentences = []
@mentions = []
statements.each do |s|
@sentences << NLP.tokenize(s).reject do |t|
t.include?('@') || t.include?('http')
end end
end end
mentions.each do |s| text = statements.join("\n")
@mentions << NLP.tokenize(s).reject do |t| mention_text = mentions.join("\n")
t.include?('@') || t.include?('http')
end
end
log "Ranking keywords" lines = nil; statements = nil; mentions = nil # Allow garbage collection
@keywords = NLP.keywords(@sentences)
log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
@sentences = mass_tokenize(text)
@mentions = mass_tokenize(mention_text)
#log "Ranking keywords"
#@keywords = NLP.keywords(@sentences)
self self
end end

View file

@ -43,7 +43,7 @@ module Ebooks
end end
### Utility functions ### Utility functions
# We don't really want to deal with all this weird unicode punctuation # We don't really want to deal with all this weird unicode punctuation
def self.normalize(text) def self.normalize(text)
htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('', "'").gsub('…', '...') htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('', "'").gsub('…', '...')

203945
spec/data/0xabad1dea.json Normal file

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -1,8 +1,14 @@
require 'spec_helper' require 'spec_helper'
require 'objspace' require 'memory_profiler'
describe Ebooks::Model do describe Ebooks::Model do
it "does stuff" do it "does not use a ridiculous amount of memory" do
model = Ebooks::Model.load(path("data/0xabad1dea.model")) RubyProf.start
Ebooks::Model.consume(path("data/0xabad1dea.json"))
result = RubyProf.stop
require 'pry'; binding.pry
expect(report.total_retained).to be < 100000
end end
end end

View file

@ -16,11 +16,12 @@ Gem::Specification.new do |gem|
gem.version = Ebooks::VERSION gem.version = Ebooks::VERSION
gem.add_development_dependency 'rspec' gem.add_development_dependency 'rspec'
gem.add_development_dependency 'memory_profiler' gem.add_development_dependency 'ruby-prof'
gem.add_development_dependency 'pry-byebug'
gem.add_runtime_dependency 'twitter', '~> 4.5' gem.add_runtime_dependency 'twitter', '~> 5.1'
gem.add_runtime_dependency 'tweetstream', '= 2.5' gem.add_runtime_dependency 'tweetstream'
gem.add_runtime_dependency 'rufus-scheduler' gem.add_runtime_dependency 'rufus-scheduler'
gem.add_runtime_dependency 'gingerice' gem.add_runtime_dependency 'gingerice'
gem.add_runtime_dependency 'htmlentities' gem.add_runtime_dependency 'htmlentities'