Memory optimization
This commit is contained in:
parent
d09d968915
commit
b7f67ec0a6
8 changed files with 203990 additions and 38 deletions
1
.rspec
Normal file
1
.rspec
Normal file
|
@ -0,0 +1 @@
|
||||||
|
--color
|
|
@ -1,5 +1,3 @@
|
||||||
gem 'minitest'
|
|
||||||
|
|
||||||
$debug = false
|
$debug = false
|
||||||
|
|
||||||
def log(*args)
|
def log(*args)
|
||||||
|
|
|
@ -18,14 +18,28 @@ module Ebooks
|
||||||
Marshal.load(File.open(path, 'rb') { |f| f.read })
|
Marshal.load(File.open(path, 'rb') { |f| f.read })
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def mass_tokenize(text)
|
||||||
|
sentences = NLP.sentences(text)
|
||||||
|
tokens = []
|
||||||
|
|
||||||
|
sentences.each do |s|
|
||||||
|
tokens << NLP.tokenize(s).reject do |t|
|
||||||
|
# Don't include usernames/urls as tokens
|
||||||
|
t.include?('@') || t.include?('http')
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
tokens
|
||||||
|
end
|
||||||
|
|
||||||
def consume(path)
|
def consume(path)
|
||||||
content = File.read(path, :encoding => 'utf-8')
|
content = File.read(path, :encoding => 'utf-8')
|
||||||
@hash = Digest::MD5.hexdigest(content)
|
@hash = Digest::MD5.hexdigest(content)
|
||||||
|
|
||||||
if path.split('.')[-1] == "json"
|
if path.split('.')[-1] == "json"
|
||||||
log "Reading json corpus from #{path}"
|
log "Reading json corpus from #{path}"
|
||||||
lines = JSON.parse(content, symbolize_names: true).map do |tweet|
|
lines = JSON.parse(content).map do |tweet|
|
||||||
tweet[:text]
|
tweet['text']
|
||||||
end
|
end
|
||||||
elsif path.split('.')[-1] == "csv"
|
elsif path.split('.')[-1] == "csv"
|
||||||
log "Reading CSV corpus from #{path}"
|
log "Reading CSV corpus from #{path}"
|
||||||
|
@ -42,44 +56,31 @@ module Ebooks
|
||||||
|
|
||||||
log "Removing commented lines and sorting mentions"
|
log "Removing commented lines and sorting mentions"
|
||||||
|
|
||||||
keeping = []
|
statements = []
|
||||||
mentions = []
|
mentions = []
|
||||||
lines.each do |l|
|
lines.each do |l|
|
||||||
next if l.start_with?('#') # Remove commented lines
|
next if l.start_with?('#') # Remove commented lines
|
||||||
next if l.include?('RT') || l.include?('MT') # Remove soft retweets
|
next if l.include?('RT') || l.include?('MT') # Remove soft retweets
|
||||||
|
|
||||||
if l.include?('@')
|
if l.include?('@')
|
||||||
mentions << l
|
statements << NLP.normalize(l)
|
||||||
else
|
else
|
||||||
keeping << l
|
mentions << NLP.normalize(l)
|
||||||
end
|
|
||||||
end
|
|
||||||
text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
|
|
||||||
mention_text = NLP.normalize(mentions.join("\n"))
|
|
||||||
|
|
||||||
log "Segmenting text into sentences"
|
|
||||||
|
|
||||||
statements = NLP.sentences(text)
|
|
||||||
mentions = NLP.sentences(mention_text)
|
|
||||||
|
|
||||||
log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"
|
|
||||||
@sentences = []
|
|
||||||
@mentions = []
|
|
||||||
|
|
||||||
statements.each do |s|
|
|
||||||
@sentences << NLP.tokenize(s).reject do |t|
|
|
||||||
t.include?('@') || t.include?('http')
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
mentions.each do |s|
|
text = statements.join("\n")
|
||||||
@mentions << NLP.tokenize(s).reject do |t|
|
mention_text = mentions.join("\n")
|
||||||
t.include?('@') || t.include?('http')
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
log "Ranking keywords"
|
lines = nil; statements = nil; mentions = nil # Allow garbage collection
|
||||||
@keywords = NLP.keywords(@sentences)
|
|
||||||
|
log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
|
||||||
|
|
||||||
|
@sentences = mass_tokenize(text)
|
||||||
|
@mentions = mass_tokenize(mention_text)
|
||||||
|
|
||||||
|
#log "Ranking keywords"
|
||||||
|
#@keywords = NLP.keywords(@sentences)
|
||||||
|
|
||||||
self
|
self
|
||||||
end
|
end
|
||||||
|
|
|
@ -43,7 +43,7 @@ module Ebooks
|
||||||
end
|
end
|
||||||
|
|
||||||
### Utility functions
|
### Utility functions
|
||||||
|
|
||||||
# We don't really want to deal with all this weird unicode punctuation
|
# We don't really want to deal with all this weird unicode punctuation
|
||||||
def self.normalize(text)
|
def self.normalize(text)
|
||||||
htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
|
htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
|
||||||
|
|
203945
spec/data/0xabad1dea.json
Normal file
203945
spec/data/0xabad1dea.json
Normal file
File diff suppressed because it is too large
Load diff
Binary file not shown.
|
@ -1,8 +1,14 @@
|
||||||
require 'spec_helper'
|
require 'spec_helper'
|
||||||
require 'objspace'
|
require 'memory_profiler'
|
||||||
|
|
||||||
describe Ebooks::Model do
|
describe Ebooks::Model do
|
||||||
it "does stuff" do
|
it "does not use a ridiculous amount of memory" do
|
||||||
model = Ebooks::Model.load(path("data/0xabad1dea.model"))
|
RubyProf.start
|
||||||
|
Ebooks::Model.consume(path("data/0xabad1dea.json"))
|
||||||
|
result = RubyProf.stop
|
||||||
|
|
||||||
|
require 'pry'; binding.pry
|
||||||
|
|
||||||
|
expect(report.total_retained).to be < 100000
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -16,11 +16,12 @@ Gem::Specification.new do |gem|
|
||||||
gem.version = Ebooks::VERSION
|
gem.version = Ebooks::VERSION
|
||||||
|
|
||||||
gem.add_development_dependency 'rspec'
|
gem.add_development_dependency 'rspec'
|
||||||
gem.add_development_dependency 'memory_profiler'
|
gem.add_development_dependency 'ruby-prof'
|
||||||
|
gem.add_development_dependency 'pry-byebug'
|
||||||
|
|
||||||
|
|
||||||
gem.add_runtime_dependency 'twitter', '~> 4.5'
|
gem.add_runtime_dependency 'twitter', '~> 5.1'
|
||||||
gem.add_runtime_dependency 'tweetstream', '= 2.5'
|
gem.add_runtime_dependency 'tweetstream'
|
||||||
gem.add_runtime_dependency 'rufus-scheduler'
|
gem.add_runtime_dependency 'rufus-scheduler'
|
||||||
gem.add_runtime_dependency 'gingerice'
|
gem.add_runtime_dependency 'gingerice'
|
||||||
gem.add_runtime_dependency 'htmlentities'
|
gem.add_runtime_dependency 'htmlentities'
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue