Memory optimization
This commit is contained in:
		
							parent
							
								
									d09d968915
								
							
						
					
					
						commit
						b7f67ec0a6
					
				
					 8 changed files with 203990 additions and 38 deletions
				
			
		
							
								
								
									
										1
									
								
								.rspec
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								.rspec
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1 @@ | |||
| --color | ||||
|  | @ -1,5 +1,3 @@ | |||
| gem 'minitest' | ||||
| 
 | ||||
| $debug = false | ||||
| 
 | ||||
| def log(*args) | ||||
|  |  | |||
|  | @ -18,14 +18,28 @@ module Ebooks | |||
|       Marshal.load(File.open(path, 'rb') { |f| f.read }) | ||||
|     end | ||||
| 
 | ||||
|     def mass_tokenize(text) | ||||
|       sentences = NLP.sentences(text) | ||||
|       tokens = [] | ||||
| 
 | ||||
|       sentences.each do |s| | ||||
|         tokens << NLP.tokenize(s).reject do |t| | ||||
|           # Don't include usernames/urls as tokens | ||||
|           t.include?('@') || t.include?('http') | ||||
|         end | ||||
|       end | ||||
| 
 | ||||
|       tokens | ||||
|     end | ||||
| 
 | ||||
|     def consume(path) | ||||
|       content = File.read(path, :encoding => 'utf-8') | ||||
|       @hash = Digest::MD5.hexdigest(content) | ||||
| 
 | ||||
|       if path.split('.')[-1] == "json" | ||||
|         log "Reading json corpus from #{path}" | ||||
|         lines = JSON.parse(content, symbolize_names: true).map do |tweet| | ||||
|           tweet[:text] | ||||
|         lines = JSON.parse(content).map do |tweet| | ||||
|           tweet['text'] | ||||
|         end | ||||
|       elsif path.split('.')[-1] == "csv" | ||||
|         log "Reading CSV corpus from #{path}" | ||||
|  | @ -42,44 +56,31 @@ module Ebooks | |||
| 
 | ||||
|       log "Removing commented lines and sorting mentions" | ||||
| 
 | ||||
|       keeping = [] | ||||
|       statements = [] | ||||
|       mentions = [] | ||||
|       lines.each do |l| | ||||
|         next if l.start_with?('#') # Remove commented lines | ||||
|         next if l.include?('RT') || l.include?('MT') # Remove soft retweets | ||||
| 
 | ||||
|         if l.include?('@') | ||||
|           mentions << l | ||||
|           statements << NLP.normalize(l) | ||||
|         else | ||||
|           keeping << l | ||||
|         end | ||||
|       end | ||||
|       text = NLP.normalize(keeping.join("\n")) # Normalize weird characters | ||||
|       mention_text = NLP.normalize(mentions.join("\n")) | ||||
| 
 | ||||
|       log "Segmenting text into sentences" | ||||
| 
 | ||||
|       statements = NLP.sentences(text) | ||||
|       mentions = NLP.sentences(mention_text) | ||||
| 
 | ||||
|       log "Tokenizing #{statements.length} statements and #{mentions.length} mentions" | ||||
|       @sentences = [] | ||||
|       @mentions = [] | ||||
| 
 | ||||
|       statements.each do |s| | ||||
|         @sentences << NLP.tokenize(s).reject do |t| | ||||
|           t.include?('@') || t.include?('http') | ||||
|           mentions << NLP.normalize(l) | ||||
|         end | ||||
|       end | ||||
| 
 | ||||
|       mentions.each do |s| | ||||
|         @mentions << NLP.tokenize(s).reject do |t| | ||||
|           t.include?('@') || t.include?('http') | ||||
|         end | ||||
|       end | ||||
|       text = statements.join("\n") | ||||
|       mention_text = mentions.join("\n") | ||||
| 
 | ||||
|       log "Ranking keywords" | ||||
|       @keywords = NLP.keywords(@sentences) | ||||
|       lines = nil; statements = nil; mentions = nil # Allow garbage collection | ||||
| 
 | ||||
|       log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions" | ||||
| 
 | ||||
|       @sentences = mass_tokenize(text) | ||||
|       @mentions = mass_tokenize(mention_text) | ||||
| 
 | ||||
|       #log "Ranking keywords" | ||||
|       #@keywords = NLP.keywords(@sentences) | ||||
| 
 | ||||
|       self | ||||
|     end | ||||
|  |  | |||
|  | @ -43,7 +43,7 @@ module Ebooks | |||
|     end | ||||
| 
 | ||||
|     ### Utility functions | ||||
|      | ||||
| 
 | ||||
|     # We don't really want to deal with all this weird unicode punctuation | ||||
|     def self.normalize(text) | ||||
|       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...') | ||||
|  |  | |||
							
								
								
									
										203945
									
								
								spec/data/0xabad1dea.json
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										203945
									
								
								spec/data/0xabad1dea.json
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							|  | @ -1,8 +1,14 @@ | |||
| require 'spec_helper' | ||||
| require 'objspace' | ||||
| require 'memory_profiler' | ||||
| 
 | ||||
| describe Ebooks::Model do | ||||
|   it "does stuff" do | ||||
|     model = Ebooks::Model.load(path("data/0xabad1dea.model")) | ||||
|   it "does not use a ridiculous amount of memory" do | ||||
|     RubyProf.start | ||||
|     Ebooks::Model.consume(path("data/0xabad1dea.json")) | ||||
|     result = RubyProf.stop | ||||
| 
 | ||||
|     require 'pry'; binding.pry | ||||
| 
 | ||||
|     expect(report.total_retained).to be < 100000 | ||||
|   end | ||||
| end | ||||
|  |  | |||
|  | @ -16,11 +16,12 @@ Gem::Specification.new do |gem| | |||
|   gem.version       = Ebooks::VERSION | ||||
| 
 | ||||
|   gem.add_development_dependency 'rspec' | ||||
|   gem.add_development_dependency 'memory_profiler' | ||||
|   gem.add_development_dependency 'ruby-prof' | ||||
|   gem.add_development_dependency 'pry-byebug' | ||||
| 
 | ||||
| 
 | ||||
|   gem.add_runtime_dependency 'twitter', '~> 4.5' | ||||
|   gem.add_runtime_dependency 'tweetstream', '= 2.5' | ||||
|   gem.add_runtime_dependency 'twitter', '~> 5.1' | ||||
|   gem.add_runtime_dependency 'tweetstream' | ||||
|   gem.add_runtime_dependency 'rufus-scheduler' | ||||
|   gem.add_runtime_dependency 'gingerice' | ||||
|   gem.add_runtime_dependency 'htmlentities' | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Jaiden Mispy
						Jaiden Mispy