On second thought, we can't use a cache system
Simply because the corpuses are too darn big to keep around
parent 8135aaaabb
commit 2e336fb9be

1 changed file with 2 additions and 21 deletions
@@ -4,33 +4,14 @@
 require 'json'
 require 'set'
-require 'digest/md5'
-require 'fileutils'
 require 'csv'
 
 module Ebooks
   class Model
     attr_accessor :hash, :tokens, :sentences, :mentions, :keywords
 
     # Consume a corpus file to create a model
-    # @param corpus_path Path to a json, text or csv file to consume
-    # @param cache Optional path to a directory to store cached models
-    def self.consume(corpus_path, cache: nil)
-      if cache
-        FileUtils::mkdir_p cache
-
-        cache_path = File.join(cache, Digest::MD5.file(corpus_path).to_s)
-        if File.exists?(cache_path)
-          log "Reading model from cache at #{cache_path}"
-          return Model.load(cache_path)
-        end
-      end
-
-      model = Model.new.consume(corpus_path)
-
-      if cache
-        log "Caching model at #{cache_path}"
-        model.save(cache_path)
-      end
+    def self.consume(txtpath)
+      Model.new.consume(txtpath)
     end
 
     def self.consume_all(paths)
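
For context, below is a minimal standalone sketch of the MD5-keyed caching pattern this commit removes: the cache key is the MD5 digest of the corpus file, so an unchanged corpus resolves to the same cached model on later runs. The helper name cached_consume and the Marshal-based persistence are illustrative assumptions for the sketch; the removed code itself used Model.load and model.save.

require 'digest/md5'
require 'fileutils'

# Illustrative sketch only (not the gem's API): cache a consumed model on disk,
# keyed by the MD5 of the corpus file, and reuse it when the corpus is unchanged.
def cached_consume(corpus_path, cache_dir)
  FileUtils.mkdir_p(cache_dir)
  cache_path = File.join(cache_dir, Digest::MD5.file(corpus_path).to_s)

  # Same corpus seen before: load the serialized model instead of rebuilding it
  return Marshal.load(File.binread(cache_path)) if File.exist?(cache_path)

  model = Ebooks::Model.new.consume(corpus_path)
  File.binwrite(cache_path, Marshal.dump(model))  # store for the next run
  model
end

After this change the cache layer is gone and callers simply rebuild the model from the corpus each time, e.g. model = Ebooks::Model.consume("corpus/tweets.json") (the path here is illustrative).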