On second thought, we can't use a cache system

Simply because the corpora are too darn big to keep around.
This commit is contained in:
Jaiden Mispy 2014-11-18 13:51:31 +11:00
parent 8135aaaabb
commit 2e336fb9be

View file

@ -4,33 +4,14 @@
require 'json'
require 'set'
require 'digest/md5'
require 'fileutils'
require 'csv'
module Ebooks
class Model
attr_accessor :hash, :tokens, :sentences, :mentions, :keywords
# Consume a corpus file to create a model
# @param corpus_path Path to a json, text or csv file to consume
# @param cache Optional path to a directory to store cached models
#
# NOTE(review): this is the pre-change (removed) version as captured by a
# diff view — the method's trailing `model` / `end` lines are not visible
# here, so the definition is truncated as shown.
def self.consume(corpus_path, cache: nil)
if cache
# Cache key is the MD5 of the corpus file's contents, so editing the
# corpus automatically invalidates the cached model.
FileUtils::mkdir_p cache
cache_path = File.join(cache, Digest::MD5.file(corpus_path).to_s)
# NOTE(review): File.exists? is deprecated in modern Ruby; File.exist?
# is the supported spelling.
if File.exists?(cache_path)
log "Reading model from cache at #{cache_path}"
return Model.load(cache_path)
end
end
# Cache miss (or caching disabled): build the model from scratch.
model = Model.new.consume(corpus_path)
if cache
log "Caching model at #{cache_path}"
model.save(cache_path)
end
# Build a fresh Model and have it consume the corpus at +txtpath+.
# Returns whatever Model#consume returns for that corpus.
def self.consume(txtpath)
model = Model.new
model.consume(txtpath)
end
def self.consume_all(paths)