From be6ac9127f34929af2f8dbaf97c2da30032b16f8 Mon Sep 17 00:00:00 2001 From: Joel McCoy Date: Fri, 27 Jun 2014 18:42:51 -0400 Subject: [PATCH] MODEL: Read in utf-8, only parse CSV once Ran into `Encoding::CompatibilityError` issue trying to consume my corpus (tweets.csv) on Windows 7, but this likely affects other environments as well. Fix: force reading corpus file contents as utf-8. Also a quick clean-up of the CSV flow to only parse the content once instead of double-dipping. --- lib/twitter_ebooks/model.rb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/twitter_ebooks/model.rb b/lib/twitter_ebooks/model.rb index b625046..5619c93 100644 --- a/lib/twitter_ebooks/model.rb +++ b/lib/twitter_ebooks/model.rb @@ -19,7 +19,7 @@ module Ebooks end def consume(path) - content = File.read(path) + content = File.read(path, :encoding => 'utf-8') @hash = Digest::MD5.hexdigest(content) if path.split('.')[-1] == "json" @@ -29,9 +29,10 @@ module Ebooks end elsif path.split('.')[-1] == "csv" log "Reading CSV corpus from #{path}" - header = CSV.read(path).first + content = CSV.parse(content) + header = content.shift text_col = header.index('text') - lines = CSV.read(path).drop(1).map do |tweet| + lines = content.map do |tweet| tweet[text_col] end else