#!/usr/bin/env ruby
# encoding: utf-8

require 'json'
require 'set'
require 'digest/md5'
require 'csv'
module Ebooks
  class Model
    # hash:      MD5 hexdigest of the consumed corpus file
    # sentences: tokenized statements (tweets without @mentions)
    # mentions:  tokenized tweets that mention other users
    # keywords:  keywords ranked from the statements
    attr_accessor :hash, :sentences, :mentions, :keywords

    # Builds a new model from the corpus file at +txtpath+
    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end

    # Restores a model previously written by #save
    def self.load(path)
      Marshal.load(File.open(path, 'rb') { |f| f.read })
    end
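
    # Usage sketch (the file paths here are hypothetical examples, not part of
    # this class):
    #
    #   model = Ebooks::Model.consume('corpus/example.json')  # parse + tokenize
    #   model.save('model/example.model')                     # Marshal to disk
    #   model = Ebooks::Model.load('model/example.model')     # restore later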

    # Splits +text+ into sentences and tokenizes each one,
    # returning an array of token arrays
    def mass_tokenize(text)
      sentences = NLP.sentences(text)
      tokens = []

      sentences.each do |s|
        tokens << NLP.tokenize(s).reject do |t|
          # Don't include usernames/urls as tokens
          t.include?('@') || t.include?('http')
        end
      end

      tokens
    end
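
    # Illustrative only -- the exact output depends on NLP.sentences and
    # NLP.tokenize:
    #
    #   mass_tokenize("I like cats. I like dogs.")
    #   #=> [["I", "like", "cats", "."], ["I", "like", "dogs", "."]]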

    # Reads a corpus file (json, csv or plaintext), splits it into
    # statements and mentions, and tokenizes both
    def consume(path)
      content = File.read(path, :encoding => 'utf-8')
      @hash = Digest::MD5.hexdigest(content)

      if path.split('.')[-1] == "json"
        log "Reading json corpus from #{path}"
        lines = JSON.parse(content).map do |tweet|
          tweet['text']
        end
      elsif path.split('.')[-1] == "csv"
        log "Reading CSV corpus from #{path}"
        content = CSV.parse(content)
        header = content.shift
        text_col = header.index('text')
        lines = content.map do |tweet|
          tweet[text_col]
        end
      else
        log "Reading plaintext corpus from #{path}"
        lines = content.split("\n")
      end

      log "Removing commented lines and sorting mentions"

      statements = []
      mentions = []
      lines.each do |l|
        next if l.start_with?('#') # Remove commented lines
        next if l.include?('RT') || l.include?('MT') # Remove soft retweets

        if l.include?('@')
          mentions << NLP.normalize(l)
        else
          statements << NLP.normalize(l)
        end
      end

      text = statements.join("\n")
      mention_text = mentions.join("\n")

      lines = nil; statements = nil; mentions = nil # Allow garbage collection

      log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"

      @sentences = mass_tokenize(text)
      @mentions = mass_tokenize(mention_text)

      log "Ranking keywords"
      @keywords = NLP.keywords(@sentences)

      self
    end
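
    # The three corpus formats consume understands, sketched with made-up
    # content (the field names come from the code above; the values are examples):
    #
    #   example.json - [{"text": "some tweet text", ...}, ...]
    #   example.csv  - header row with a 'text' column, one tweet per row
    #   example.txt  - plain text, one tweet per line; '#' lines are skipped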

    # Serializes the model to +path+ with Marshal
    def save(path)
      File.open(path, 'wb') do |f|
        f.write(Marshal.dump(self))
      end
      self
    end

    # Decodes HTML entities in a generated tweet before it is returned
    def fix(tweet)
      # This seems to require an external api call
      #begin
      #  fixer = NLP.gingerice.parse(tweet)
      #  log fixer if fixer['corrections']
      #  tweet = fixer['result']
      #rescue Exception => e
      #  log e.message
      #  log e.backtrace
      #end

      NLP.htmlentities.decode tweet
    end

    # True if the reconstructed tweet fits within +limit+ characters
    # and contains no unmatched enclosers
    def valid_tweet?(tokens, limit)
      tweet = NLP.reconstruct(tokens)
      tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
    end

    # Generates a tweet of at most +limit+ characters, retrying up to
    # +retry_limit+ times to avoid invalid or verbatim output
    def make_statement(limit=140, generator=nil, retry_limit=10)
      responding = !generator.nil?
      generator ||= SuffixGenerator.build(@sentences)

      retries = 0
      tweet = ""

      while (tokens = generator.generate(3, :bigrams)) do
        next if tokens.length <= 3 && !responding
        break if valid_tweet?(tokens, limit)

        retries += 1
        break if retries >= retry_limit
      end

      if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
        while (tokens = generator.generate(3, :unigrams)) do
          break if valid_tweet?(tokens, limit) && !verbatim?(tokens)

          retries += 1
          break if retries >= retry_limit
        end
      end

      tweet = NLP.reconstruct(tokens)

      if retries >= retry_limit
        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
      end

      fix tweet
    end
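
    # Usage sketch (output is random; 140 is just the default character limit
    # written out explicitly):
    #
    #   model = Ebooks::Model.load('model/example.model')
    #   puts model.make_statement(140)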

    # Test if a sentence has been copied verbatim from the original corpus
    def verbatim?(tokens)
      @sentences.include?(tokens) || @mentions.include?(tokens)
    end

    # Finds all tokenized sentences relevant to the given input by
    # comparing non-stopword token overlaps
    def find_relevant(sentences, input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input).map(&:downcase)

      sentences.each do |sent|
        tokenized.each do |token|
          if sent.map(&:downcase).include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end
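
    # Illustrative example (which tokens count as stopwords is decided by
    # NLP.stopword?, so the exact split is an assumption):
    #
    #   find_relevant(@sentences, "cats are great")
    #   # sentences sharing "cats" land in relevant (and slightly_relevant);
    #   # sentences sharing only a stopword like "are" land in slightly_relevant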

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller generator from these
    def make_response(input, limit=140, sentences=@mentions)
      # Prefer mentions
      relevant, slightly_relevant = find_relevant(sentences, input)

      if relevant.length >= 3
        generator = SuffixGenerator.build(relevant)
        make_statement(limit, generator)
      elsif slightly_relevant.length >= 5
        generator = SuffixGenerator.build(slightly_relevant)
        make_statement(limit, generator)
      elsif sentences.equal?(@mentions)
        make_response(input, limit, @sentences)
      else
        make_statement(limit)
      end
    end
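
    # Usage sketch (the input string is made up):
    #
    #   model.make_response("@bot what do you think of cats?", 140)
    #
    # Falls back from mentions to statements, and finally to an unrelated
    # make_statement, when too few relevant sentences are found.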
  end
end