#!/usr/bin/env ruby
# encoding: utf-8
require 'json'
require 'set'
require 'digest/md5'
require 'csv'

module Ebooks
  class Model
    attr_accessor :hash, :tokens, :sentences, :mentions, :keywords

    def self.consume(txtpath)
      Model.new.consume(txtpath)
    end

    def self.load(path)
      model = Model.new
      model.instance_eval do
        props = Marshal.load(File.open(path, 'rb') { |f| f.read })
        @tokens = props[:tokens]
        @sentences = props[:sentences]
        @mentions = props[:mentions]
        @keywords = props[:keywords]
      end
      model
    end

    def save(path)
      File.open(path, 'wb') do |f|
        f.write(Marshal.dump({
          tokens: @tokens,
          sentences: @sentences,
          mentions: @mentions,
          keywords: @keywords
        }))
      end
      self
    end
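
    # A minimal usage sketch (file names are hypothetical; .consume, #save
    # and .load are the methods defined above):
    #
    #   model = Ebooks::Model.consume("corpus.txt") # build from a text corpus
    #   model.save("corpus.model")                  # marshal the model to disk
    #   model = Ebooks::Model.load("corpus.model")  # restore it later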

    def initialize
      # This is the only source of actual strings in the model. It is
      # an array of unique tokens. Manipulation of a token is mostly done
      # using its index in this array, which we call a "tiki"
      @tokens = []

      # Reverse lookup tiki by token, for faster generation
      @tikis = {}
    end

    def tikify(token)
      @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
    end
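
    # For example (indices assume a fresh model):
    #
    #   tikify("hello") # => 0 (appended to @tokens)
    #   tikify("world") # => 1
    #   tikify("hello") # => 0 (found via the @tikis reverse lookup)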

    def mass_tikify(text)
      sentences = NLP.sentences(text)

      sentences.map do |s|
        tokens = NLP.tokenize(s).reject do |t|
          # Don't include usernames/urls as tokens
          t.include?('@') || t.include?('http')
        end

        tokens.map { |t| tikify(t) }
      end
    end
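
    # The result is one array of tikis per sentence; hypothetical indices,
    # assuming NLP.tokenize splits on words and punctuation:
    #
    #   mass_tikify("I like cats. I like dogs.")
    #   # => [[0, 1, 2, 3], [0, 1, 4, 3]]
    #   # ("I", "like" and "." repeat, so their tikis repeat too)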

    def consume(path)
      content = File.read(path, :encoding => 'utf-8')
      @hash = Digest::MD5.hexdigest(content)

      if path.split('.')[-1] == "json"
        log "Reading json corpus from #{path}"
        lines = JSON.parse(content).map do |tweet|
          tweet['text']
        end
      elsif path.split('.')[-1] == "csv"
        log "Reading CSV corpus from #{path}"
        content = CSV.parse(content)
        header = content.shift
        text_col = header.index('text')
        lines = content.map do |tweet|
          tweet[text_col]
        end
      else
        log "Reading plaintext corpus from #{path}"
        lines = content.split("\n")
      end
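
      # Supported corpora, per the branches above: a JSON array of tweet
      # objects with a "text" field, a CSV whose header row has a "text"
      # column, or plain text with one statement per line.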

      log "Removing commented lines and sorting mentions"

      statements = []
      mentions = []

      lines.each do |l|
        next if l.start_with?('#') # Remove commented lines
        next if l.include?('RT') || l.include?('MT') # Remove soft retweets

        if l.include?('@')
          mentions << NLP.normalize(l)
        else
          statements << NLP.normalize(l)
        end
      end

      text = statements.join("\n")
      mention_text = mentions.join("\n")

      lines = nil; statements = nil; mentions = nil # Allow garbage collection

      log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"
      @sentences = mass_tikify(text)
      @mentions = mass_tikify(mention_text)

      log "Ranking keywords"
      @keywords = NLP.keywords(text).top(200).map(&:to_s)

      self
    end

    def fix(tweet)
      # This seems to require an external api call
      # begin
      #   fixer = NLP.gingerice.parse(tweet)
      #   log fixer if fixer['corrections']
      #   tweet = fixer['result']
      # rescue Exception => e
      #   log e.message
      #   log e.backtrace
      # end

      NLP.htmlentities.decode tweet
    end

    def valid_tweet?(tikis, limit)
      tweet = NLP.reconstruct(tikis, @tokens)
      tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
    end

    def make_statement(limit=140, generator=nil, retry_limit=10)
      responding = !generator.nil?
      generator ||= SuffixGenerator.build(@sentences)

      retries = 0
      tweet = ""

      while (tikis = generator.generate(3, :bigrams)) do
        next if tikis.length <= 3 && !responding
        break if valid_tweet?(tikis, limit)

        retries += 1
        break if retries >= retry_limit
      end

      if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
        while (tikis = generator.generate(3, :unigrams)) do
          break if valid_tweet?(tikis, limit) && !verbatim?(tikis)

          retries += 1
          break if retries >= retry_limit
        end
      end

      tweet = NLP.reconstruct(tikis, @tokens)

      if retries >= retry_limit
        log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
      end

      fix tweet
    end
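
    # Usage sketch (the 140 default matches the signature above):
    #
    #   model.make_statement       # tweet-length statement from the corpus
    #   model.make_statement(280)  # same, with a longer length limit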

    # Test if a sentence has been copied verbatim from original
    def verbatim?(tikis)
      @sentences.include?(tikis) || @mentions.include?(tikis)
    end

    # Finds all relevant tokenized sentences to given input by
    # comparing non-stopword token overlaps
    def find_relevant(sentences, input)
      relevant = []
      slightly_relevant = []

      tokenized = NLP.tokenize(input).map(&:downcase)

      sentences.each do |sent|
        tokenized.each do |token|
          if sent.map { |tiki| @tokens[tiki].downcase }.include?(token)
            relevant << sent unless NLP.stopword?(token)
            slightly_relevant << sent
          end
        end
      end

      [relevant, slightly_relevant]
    end
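
    # Returns a pair: sentences sharing a non-stopword token with the input,
    # and the looser set sharing any token at all. A sentence may appear in
    # both lists, and more than once if several tokens match.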

    # Generates a response by looking for related sentences
    # in the corpus and building a smaller generator from these
    def make_response(input, limit=140, sentences=@mentions)
      # Prefer mentions
      relevant, slightly_relevant = find_relevant(sentences, input)

      if relevant.length >= 3
        generator = SuffixGenerator.build(relevant)
        make_statement(limit, generator)
      elsif slightly_relevant.length >= 5
        generator = SuffixGenerator.build(slightly_relevant)
        make_statement(limit, generator)
      elsif sentences.equal?(@mentions)
        make_response(input, limit, @sentences)
      else
        make_statement(limit)
      end
    end
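
    # For example (hypothetical input):
    #
    #   model.make_response("what do you think about cats?")
    #
    # tries related mentions first, then related statements from the whole
    # corpus, and finally falls back to a random statement.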
  end
end