Lots of documentation and cleanup
parent efde0fd16f
commit 1977445b1c
11 changed files with 237 additions and 178 deletions

.gitignore (vendored, 2 changes)
@@ -1,3 +1,5 @@
 .*.swp
 Gemfile.lock
 pkg
+.yardoc
+doc

@@ -4,8 +4,6 @@
 require 'twitter_ebooks'
 require 'ostruct'
 
-$debug = true
-
 module Ebooks::CLI
   APP_PATH = Dir.pwd # XXX do some recursive thing instead
   HELP = OpenStruct.new

@@ -15,7 +15,6 @@ end
 
 require 'twitter_ebooks/nlp'
 require 'twitter_ebooks/archive'
-require 'twitter_ebooks/markov'
 require 'twitter_ebooks/suffix'
 require 'twitter_ebooks/model'
 require 'twitter_ebooks/bot'

@@ -6,10 +6,11 @@ module Ebooks
   class ConfigurationError < Exception
   end
 
+  # Information about a particular Twitter user we know
   class UserInfo
     attr_reader :username
 
-    # number of times we've interacted with a timeline tweet, unprompted
+    # @return [Integer] how many times we can pester this user unprompted
     attr_accessor :pesters_left
 
     def initialize(username)
@@ -17,6 +18,7 @@ module Ebooks
       @pesters_left = 1
     end
 
+    # @return [Boolean] true if we're allowed to pester this user
     def can_pester?
       @pesters_left > 0
     end
@@ -32,6 +34,7 @@ module Ebooks
       @last_update = Time.now
     end
 
+    # @param tweet [Twitter::Tweet] tweet to add
     def add(tweet)
       @tweets << tweet
       @last_update = Time.now
@@ -61,14 +64,24 @@ module Ebooks
 
   # Meta information about a tweet that we calculate for ourselves
   class TweetMeta
-    attr_accessor :mentions # array: usernames mentioned in tweet
-    attr_accessor :mentionless # string: text of tweet with mentions removed
-    attr_accessor :reply_mentions # array: usernames to include in a reply
-    attr_accessor :reply_prefix # string: processed string to start reply with
-    attr_accessor :limit # integer: available room to calculate reply
+    # @return [Array<String>] usernames mentioned in tweet
+    attr_accessor :mentions
+    # @return [String] text of tweets with mentions removed
+    attr_accessor :mentionless
+    # @return [Array<String>] usernames to include in a reply
+    attr_accessor :reply_mentions
+    # @return [String] mentions to start reply with
+    attr_accessor :reply_prefix
+    # @return [Integer] available chars for reply
+    attr_accessor :limit
 
-    attr_accessor :bot, :tweet
+    # @return [Ebooks::Bot] associated bot
+    attr_accessor :bot
+    # @return [Twitter::Tweet] associated tweet
+    attr_accessor :tweet
 
+    # Check whether this tweet mentions our bot
+    # @return [Boolean]
     def mentions_bot?
       # To check if this is someone talking to us, ensure:
       # - The tweet mentions list contains our username
@@ -110,47 +123,65 @@ module Ebooks
   end
 
   class Bot
-    attr_accessor :consumer_key, :consumer_secret,
-                  :access_token, :access_token_secret
-
-    attr_reader :twitter, :stream, :thread
-
-    # Configuration
-    attr_accessor :username, :delay_range, :blacklist
+    # @return [String] OAuth consumer key for a Twitter app
+    attr_accessor :consumer_key
+    # @return [String] OAuth consumer secret for a Twitter app
+    attr_accessor :consumer_secret
+    # @return [String] OAuth access token from `ebooks auth`
+    attr_accessor :access_token
+    # @return [String] OAuth access secret from `ebooks auth`
+    attr_accessor :access_token_secret
+    # @return [String] Twitter username of bot
+    attr_accessor :username
+    # @return [Array<String>] list of usernames to block on contact
+    attr_accessor :blacklist
+    # @return [Hash{String => Ebooks::Conversation}] maps tweet ids to their conversation contexts
+    attr_accessor :conversations
+    # @return [Range, Integer] range of seconds to delay in delay method
+    attr_accessor :delay
 
-    @@all = [] # List of all defined bots
-    def self.all; @@all; end
+    # @return [Array] list of all defined bots
+    def self.all; @@all ||= []; end
 
-    def self.get(name)
-      all.find { |bot| bot.username == name }
+    # Fetches a bot by username
+    # @param username [String]
+    # @return [Ebooks::Bot]
+    def self.get(username)
+      all.find { |bot| bot.username == username }
     end
 
+    # Logs info to stdout in the context of this bot
     def log(*args)
       STDOUT.print "@#{@username}: " + args.map(&:to_s).join(' ') + "\n"
       STDOUT.flush
     end
 
-    def initialize(*args, &b)
-      @username ||= nil
+    # Initializes and configures bot
+    # @param args Arguments passed to configure method
+    # @param b Block to call with new bot
+    def initialize(username, &b)
       @blacklist ||= []
       @delay_range ||= 0
 
-      @users ||= {}
+      @userinfo ||= {}
       @conversations ||= {}
-      configure(*args, &b)
 
       # Tweet ids we've already observed, to avoid duplication
      @seen_tweets ||= {}
 
+      @username = username
+      configure(*args, &b)
+
       Bot.all << self
     end
 
+    # Find information we've collected about a user
+    # @param username [String]
+    # @return [Ebooks::UserInfo]
     def userinfo(username)
-      @users[username] ||= UserInfo.new(username)
+      @userinfo[username] ||= UserInfo.new(username)
     end
 
-    # Grab or create the conversation context for this tweet
+    # Find or create the conversation context for this tweet
+    # @param tweet [Twitter::Tweet]
+    # @return [Ebooks::Conversation]
     def conversation(tweet)
      conv = if tweet.in_reply_to_status_id?
        @conversations[tweet.in_reply_to_status_id]
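
A sketch of how a bot might be declared against the revised initializer (illustrative, not part of the commit; the class name, credentials, and handler are hypothetical, assuming events reach on_* methods via fire):

    class MyBot < Ebooks::Bot
      def configure
        # hypothetical placeholder credentials
        self.consumer_key = "..."
        self.consumer_secret = "..."
        self.access_token = "..."
        self.access_token_secret = "..."
        self.blacklist = ['some_spammer']
      end

      def on_startup
        log "ready"
      end
    end

    MyBot.new("my_ebooks_account") # username is now passed to initialize
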
@@ -175,6 +206,7 @@ module Ebooks
       conv
     end
 
+    # @return [Twitter::REST::Client] underlying REST client from twitter gem
     def twitter
       @twitter ||= Twitter::REST::Client.new do |config|
         config.consumer_key = @consumer_key
@@ -184,6 +216,7 @@ module Ebooks
       end
     end
 
+    # @return [Twitter::Streaming::Client] underlying streaming client from twitter gem
     def stream
       @stream ||= Twitter::Streaming::Client.new do |config|
         config.consumer_key = @consumer_key
@@ -194,11 +227,14 @@ module Ebooks
       end
     end
 
+    # Calculate some meta information about a tweet relevant for replying
+    # @param ev [Twitter::Tweet]
+    # @return [Ebooks::TweetMeta]
     def calc_meta(ev)
       TweetMeta.new(self, ev)
     end
 
+    # Receive an event from the twitter stream
+    # @param ev [Object] Twitter streaming event
     def receive_event(ev)
       if ev.is_a? Array # Initial array sent on first connection
         log "Online!"
@@ -250,14 +286,7 @@ module Ebooks
       end
     end
 
-    def start_stream
-      log "starting tweet stream"
-
-      stream.user do |ev|
-        receive_event ev
-      end
-    end
-
+    # Configures client and fires startup event
     def prepare
       # Sanity check
       if @username.nil?
@@ -268,12 +297,18 @@ module Ebooks
       fire(:startup)
     end
 
-    # Connects to tweetstream and opens event handlers for this bot
+    # Start running user event stream
     def start
-      start_stream
+      log "starting tweet stream"
+
+      stream.user do |ev|
+        receive_event ev
+      end
     end
 
+    # Fire an event
+    # @param event [Symbol] event to fire
+    # @param args arguments for event handler
     def fire(event, *args)
       handler = "on_#{event}".to_sym
       if respond_to? handler
@@ -281,11 +316,17 @@ module Ebooks
       end
     end
 
-    def delay(&b)
-      time = @delay.to_a.sample unless @delay.is_a? Integer
+    # Delay an action for a variable period of time
+    # @param range [Range, Integer] range of seconds to choose for delay
+    def delay(range=@delay_range, &b)
+      time = range.to_a.sample unless range.is_a? Integer
       sleep time
       b.call
     end
 
+    # Check if a username is blacklisted
+    # @param username [String]
+    # @return [Boolean]
     def blacklisted?(username)
       if @blacklist.include?(username)
         true
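
A small usage sketch of the reworked delay signature (illustrative, not part of the commit; the bot variable is assumed):

    bot.delay(2..30) do          # sleeps a random 2-30 seconds, then runs the block
      bot.tweet("delayed hello")
    end
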
@@ -295,6 +336,9 @@ module Ebooks
       end
     end
 
     # Reply to a tweet or a DM.
+    # @param ev [Twitter::Tweet, Twitter::DirectMessage]
+    # @param text [String] contents of reply excluding reply_prefix
+    # @param opts [Hash] additional params to pass to twitter gem
     def reply(ev, text, opts={})
       opts = opts.clone
@@ -306,26 +350,28 @@ module Ebooks
 
         if conversation(ev).is_bot?(ev.user.screen_name)
           log "Not replying to suspected bot @#{ev.user.screen_name}"
-          return
+          return false
         end
 
         if !meta.mentions_bot?
           if !userinfo(ev.user.screen_name).can_pester?
             log "Not replying: leaving @#{ev.user.screen_name} alone"
-            return
+            return false
           end
         end
 
         log "Replying to @#{ev.user.screen_name} with: #{meta.reply_prefix + text}"
         tweet = twitter.update(meta.reply_prefix + text, in_reply_to_status_id: ev.id)
         conversation(tweet).add(tweet)
         tweet
       else
         raise Exception("Don't know how to reply to a #{ev.class}")
       end
     end
 
+    # Favorite a tweet
+    # @param tweet [Twitter::Tweet]
     def favorite(tweet)
       return if blacklisted?(tweet.user.screen_name)
       log "Favoriting @#{tweet.user.screen_name}: #{tweet.text}"
 
       begin
@@ -335,6 +381,8 @@ module Ebooks
       end
     end
 
+    # Retweet a tweet
+    # @param tweet [Twitter::Tweet]
     def retweet(tweet)
       log "Retweeting @#{tweet.user.screen_name}: #{tweet.text}"
 
@@ -345,26 +393,36 @@ module Ebooks
       end
     end
 
-    def follow(*args)
-      log "Following #{args}"
-      twitter.follow(*args)
+    # Follow a user
+    # @param user [String] username or user id
+    def follow(user, *args)
+      log "Following #{user}"
+      twitter.follow(user, *args)
     end
 
-    def unfollow(*args)
-      log "Unfollowing #{args}"
-      twiter.unfollow(*args)
+    # Unfollow a user
+    # @param user [String] username or user id
+    def unfollow(user, *args)
+      log "Unfollowing #{user}"
+      twiter.unfollow(user, *args)
     end
 
-    def tweet(*args)
-      log "Tweeting #{args.inspect}"
-      twitter.update(*args)
+    # Tweet something
+    # @param text [String]
+    def tweet(text, *args)
+      log "Tweeting '#{text}'"
+      twitter.update(text, *args)
     end
 
+    # Get a scheduler for this bot
+    # @return [Rufus::Scheduler]
     def scheduler
       @scheduler ||= Rufus::Scheduler.new
     end
 
-    # could easily just be *args however the separation keeps it clean.
+    # Tweet some text with an image
+    # @param txt [String]
+    # @param pic [String] filename
     def pictweet(txt, pic, *args)
       log "Tweeting #{txt.inspect} - #{pic} #{args}"
       twitter.update_with_media(txt, File.new(pic), *args)

@@ -1,82 +0,0 @@
-module Ebooks
-  # Special INTERIM token represents sentence boundaries
-  # This is so we can include start and end of statements in model
-  # Due to the way the sentence tokenizer works, can correspond
-  # to multiple actual parts of text (such as ^, $, \n and .?!)
-  INTERIM = :interim
-
-  # This is an ngram-based Markov model optimized to build from a
-  # tokenized sentence list without requiring too much transformation
-  class MarkovModel
-    def self.build(sentences)
-      MarkovModel.new.consume(sentences)
-    end
-
-    def consume(sentences)
-      # These models are of the form ngram => [[sentence_pos, token_pos] || INTERIM, ...]
-      # We map by both bigrams and unigrams so we can fall back to the latter in
-      # cases where an input bigram is unavailable, such as starting a sentence
-      @sentences = sentences
-      @unigrams = {}
-      @bigrams = {}
-
-      sentences.each_with_index do |tokens, i|
-        last_token = INTERIM
-        tokens.each_with_index do |token, j|
-          @unigrams[last_token] ||= []
-          @unigrams[last_token] << [i, j]
-
-          @bigrams[last_token] ||= {}
-          @bigrams[last_token][token] ||= []
-
-          if j == tokens.length-1 # Mark sentence endings
-            @unigrams[token] ||= []
-            @unigrams[token] << INTERIM
-            @bigrams[last_token][token] << INTERIM
-          else
-            @bigrams[last_token][token] << [i, j+1]
-          end
-
-          last_token = token
-        end
-      end
-
-      self
-    end
-
-    def find_token(index)
-      if index == INTERIM
-        INTERIM
-      else
-        @sentences[index[0]][index[1]]
-      end
-    end
-
-    def chain(tokens)
-      if tokens.length == 1
-        matches = @unigrams[tokens[-1]]
-      else
-        matches = @bigrams[tokens[-2]][tokens[-1]]
-        matches = @unigrams[tokens[-1]] if matches.length < 2
-      end
-
-      if matches.empty?
-        # This should never happen unless a strange token is
-        # supplied from outside the dataset
-        raise ArgumentError, "Unable to continue chain for: #{tokens.inspect}"
-      end
-
-      next_token = find_token(matches.sample)
-
-      if next_token == INTERIM # We chose to end the sentence
-        return tokens
-      else
-        return chain(tokens + [next_token])
-      end
-    end
-
-    def generate
-      NLP.reconstruct(chain([INTERIM]))
-    end
-  end
-end

@@ -8,16 +8,41 @@ require 'csv'
 
 module Ebooks
   class Model
-    attr_accessor :hash, :tokens, :sentences, :mentions, :keywords
+    # @return [Array<String>]
+    # An array of unique tokens. This is the main source of actual strings
+    # in the model. Manipulation of a token is done using its index
+    # in this array, which we call a "tiki"
+    attr_accessor :tokens
 
-    def self.consume(txtpath)
-      Model.new.consume(txtpath)
+    # @return [Array<Array<Integer>>]
+    # Sentences represented by arrays of tikis
+    attr_accessor :sentences
+
+    # @return [Array<Array<Integer>>]
+    # Sentences derived from Twitter mentions
+    attr_accessor :mentions
+
+    # @return [Array<String>]
+    # The top 200 most important keywords, in descending order
+    attr_accessor :keywords
+
+    # Generate a new model from a corpus file
+    # @param path [String]
+    # @return [Ebooks::Model]
+    def self.consume(path)
+      Model.new.consume(path)
     end
 
+    # Generate a new model from multiple corpus files
+    # @param paths [Array<String>]
+    # @return [Ebooks::Model]
     def self.consume_all(paths)
       Model.new.consume_all(paths)
     end
 
+    # Load a saved model
+    # @param path [String]
+    # @return [Ebooks::Model]
     def self.load(path)
       model = Model.new
       model.instance_eval do
@@ -30,6 +55,8 @@ module Ebooks
       model
     end
 
+    # Save model to a file
+    # @param path [String]
     def save(path)
       File.open(path, 'wb') do |f|
         f.write(Marshal.dump({
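
For orientation, the class methods documented above compose like this (an illustrative sketch; the paths are hypothetical):

    model = Ebooks::Model.consume("corpus/tweets.json")
    model.save("model/tweets.model")

    model = Ebooks::Model.load("model/tweets.model")
    model.make_statement(140)
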
@@ -43,19 +70,22 @@ module Ebooks
     end
 
     def initialize
-      # This is the only source of actual strings in the model. It is
-      # an array of unique tokens. Manipulation of a token is mostly done
-      # using its index in this array, which we call a "tiki"
       @tokens = []
 
       # Reverse lookup tiki by token, for faster generation
       @tikis = {}
     end
 
+    # Reverse lookup a token index from a token
+    # @param token [String]
+    # @return [Integer]
     def tikify(token)
       @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1)
     end
 
+    # Convert a body of text into arrays of tikis
+    # @param text [String]
+    # @return [Array<Array<Integer>>]
     def mass_tikify(text)
       sentences = NLP.sentences(text)
 
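
An illustration of the tiki round-trip this sets up (values follow from the code above):

    model = Ebooks::Model.new
    model.tikify("hello") # => 0; "hello" is appended to @tokens
    model.tikify("world") # => 1
    model.tikify("hello") # => 0 again, via the @tikis reverse lookup
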
@@ -69,9 +99,10 @@ module Ebooks
       end
     end
 
+    # Consume a corpus into this model
+    # @param path [String]
     def consume(path)
       content = File.read(path, :encoding => 'utf-8')
       @hash = Digest::MD5.hexdigest(content)
 
       if path.split('.')[-1] == "json"
         log "Reading json corpus from #{path}"
@@ -94,6 +125,8 @@ module Ebooks
       consume_lines(lines)
     end
 
+    # Consume a sequence of lines
+    # @param lines [Array<String>]
     def consume_lines(lines)
       log "Removing commented lines and sorting mentions"
 
@@ -126,11 +159,12 @@ module Ebooks
       self
     end
 
+    # Consume multiple corpuses into this model
+    # @param paths [Array<String>]
     def consume_all(paths)
       lines = []
       paths.each do |path|
         content = File.read(path, :encoding => 'utf-8')
         @hash = Digest::MD5.hexdigest(content)
 
         if path.split('.')[-1] == "json"
           log "Reading json corpus from #{path}"
@@ -156,25 +190,26 @@ module Ebooks
       consume_lines(lines)
     end
 
-    def fix(tweet)
-      # This seems to require an external api call
-      #begin
-      #  fixer = NLP.gingerice.parse(tweet)
-      #  log fixer if fixer['corrections']
-      #  tweet = fixer['result']
-      #rescue Exception => e
-      #  log e.message
-      #  log e.backtrace
-      #end
-
-      NLP.htmlentities.decode tweet
+    # Correct encoding issues in generated text
+    # @param text [String]
+    # @return [String]
+    def fix(text)
+      NLP.htmlentities.decode text
     end
 
+    # Check if an array of tikis comprises a valid tweet
+    # @param tikis [Array<Integer>]
+    # @param limit Integer how many chars we have left
     def valid_tweet?(tikis, limit)
       tweet = NLP.reconstruct(tikis, @tokens)
       tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
     end
 
+    # Generate some text
+    # @param limit [Integer] available characters
+    # @param generator [SuffixGenerator, nil]
+    # @param retry_limit [Integer] how many times to retry on duplicates
+    # @return [String]
     def make_statement(limit=140, generator=nil, retry_limit=10)
       responding = !generator.nil?
       generator ||= SuffixGenerator.build(@sentences)
@@ -209,12 +244,17 @@ module Ebooks
     end
 
     # Test if a sentence has been copied verbatim from original
-    def verbatim?(tokens)
-      @sentences.include?(tokens) || @mentions.include?(tokens)
+    # @param tikis [Array<Integer>]
+    # @return [Boolean]
+    def verbatim?(tikis)
+      @sentences.include?(tikis) || @mentions.include?(tikis)
     end
 
-    # Finds all relevant tokenized sentences to given input by
+    # Finds relevant and slightly relevant tokenized sentences to input
     # comparing non-stopword token overlaps
+    # @param sentences [Array<Array<Integer>>]
+    # @param input [String]
+    # @return [Array<Array<Array<Integer>>, Array<Array<Integer>>>]
     def find_relevant(sentences, input)
       relevant = []
       slightly_relevant = []
@@ -235,6 +275,10 @@ module Ebooks
 
     # Generates a response by looking for related sentences
     # in the corpus and building a smaller generator from these
+    # @param input [String]
+    # @param limit [Integer] characters available for response
+    # @param sentences [Array<Array<Integer>>]
+    # @return [String]
     def make_response(input, limit=140, sentences=@mentions)
       # Prefer mentions
       relevant, slightly_relevant = find_relevant(sentences, input)
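
The two generation entry points are then called like this (illustrative; model as in the earlier sketch):

    model.make_statement(140)           # free-running generation
    model.make_response("hey bot", 126) # seeded by sentences relevant to the input
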
@@ -12,31 +12,35 @@ module Ebooks
     # Some of this stuff is pretty heavy and we don't necessarily need
     # to be using it all of the time
 
     # Lazily loads an array of stopwords
     # Stopwords are common English words that should often be ignored
+    # @return [Array<String>]
     def self.stopwords
       @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
     end
 
     # Lazily loads an array of known English nouns
+    # @return [Array<String>]
     def self.nouns
       @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
     end
 
     # Lazily loads an array of known English adjectives
+    # @return [Array<String>]
     def self.adjectives
       @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
     end
 
-    # POS tagger
+    # Lazily load part-of-speech tagging library
     # This can determine whether a word is being used as a noun/adjective/verb
+    # @return [EngTagger]
     def self.tagger
       require 'engtagger'
       @tagger ||= EngTagger.new
     end
 
     # Gingerice text correction service
     def self.gingerice
       require 'gingerice'
       Gingerice::Parser.new # No caching for this one
     end
 
-    # For decoding html entities
+    # Lazily load HTML entity decoder
+    # @return [HTMLEntities]
     def self.htmlentities
       require 'htmlentities'
       @htmlentities ||= HTMLEntities.new
@@ -44,7 +48,9 @@ module Ebooks
 
     ### Utility functions
 
-    # We don't really want to deal with all this weird unicode punctuation
+    # Normalize some strange unicode punctuation variants
+    # @param text [String]
+    # @return [String]
     def self.normalize(text)
       htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
     end
@@ -53,6 +59,8 @@ module Ebooks
     # We use ad hoc approach because fancy libraries do not deal
     # especially well with tweet formatting, and we can fake solving
     # the quote problem during generation
+    # @param text [String]
+    # @return [Array<String>]
     def self.sentences(text)
       text.split(/\n+|(?<=[.?!])\s+/)
     end
@@ -60,15 +68,23 @@ module Ebooks
     # Split a sentence into word-level tokens
     # As above, this is ad hoc because tokenization libraries
     # do not behave well wrt. things like emoticons and timestamps
+    # @param sentence [String]
+    # @return [Array<String>]
     def self.tokenize(sentence)
       regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
       sentence.split(regex)
     end
 
+    # Get the 'stem' form of a word e.g. 'cats' -> 'cat'
+    # @param word [String]
+    # @return [String]
     def self.stem(word)
       Stemmer::stem_word(word.downcase)
     end
 
+    # Use highscore gem to find interesting keywords in a corpus
+    # @param text [String]
+    # @return [Highscore::Keywords]
     def self.keywords(text)
       # Preprocess to remove stopwords (highscore's blacklist is v. slow)
       text = NLP.tokenize(text).reject { |t| stopword?(t) }.join(' ')
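
Illustrative calls for these helpers (not from the commit):

    Ebooks::NLP.sentences("One. Two!\nThree") # => ["One.", "Two!", "Three"]
    Ebooks::NLP.stem("cats")                  # => "cat"
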
@@ -90,7 +106,10 @@ module Ebooks
       text.keywords
     end
 
-    # Takes a list of tokens and builds a nice-looking sentence
+    # Builds a proper sentence from a list of tikis
+    # @param tikis [Array<Integer>]
+    # @param tokens [Array<String>]
+    # @return [String]
     def self.reconstruct(tikis, tokens)
       text = ""
       last_token = nil
@@ -105,6 +124,9 @@ module Ebooks
       end
     end
 
+    # Determine if we need to insert a space between two tokens
+    # @param token1 [String]
+    # @param token2 [String]
+    # @return [Boolean]
     def self.space_between?(token1, token2)
       p1 = self.punctuation?(token1)
       p2 = self.punctuation?(token2)
@@ -119,10 +141,16 @@ module Ebooks
       end
     end
 
+    # Is this token comprised of punctuation?
+    # @param token [String]
+    # @return [Boolean]
     def self.punctuation?(token)
       (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
     end
 
+    # Is this token a stopword?
+    # @param token [String]
+    # @return [Boolean]
     def self.stopword?(token)
       @stopword_set ||= stopwords.map(&:downcase).to_set
       @stopword_set.include?(token.downcase)
@@ -130,7 +158,9 @@ module Ebooks
 
     # Determine if a sample of text contains unmatched brackets or quotes
     # This is one of the more frequent and noticeable failure modes for
-    # the markov generator; we can just tell it to retry
+    # the generator; we can just tell it to retry
+    # @param text [String]
+    # @return [Boolean]
     def self.unmatched_enclosers?(text)
       enclosers = ['**', '""', '()', '[]', '``', "''"]
       enclosers.each do |pair|
@@ -153,10 +183,13 @@ module Ebooks
     end
 
     # Determine if a2 is a subsequence of a1
+    # @param a1 [Array]
+    # @param a2 [Array]
+    # @return [Boolean]
     def self.subseq?(a1, a2)
-      a1.each_index.find do |i|
+      !a1.each_index.find do |i|
         a1[i...i+a2.length] == a2
-      end
+      end.nil?
     end
   end
 end

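
A quick check of the corrected subsequence predicate (illustrative):

    Ebooks::NLP.subseq?([1, 2, 3, 4], [2, 3]) # => true
    Ebooks::NLP.subseq?([1, 2, 3, 4], [3, 2]) # => false
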
@@ -1,11 +1,14 @@
 # encoding: utf-8
 
 module Ebooks
-  # This generator uses data identical to the markov model, but
+  # This generator uses data identical to a markov model, but
   # instead of making a chain by looking up bigrams it uses the
   # positions to randomly replace suffixes in one sentence with
   # matching suffixes in another
   class SuffixGenerator
+    # Build a generator from a corpus of tikified sentences
+    # @param sentences [Array<Array<Integer>>]
+    # @return [SuffixGenerator]
     def self.build(sentences)
       SuffixGenerator.new(sentences)
     end
@@ -39,6 +42,11 @@ module Ebooks
       self
     end
 
+
+    # Generate a recombined sequence of tikis
+    # @param passes [Integer] number of times to recombine
+    # @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
+    # @return [Array<Integer>]
     def generate(passes=5, n=:unigrams)
       index = rand(@sentences.length)
       tikis = @sentences[index]

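
Driving the generator directly might look like this (illustrative; model as in the earlier Model sketch):

    generator = Ebooks::SuffixGenerator.build(model.sentences)
    tikis = generator.generate(5, :bigrams) # :bigrams recombines more conservatively
    Ebooks::NLP.reconstruct(tikis, model.tokens)
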
@@ -1,3 +1,3 @@
 module Ebooks
-  VERSION = "2.3.2"
+  VERSION = "3.0.0"
 end

@@ -3,8 +3,6 @@ require 'memory_profiler'
 require 'tempfile'
 require 'timecop'
 
-def Process.rss; `ps -o rss= -p #{Process.pid}`.chomp.to_i; end
-
 class TestBot < Ebooks::Bot
   attr_accessor :twitter
 

@@ -20,6 +20,7 @@ Gem::Specification.new do |gem|
   gem.add_development_dependency 'memory_profiler'
   gem.add_development_dependency 'timecop'
   gem.add_development_dependency 'pry-byebug'
+  gem.add_development_dependency 'yard'
 
   gem.add_runtime_dependency 'twitter', '~> 5.0'
   gem.add_runtime_dependency 'simple_oauth'