Merge remote-tracking branch 'mispy/master'

This commit is contained in:
Eryn Wells 2017-05-07 14:47:01 +00:00
commit 7f3d372a61
17 changed files with 292 additions and 68 deletions

2
.gitattributes vendored Normal file
View file

@ -0,0 +1,2 @@
# Require \n style line endings
* text eol=lf

View file

@ -1,5 +1,5 @@
rvm:
- 2.1.4
- 2.1.7
script:
- rspec spec
notifications:

View file

@ -21,7 +21,7 @@ Note that 3.0 is not backwards compatible with 2.x, so upgrade carefully! In par
## Installation
Requires Ruby 2.0+
Requires Ruby 2.1+. Ruby 2.3+ is recommended.
```bash
gem install twitter_ebooks
@ -78,6 +78,16 @@ class MyBot < Ebooks::Bot
# Reply to a tweet in the bot's timeline
# reply(tweet, meta(tweet).reply_prefix + "nice tweet")
end
def on_favorite(user, tweet)
# Follow user who just favorited bot's tweet
# follow(user.screen_name)
end
def on_retweet(tweet)
# Follow user who just retweeted bot's tweet
# follow(tweet.user.screen_name)
end
end
# Make a MyBot and attach it to an account
@ -135,10 +145,10 @@ The secondary function is the "interesting keywords" list. For example, I use th
``` ruby
top100 = model.keywords.take(100)
tokens = Ebooks::NLP.tokenize(tweet[:text])
tokens = Ebooks::NLP.tokenize(tweet.text)
if tokens.find { |t| top100.include?(t) }
bot.favorite(tweet[:id])
favorite(tweet)
end
```

View file

@ -25,9 +25,12 @@ Usage:
ebooks auth
ebooks consume <corpus_path> [corpus_path2] [...]
ebooks consume-all <model_name> <corpus_path> [corpus_path2] [...]
ebooks append <model_name> <corpus_path>
ebooks gen <model_path> [input]
ebooks archive <username> [path]
ebooks sync <botname> [username]
ebooks tweet <model_path> <botname>
ebooks version
STR
def self.help(command=nil)
@ -91,7 +94,9 @@ STR
filename = File.basename(path)
shortname = filename.split('.')[0..-2].join('.')
FileUtils.mkdir_p(File.join(APP_PATH, 'model'))
outpath = File.join(APP_PATH, 'model', "#{shortname}.model")
Ebooks::Model.consume(path).save(outpath)
log "Corpus consumed to #{outpath}"
end
@ -115,6 +120,24 @@ STR
log "Corpuses consumed to #{outpath}"
end
HELP.append = <<-STR
Usage: ebooks append <model_name> <corpus_path>
Process then append the provided corpus to the model
instead of overwriting.
STR

# CLI entry point: consume <corpus_path> and merge the result into the
# already-saved model <model_name> instead of replacing it.
def self.append(name, path)
  unless name && path
    help :append
    exit 1
  end

  model_path = File.join(APP_PATH, 'model', "#{name}.model")
  Ebooks::Model.consume(path).append(model_path)
  log "Corpus appended to #{name}.model"
end
HELP.jsonify = <<-STR
Usage: ebooks jsonify <tweets.csv> [tweets.csv2] [...]
@ -189,6 +212,11 @@ STR
Output defaults to corpus/<username>.json
Due to API limitations, this can only receive up to ~3000 tweets
into the past.
The first time you run archive, you will need to enter the auth
details of some account to use for accessing the API. This info
will then be stored in ~/.ebooksrc for later use, and can be
modified there if needed.
STR
def self.archive(username, outpath=nil)
@ -200,6 +228,25 @@ STR
Ebooks::Archive.new(username, outpath).sync
end
HELP.sync = <<-STR
Usage: ebooks sync <botname> <username>
Copies and flips <username>'s avatar and cover photo, uploading them to <botname>'s profile.
Stores saved avatars and covers in image/.
STR

# CLI entry point: mirror <username>'s profile images onto <botname>.
# NOTE(review): only botname is validated here, but Ebooks::Sync.run needs
# a source username too; the top-level usage lists it as optional while this
# help text marks it required -- confirm the intended contract before
# tightening the guard.
def self.sync(botname, username)
  if botname.nil?
    help :sync
    exit 1
  end

  # bots.rb defines the configured bots so Ebooks::Bot.get can find <botname>.
  load File.join(APP_PATH, 'bots.rb')
  Ebooks::Sync::run(botname, username)
end
HELP.tweet = <<-STR
Usage: ebooks tweet <model_path> <botname>
@ -217,6 +264,10 @@ STR
model = Ebooks::Model.load(modelpath)
statement = model.make_statement
bot = Ebooks::Bot.get(botname)
if bot.nil?
log "No such bot configured in bots.rb: #{botname}"
exit 1
end
bot.configure
bot.tweet(statement)
end
@ -259,8 +310,8 @@ STR
access_token = request_token.get_access_token(oauth_verifier: pin)
log "Account authorized successfully. Make sure to put these in your bots.rb!\n" +
" access token: #{access_token.token}\n" +
" access token secret: #{access_token.secret}"
" bot.access_token = \"#{access_token.token}\"\n" +
" bot.access_token_secret = \"#{access_token.secret}\""
end
HELP.console = <<-STR
@ -275,6 +326,17 @@ STR
require 'pry'; Ebooks.module_exec { pry }
end
HELP.version = <<-STR
Usage: ebooks version
Shows you twitter_ebooks' version number.
STR

# Print the gem's version string to the log.
# Loads the version file by path relative to this script so the command
# works from a source checkout as well as an installed gem.
def self.version
  require File.expand_path('../../lib/twitter_ebooks/version', __FILE__)
  log Ebooks::VERSION
end
HELP.start = <<-STR
Usage: ebooks s[tart] [botname]
@ -368,8 +430,10 @@ STR
when "new" then new(args[1])
when "consume" then consume(args[1..-1])
when "consume-all" then consume_all(args[1], args[2..-1])
when "append" then append(args[1],args[2])
when "gen" then gen(args[1], args[2..-1].join(' '))
when "archive" then archive(args[1], args[2])
when "sync" then sync(args[1], args[2])
when "tweet" then tweet(args[1], args[2])
when "jsonify" then jsonify(args[1..-1])
when "auth" then auth
@ -378,6 +442,7 @@ STR
when "start" then start(args[1])
when "s" then start(args[1])
when "help" then help(args[1])
when "version" then version
else
log "No such command '#{args[0]}'"
help

View file

@ -16,6 +16,7 @@ end
require 'twitter_ebooks/nlp'
require 'twitter_ebooks/archive'
require 'twitter_ebooks/sync'
require 'twitter_ebooks/suffix'
require 'twitter_ebooks/model'
require 'twitter_ebooks/bot'

View file

@ -49,8 +49,9 @@ module Ebooks
@client = client || make_client
if File.exists?(@path)
@tweets = JSON.parse(File.read(@path, :encoding => 'utf-8'), symbolize_names: true)
if (File.exists?(@path) && !File.zero?(@path))
@filetext = File.read(@path, :encoding => 'utf-8')
@tweets = JSON.parse(@filetext, symbolize_names: true)
log "Currently #{@tweets.length} tweets for #{@username}"
else
@tweets.nil?
@ -59,6 +60,21 @@ module Ebooks
end
# Download new tweets and rewrite the archive file in place.
# We use this structure to ensure that
# a) if there's an issue opening the file, we error out before download
# b) if there's an issue during download we restore the original
def sync
  File.open(@path, 'w') do |file|
    begin
      sync_to(file)
    rescue Exception # deliberately broad: always re-raised below
      # Restore the original contents, then truncate in case the failed
      # partial write was longer than the restored text -- otherwise
      # trailing garbage from the aborted download would remain.
      # (@filetext is nil when the archive didn't exist yet; to_s makes
      # that an explicit empty restore.)
      file.seek(0)
      file.write(@filetext.to_s)
      file.truncate(file.pos)
      raise
    end
  end
end
def sync_to(file)
retries = 0
tweets = []
max_id = nil
@ -93,10 +109,8 @@ module Ebooks
@tweets = tweets.map(&:attrs).each { |tw|
tw.delete(:entities)
} + @tweets
File.open(@path, 'w') do |f|
f.write(JSON.pretty_generate(@tweets))
end
end
file.write(JSON.pretty_generate(@tweets))
end
end
end

View file

@ -2,6 +2,14 @@
require 'twitter'
require 'rufus/scheduler'
# Monkeypatch hack to fix upstream dependency issue
# https://github.com/sferik/twitter/issues/709
class HTTP::URI
  # Force the default HTTPS port.
  # NOTE(review): for non-https URIs this returns nil, discarding any
  # explicit port -- that appears to be the intent of the upstream
  # workaround, but confirm if this client ever makes non-https requests.
  def port
    443 if self.https?
  end
end
module Ebooks
class ConfigurationError < Exception
end
@ -29,12 +37,10 @@ module Ebooks
usertweets = @tweets.select { |t| t.user.screen_name.downcase == username.downcase }
if usertweets.length > 2
if (usertweets[-1].created_at - usertweets[-3].created_at) < 10
if username.include?('ebooks') || (usertweets[-1].created_at - usertweets[-3].created_at) < 12
return true
end
end
username.include?("ebooks")
end
# Figure out whether to keep this user in the reply prefix
@ -162,7 +168,7 @@ module Ebooks
# Look up a configured bot by username (case-insensitive).
# @param username [String]
# @return [Ebooks::Bot]
def self.get(username)
  target = username.downcase
  all.find { |bot| bot.username.downcase == target }
end
# Logs info to stdout in the context of this bot
@ -262,6 +268,12 @@ module Ebooks
return unless ev.text # If it's not a text-containing tweet, ignore it
return if ev.user.id == @user.id # Ignore our own tweets
if ev.retweet? && ev.retweeted_tweet.user.id == @user.id
# Someone retweeted our tweet!
fire(:retweet, ev)
return
end
meta = meta(ev)
if blacklisted?(ev.user.screen_name)
@ -363,7 +375,7 @@ module Ebooks
# Delay an action for a variable period of time
# @param range [Range, Integer] a Range picks a random number of seconds
#   to sleep; a plain Integer sleeps exactly that long
def delay(range=@delay_range, &b)
  # Previously an Integer argument left `time` nil and `sleep nil` raised
  # TypeError; use the exact value in that case instead.
  time = range.is_a?(Integer) ? range : rand(range)
  sleep time
  b.call
end

View file

@ -69,6 +69,35 @@ module Ebooks
self
end
# Append a generated model to existing model file instead of overwriting it
# @param path [String]
# @return [Ebooks::Model, nil] self on success, nil when no model exists
def append(path)
  unless File.file?(path)
    log "No existing model found at #{path}"
    return
  end

  # Read in and deserialize the previously saved model.
  props = Marshal.load(File.open(path, 'rb') { |old| old.read })

  # Merge the saved data into this model's own collections, then
  # overwrite the file with the combined model.
  File.open(path, 'wb') do |f|
    f.write(Marshal.dump({
      tokens: @tokens.concat(props[:tokens]),
      sentences: @sentences.concat(props[:sentences]),
      mentions: @mentions.concat(props[:mentions]),
      keywords: @keywords.concat(props[:keywords])
    }))
  end

  self
end
def initialize
@tokens = []
@ -80,7 +109,13 @@ module Ebooks
# Reverse-lookup a token's index ("tiki"), registering the token in
# @tokens the first time it is seen.
# @param token [String]
# @return [Integer]
def tikify(token)
  tiki = @tikis[token]
  return tiki unless tiki.nil?

  # Progress output for large corpora: report every 1000th new token.
  count = @tokens.length + 1
  puts "#{count} tokens" if (count % 1000).zero?

  @tokens << token
  @tikis[token] = @tokens.length - 1
end
# Convert a body of text into arrays of tikis
@ -143,18 +178,19 @@ module Ebooks
end
end
text = statements.join("\n")
mention_text = mentions.join("\n")
text = statements.join("\n").encode('UTF-8', :invalid => :replace)
mention_text = mentions.join("\n").encode('UTF-8', :invalid => :replace)
lines = nil; statements = nil; mentions = nil # Allow garbage collection
log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions"
log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions"
@sentences = mass_tikify(text)
@mentions = mass_tikify(mention_text)
log "Ranking keywords"
@keywords = NLP.keywords(text).top(200).map(&:to_s)
log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]}"
self
end
@ -218,14 +254,15 @@ module Ebooks
tweet = ""
while (tikis = generator.generate(3, :bigrams)) do
next if tikis.length <= 3 && !responding
break if valid_tweet?(tikis, limit)
#log "Attempting to produce tweet try #{retries+1}/#{retry_limit}"
break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit)
retries += 1
break if retries >= retry_limit
end
if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident
#log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}"
while (tikis = generator.generate(3, :unigrams)) do
break if valid_tweet?(tikis, limit) && !verbatim?(tikis)

View file

@ -1,6 +1,7 @@
# encoding: utf-8
require 'fast-stemmer'
require 'highscore'
require 'htmlentities'
module Ebooks
module NLP
@ -13,10 +14,10 @@ module Ebooks
# to be using it all of the time
# Lazily loads an array of stopwords
# Stopwords are common English words that should often be ignored
# Stopwords are common words that should often be ignored
# @return [Array<String>]
def self.stopwords
@stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
@stopwords ||= File.exists?('stopwords.txt') ? File.read('stopwords.txt').split : []
end
# Lazily loads an array of known English nouns
@ -42,7 +43,6 @@ module Ebooks
# Lazily load HTML entity decoder
# @return [HTMLEntities]
def self.htmlentities
require 'htmlentities'
@htmlentities ||= HTMLEntities.new
end
@ -99,7 +99,7 @@ module Ebooks
#set :vowels, 1 # => default: 0 = not considered
#set :consonants, 5 # => default: 0 = not considered
#set :ignore_case, true # => default: false
set :word_pattern, /(?<!@)(?<=\s)[\w']+/ # => default: /\w+/
set :word_pattern, /(?<!@)(?<=\s)[\p{Word}']+/ # => default: /\w+/
#set :stemming, true # => default: false
end

View file

@ -1,12 +1,15 @@
# encoding: utf-8
module Ebooks
# This generator uses data identical to a markov model, but
# This generator uses data similar to a Markov model, but
# instead of making a chain by looking up bigrams it uses the
# positions to randomly replace suffixes in one sentence with
# matching suffixes in another
# positions to randomly replace token array suffixes in one sentence
# with matching suffixes in another
class SuffixGenerator
# Build a generator from a corpus of tikified sentences
# "tikis" are token indexes-- a way of representing words
# and punctuation as their integer position in a big array
# of such tokens
# @param sentences [Array<Array<Integer>>]
# @return [SuffixGenerator]
def self.build(sentences)
@ -14,11 +17,14 @@ module Ebooks
end
def initialize(sentences)
@sentences = sentences.reject { |s| s.length < 2 }
@sentences = sentences.reject { |s| s.empty? }
@unigrams = {}
@bigrams = {}
@sentences.each_with_index do |tikis, i|
if (i % 10000 == 0) then
log ("Building: sentence #{i} of #{sentences.length}")
end
last_tiki = INTERIM
tikis.each_with_index do |tiki, j|
@unigrams[last_tiki] ||= []
@ -42,7 +48,6 @@ module Ebooks
self
end
# Generate a recombined sequence of tikis
# @param passes [Integer] number of times to recombine
# @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is)
@ -86,7 +91,11 @@ module Ebooks
break if variant
end
tikis = variant if variant
# If we failed to produce a variation from any alternative, there
# is no use running additional passes-- they'll have the same result.
break if variant.nil?
tikis = variant
end
tikis

View file

@ -0,0 +1,52 @@
#!/usr/bin/env ruby
# encoding: utf-8
require 'twitter'
require 'json'
require 'mini_magick'
require 'open-uri'
require 'pry'
module Ebooks
class Sync
  # Mirror <username>'s profile images onto the bot account <botname>:
  # download the avatar and banner, flip each vertically, save copies
  # under image/, and upload the results via the bot's Twitter client.
  # NOTE(review): Base64 is used without an explicit require here --
  # presumably loaded by a dependency; verify.
  def self.run(botname, username)
    bot = Ebooks::Bot.get(botname)
    bot.configure
    source_user = username
    ebooks_user = bot.username
    user = bot.twitter.user(source_user)

    if user.profile_image_url then
      Ebooks::Sync::get(user.profile_image_url(:original), "image/#{source_user}_avatar")
      avatar = MiniMagick::Image.open("image/#{source_user}_avatar")
      avatar.flip
      avatar.write("image/#{ebooks_user}_avatar")
      # The Twitter API expects the image payload base64-encoded.
      avatar64 = Base64.encode64(File.read("image/#{ebooks_user}_avatar"))
      bot.twitter.update_profile_image(avatar64)
      p "Updated profile image for #{ebooks_user} from #{source_user}."
    else
      p "#{source_user} does not have a profile image to clone."
    end

    if user.profile_banner_url then
      # NOTE(review): the downloaded banner filename has no underscore
      # ("...banner") while the avatar uses "_avatar". It is consistent
      # within this method, but confirm nothing else expects "_banner".
      Ebooks::Sync::get(user.profile_banner_url, "image/#{source_user}banner")
      banner = MiniMagick::Image.open("image/#{source_user}banner")
      banner.flip
      banner.write("image/#{ebooks_user}_banner")
      banner64 = Base64.encode64(File.read("image/#{ebooks_user}_banner"))
      bot.twitter.update_profile_banner(banner64)
      p "Updated cover image for #{ebooks_user} from #{source_user}."
    else
      p "#{source_user} does not have a cover image to clone."
    end
  end

  # Stream a URL's contents into a local file in binary mode.
  # NOTE(review): Kernel#open here is open-uri's URL-aware open; it also
  # executes commands for strings starting with "|". URLs come from the
  # Twitter API, but URI.open would be the safer modern form.
  def self.get(url, destination)
    File.open(destination, "wb") do |saved_file|
      open(url, "rb") do |read_file|
        saved_file.write(read_file.read)
      end
    end
  end
end
end

View file

@ -1,3 +1,3 @@
module Ebooks
VERSION = "3.1.0"
VERSION = "3.1.6"
end

View file

@ -51,6 +51,11 @@ class MyBot < Ebooks::Bot
# Follow user who just favorited bot's tweet
# follow(user.screen_name)
end
def on_retweet(tweet)
# Follow user who just retweeted bot's tweet
# follow(tweet.user.screen_name)
end
end
# Make a MyBot and attach it to an account

0
skeleton/image/.gitignore vendored Normal file
View file

View file

@ -36,7 +36,7 @@ describe Ebooks::Model do
report2 = MemoryUsage.report do
model = Ebooks::Model.load(file.path)
end
expect(report2.total_memsize).to be < 3000000
expect(report2.total_memsize).to be < 4000000
expect(model.tokens[0]).to be_a String
expect(model.sentences[0][0]).to be_a Fixnum
@ -70,5 +70,19 @@ describe Ebooks::Model do
file.unlink
end
it 'handles strange unicode edge-cases' do
  # Corpus consisting solely of astral-plane emoji, one per line.
  file = Tempfile.new('unicode')
  file.write("💞\n💞")
  file.close
  model = Ebooks::Model.consume(file.path)
  # Neither line starts with '@', so nothing is treated as a mention.
  expect(model.mentions.count).to eq 0
  expect(model.sentences.count).to eq 2
  file.unlink
  # Smoke test: statement generation must not raise on an emoji-only corpus.
  p model.make_statement
end
end
end

View file

@ -1,34 +1,37 @@
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/twitter_ebooks/version', __FILE__)
Gem::Specification.new do |gem|
gem.authors = ["Jaiden Mispy"]
gem.email = ["^_^@mispy.me"]
gem.description = %q{Markov chains for all your friends~}
gem.summary = %q{Markov chains for all your friends~}
gem.homepage = ""
gem.files = `git ls-files`.split($\)
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
gem.name = "twitter_ebooks"
gem.require_paths = ["lib"]
gem.version = Ebooks::VERSION
gem.add_development_dependency 'rspec'
gem.add_development_dependency 'rspec-mocks'
gem.add_development_dependency 'memory_profiler'
gem.add_development_dependency 'timecop'
gem.add_development_dependency 'pry-byebug'
gem.add_development_dependency 'yard'
gem.add_runtime_dependency 'twitter', '~> 5.0'
gem.add_runtime_dependency 'rufus-scheduler'
gem.add_runtime_dependency 'gingerice'
gem.add_runtime_dependency 'htmlentities'
gem.add_runtime_dependency 'engtagger'
gem.add_runtime_dependency 'fast-stemmer'
gem.add_runtime_dependency 'highscore'
gem.add_runtime_dependency 'pry'
gem.add_runtime_dependency 'oauth'
end
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/twitter_ebooks/version', __FILE__)

Gem::Specification.new do |gem|
  gem.required_ruby_version = '~> 2.1'
  gem.authors = ["Jaiden Mispy"]
  gem.email = ["^_^@mispy.me"]
  gem.description = %q{Markov chains for all your friends~}
  gem.summary = %q{Markov chains for all your friends~}
  gem.homepage = ""

  # Package every git-tracked file; executables are whatever lives in bin/.
  gem.files = `git ls-files`.split($\)
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
  gem.name = "twitter_ebooks"
  gem.require_paths = ["lib"]
  gem.version = Ebooks::VERSION

  # Development-only dependencies (tests, docs, debugging).
  gem.add_development_dependency 'rspec'
  gem.add_development_dependency 'rspec-mocks'
  gem.add_development_dependency 'memory_profiler'
  gem.add_development_dependency 'timecop'
  gem.add_development_dependency 'pry-byebug'
  gem.add_development_dependency 'yard'

  # Runtime dependencies.
  gem.add_runtime_dependency 'twitter', '~> 5.15'
  gem.add_runtime_dependency 'rufus-scheduler'
  gem.add_runtime_dependency 'gingerice'
  gem.add_runtime_dependency 'htmlentities'
  gem.add_runtime_dependency 'engtagger'
  gem.add_runtime_dependency 'fast-stemmer'
  gem.add_runtime_dependency 'highscore'
  gem.add_runtime_dependency 'pry'
  gem.add_runtime_dependency 'oauth'
  gem.add_runtime_dependency 'mini_magick'
end