diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..e895a20 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Require \n style line endings +* text eol=lf diff --git a/.travis.yml b/.travis.yml index df0c8fd..e31cc43 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ rvm: - - 2.1.4 + - 2.1.7 script: - rspec spec notifications: diff --git a/README.md b/README.md index c8f135c..07d1c2b 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Note that 3.0 is not backwards compatible with 2.x, so upgrade carefully! In par ## Installation -Requires Ruby 2.0+ +Requires Ruby 2.1+. Ruby 2.3+ is recommended. ```bash gem install twitter_ebooks @@ -78,6 +78,16 @@ class MyBot < Ebooks::Bot # Reply to a tweet in the bot's timeline # reply(tweet, meta(tweet).reply_prefix + "nice tweet") end + + def on_favorite(user, tweet) + # Follow user who just favorited bot's tweet + # follow(user.screen_name) + end + + def on_retweet(tweet) + # Follow user who just retweeted bot's tweet + # follow(tweet.user.screen_name) + end end # Make a MyBot and attach it to an account @@ -135,10 +145,10 @@ The secondary function is the "interesting keywords" list. For example, I use th ``` ruby top100 = model.keywords.take(100) -tokens = Ebooks::NLP.tokenize(tweet[:text]) +tokens = Ebooks::NLP.tokenize(tweet.text) if tokens.find { |t| top100.include?(t) } - bot.favorite(tweet[:id]) + favorite(tweet) end ``` diff --git a/bin/ebooks b/bin/ebooks index 24a5120..9f2fc70 100755 --- a/bin/ebooks +++ b/bin/ebooks @@ -25,9 +25,12 @@ Usage: ebooks auth ebooks consume [corpus_path2] [...] ebooks consume-all [corpus_path2] [...] + ebooks append ebooks gen [input] ebooks archive [path] + ebooks sync [username] ebooks tweet + ebooks version STR def self.help(command=nil) @@ -91,7 +94,9 @@ STR filename = File.basename(path) shortname = filename.split('.')[0..-2].join('.') + FileUtils.mkdir_p(File.join(APP_PATH, 'model')) outpath = File.join(APP_PATH, 'model', "#{shortname}.model") + Ebooks::Model.consume(path).save(outpath) log "Corpus consumed to #{outpath}" end @@ -115,6 +120,24 @@ STR log "Corpuses consumed to #{outpath}" end + HELP.append = <<-STR + Usage: ebooks append + + Process then append the provided corpus to the model + instead of overwriting. + STR + + def self.append(name, path) + if !name || !path + help :append + exit 1 + end + + Ebooks::Model.consume(path).append(File.join(APP_PATH,'model',"#{name}.model")) + log "Corpus appended to #{name}.model" + end + + HELP.jsonify = <<-STR Usage: ebooks jsonify [tweets.csv2] [...] @@ -189,6 +212,11 @@ STR Output defaults to corpus/.json Due to API limitations, this can only receive up to ~3000 tweets into the past. + + The first time you run archive, you will need to enter the auth + details of some account to use for accessing the API. This info + will then be stored in ~/.ebooksrc for later use, and can be + modified there if needed. STR def self.archive(username, outpath=nil) @@ -200,6 +228,25 @@ STR Ebooks::Archive.new(username, outpath).sync end + HELP.sync = <<-STR + Usage: ebooks sync + + Copies and flips 's avatar and cover photo, uploading them to 's profile. + + Stores saved avatar's and covers in image/. + + STR + + def self.sync(botname, username) + if botname.nil? + help :sync + exit 1 + end + + load File.join(APP_PATH, 'bots.rb') + Ebooks::Sync::run(botname, username) + end + HELP.tweet = <<-STR Usage: ebooks tweet @@ -217,6 +264,10 @@ STR model = Ebooks::Model.load(modelpath) statement = model.make_statement bot = Ebooks::Bot.get(botname) + if bot.nil? + log "No such bot configured in bots.rb: #{botname}" + exit 1 + end bot.configure bot.tweet(statement) end @@ -259,8 +310,8 @@ STR access_token = request_token.get_access_token(oauth_verifier: pin) log "Account authorized successfully. Make sure to put these in your bots.rb!\n" + - " access token: #{access_token.token}\n" + - " access token secret: #{access_token.secret}" + " bot.access_token = \"#{access_token.token}\"\n" + + " bot.access_token_secret = \"#{access_token.secret}\"" end HELP.console = <<-STR @@ -275,6 +326,17 @@ STR require 'pry'; Ebooks.module_exec { pry } end + HELP.version = <<-STR + Usage: ebooks version + + Shows you twitter_ebooks' version number. + STR + + def self.version + require File.expand_path('../../lib/twitter_ebooks/version', __FILE__) + log Ebooks::VERSION + end + HELP.start = <<-STR Usage: ebooks s[tart] [botname] @@ -368,8 +430,10 @@ STR when "new" then new(args[1]) when "consume" then consume(args[1..-1]) when "consume-all" then consume_all(args[1], args[2..-1]) + when "append" then append(args[1],args[2]) when "gen" then gen(args[1], args[2..-1].join(' ')) when "archive" then archive(args[1], args[2]) + when "sync" then sync(args[1], args[2]) when "tweet" then tweet(args[1], args[2]) when "jsonify" then jsonify(args[1..-1]) when "auth" then auth @@ -378,6 +442,7 @@ STR when "start" then start(args[1]) when "s" then start(args[1]) when "help" then help(args[1]) + when "version" then version else log "No such command '#{args[0]}'" help diff --git a/lib/twitter_ebooks.rb b/lib/twitter_ebooks.rb index 9d9652f..f086418 100644 --- a/lib/twitter_ebooks.rb +++ b/lib/twitter_ebooks.rb @@ -16,6 +16,7 @@ end require 'twitter_ebooks/nlp' require 'twitter_ebooks/archive' +require 'twitter_ebooks/sync' require 'twitter_ebooks/suffix' require 'twitter_ebooks/model' require 'twitter_ebooks/bot' diff --git a/lib/twitter_ebooks/archive.rb b/lib/twitter_ebooks/archive.rb index 29829e2..d0776e3 100644 --- a/lib/twitter_ebooks/archive.rb +++ b/lib/twitter_ebooks/archive.rb @@ -49,8 +49,9 @@ module Ebooks @client = client || make_client - if File.exists?(@path) - @tweets = JSON.parse(File.read(@path, :encoding => 'utf-8'), symbolize_names: true) + if (File.exists?(@path) && !File.zero?(@path)) + @filetext = File.read(@path, :encoding => 'utf-8') + @tweets = JSON.parse(@filetext, symbolize_names: true) log "Currently #{@tweets.length} tweets for #{@username}" else @tweets.nil? @@ -59,6 +60,21 @@ module Ebooks end def sync + # We use this structure to ensure that + # a) if there's an issue opening the file, we error out before download + # b) if there's an issue during download we restore the original + File.open(@path, 'w') do |file| + begin + sync_to(file) + rescue Exception + file.seek(0) + file.write(@filetext) + raise + end + end + end + + def sync_to(file) retries = 0 tweets = [] max_id = nil @@ -93,10 +109,8 @@ module Ebooks @tweets = tweets.map(&:attrs).each { |tw| tw.delete(:entities) } + @tweets - File.open(@path, 'w') do |f| - f.write(JSON.pretty_generate(@tweets)) - end end + file.write(JSON.pretty_generate(@tweets)) end end end diff --git a/lib/twitter_ebooks/bot.rb b/lib/twitter_ebooks/bot.rb index 29e6814..64b0246 100644 --- a/lib/twitter_ebooks/bot.rb +++ b/lib/twitter_ebooks/bot.rb @@ -2,6 +2,14 @@ require 'twitter' require 'rufus/scheduler' +# Monkeypatch hack to fix upstream dependency issue +# https://github.com/sferik/twitter/issues/709 +class HTTP::URI + def port + 443 if self.https? + end +end + module Ebooks class ConfigurationError < Exception end @@ -29,12 +37,10 @@ module Ebooks usertweets = @tweets.select { |t| t.user.screen_name.downcase == username.downcase } if usertweets.length > 2 - if (usertweets[-1].created_at - usertweets[-3].created_at) < 10 + if username.include?('ebooks') || (usertweets[-1].created_at - usertweets[-3].created_at) < 12 return true end end - - username.include?("ebooks") end # Figure out whether to keep this user in the reply prefix @@ -162,7 +168,7 @@ module Ebooks # @param username [String] # @return [Ebooks::Bot] def self.get(username) - all.find { |bot| bot.username == username } + all.find { |bot| bot.username.downcase == username.downcase } end # Logs info to stdout in the context of this bot @@ -262,6 +268,12 @@ module Ebooks return unless ev.text # If it's not a text-containing tweet, ignore it return if ev.user.id == @user.id # Ignore our own tweets + if ev.retweet? && ev.retweeted_tweet.user.id == @user.id + # Someone retweeted our tweet! + fire(:retweet, ev) + return + end + meta = meta(ev) if blacklisted?(ev.user.screen_name) @@ -363,7 +375,7 @@ module Ebooks # Delay an action for a variable period of time # @param range [Range, Integer] range of seconds to choose for delay def delay(range=@delay_range, &b) - time = range.to_a.sample unless range.is_a? Integer + time = rand(range) unless range.is_a? Integer sleep time b.call end diff --git a/lib/twitter_ebooks/model.rb b/lib/twitter_ebooks/model.rb index b3bbb13..cfb56a5 100644 --- a/lib/twitter_ebooks/model.rb +++ b/lib/twitter_ebooks/model.rb @@ -69,6 +69,35 @@ module Ebooks self end + # Append a generated model to existing model file instead of overwriting it + # @param path [String] + def append(path) + existing = File.file?(path) + if !existing + log "No existing model found at #{path}" + return + else + #read-in and deserialize existing model + props = Marshal.load(File.open(path,'rb') { |old| old.read }) + old_tokens = props[:tokens] + old_sentences = props[:sentences] + old_mentions = props[:mentions] + old_keywords = props[:keywords] + + #append existing properties to new ones and overwrite with new model + File.open(path, 'wb') do |f| + f.write(Marshal.dump({ + tokens: @tokens.concat(old_tokens), + sentences: @sentences.concat(old_sentences), + mentions: @mentions.concat(old_mentions), + keywords: @keywords.concat(old_keywords) + })) + end + end + self + end + + def initialize @tokens = [] @@ -80,7 +109,13 @@ module Ebooks # @param token [String] # @return [Integer] def tikify(token) - @tikis[token] or (@tokens << token and @tikis[token] = @tokens.length-1) + if @tikis.has_key?(token) then + return @tikis[token] + else + (@tokens.length+1)%1000 == 0 and puts "#{@tokens.length+1} tokens" + @tokens << token + return @tikis[token] = @tokens.length-1 + end end # Convert a body of text into arrays of tikis @@ -143,18 +178,19 @@ module Ebooks end end - text = statements.join("\n") - mention_text = mentions.join("\n") + text = statements.join("\n").encode('UTF-8', :invalid => :replace) + mention_text = mentions.join("\n").encode('UTF-8', :invalid => :replace) lines = nil; statements = nil; mentions = nil # Allow garbage collection - log "Tokenizing #{text.count('\n')} statements and #{mention_text.count('\n')} mentions" + log "Tokenizing #{text.count("\n")} statements and #{mention_text.count("\n")} mentions" @sentences = mass_tikify(text) @mentions = mass_tikify(mention_text) log "Ranking keywords" @keywords = NLP.keywords(text).top(200).map(&:to_s) + log "Top keywords: #{@keywords[0]} #{@keywords[1]} #{@keywords[2]}" self end @@ -218,14 +254,15 @@ module Ebooks tweet = "" while (tikis = generator.generate(3, :bigrams)) do - next if tikis.length <= 3 && !responding - break if valid_tweet?(tikis, limit) + #log "Attempting to produce tweet try #{retries+1}/#{retry_limit}" + break if (tikis.length > 3 || responding) && valid_tweet?(tikis, limit) retries += 1 break if retries >= retry_limit end if verbatim?(tikis) && tikis.length > 3 # We made a verbatim tweet by accident + #log "Attempting to produce unigram tweet try #{retries+1}/#{retry_limit}" while (tikis = generator.generate(3, :unigrams)) do break if valid_tweet?(tikis, limit) && !verbatim?(tikis) diff --git a/lib/twitter_ebooks/nlp.rb b/lib/twitter_ebooks/nlp.rb index 541720b..f971f0c 100644 --- a/lib/twitter_ebooks/nlp.rb +++ b/lib/twitter_ebooks/nlp.rb @@ -1,6 +1,7 @@ # encoding: utf-8 require 'fast-stemmer' require 'highscore' +require 'htmlentities' module Ebooks module NLP @@ -13,10 +14,10 @@ module Ebooks # to be using it all of the time # Lazily loads an array of stopwords - # Stopwords are common English words that should often be ignored + # Stopwords are common words that should often be ignored # @return [Array] def self.stopwords - @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split + @stopwords ||= File.exists?('stopwords.txt') ? File.read('stopwords.txt').split : [] end # Lazily loads an array of known English nouns @@ -42,7 +43,6 @@ module Ebooks # Lazily load HTML entity decoder # @return [HTMLEntities] def self.htmlentities - require 'htmlentities' @htmlentities ||= HTMLEntities.new end @@ -99,7 +99,7 @@ module Ebooks #set :vowels, 1 # => default: 0 = not considered #set :consonants, 5 # => default: 0 = not considered #set :ignore_case, true # => default: false - set :word_pattern, /(? default: /\w+/ + set :word_pattern, /(? default: /\w+/ #set :stemming, true # => default: false end diff --git a/lib/twitter_ebooks/suffix.rb b/lib/twitter_ebooks/suffix.rb index ff57f97..5c250c0 100644 --- a/lib/twitter_ebooks/suffix.rb +++ b/lib/twitter_ebooks/suffix.rb @@ -1,12 +1,15 @@ # encoding: utf-8 module Ebooks - # This generator uses data identical to a markov model, but + # This generator uses data similar to a Markov model, but # instead of making a chain by looking up bigrams it uses the - # positions to randomly replace suffixes in one sentence with - # matching suffixes in another + # positions to randomly replace token array suffixes in one sentence + # with matching suffixes in another class SuffixGenerator # Build a generator from a corpus of tikified sentences + # "tikis" are token indexes-- a way of representing words + # and punctuation as their integer position in a big array + # of such tokens # @param sentences [Array>] # @return [SuffixGenerator] def self.build(sentences) @@ -14,11 +17,14 @@ module Ebooks end def initialize(sentences) - @sentences = sentences.reject { |s| s.length < 2 } + @sentences = sentences.reject { |s| s.empty? } @unigrams = {} @bigrams = {} @sentences.each_with_index do |tikis, i| + if (i % 10000 == 0) then + log ("Building: sentence #{i} of #{sentences.length}") + end last_tiki = INTERIM tikis.each_with_index do |tiki, j| @unigrams[last_tiki] ||= [] @@ -42,7 +48,6 @@ module Ebooks self end - # Generate a recombined sequence of tikis # @param passes [Integer] number of times to recombine # @param n [Symbol] :unigrams or :bigrams (affects how conservative the model is) @@ -86,7 +91,11 @@ module Ebooks break if variant end - tikis = variant if variant + # If we failed to produce a variation from any alternative, there + # is no use running additional passes-- they'll have the same result. + break if variant.nil? + + tikis = variant end tikis diff --git a/lib/twitter_ebooks/sync.rb b/lib/twitter_ebooks/sync.rb new file mode 100644 index 0000000..b3f2322 --- /dev/null +++ b/lib/twitter_ebooks/sync.rb @@ -0,0 +1,52 @@ +#!/usr/bin/env ruby +# encoding: utf-8 + +require 'twitter' +require 'json' +require 'mini_magick' +require 'open-uri' +require 'pry' + +module Ebooks + class Sync + + def self.run(botname, username) + bot = Ebooks::Bot.get(botname) + bot.configure + source_user = username + ebooks_user = bot.username + user = bot.twitter.user(source_user) + if user.profile_image_url then + Ebooks::Sync::get(user.profile_image_url(:original), "image/#{source_user}_avatar") + avatar = MiniMagick::Image.open("image/#{source_user}_avatar") + avatar.flip + avatar.write("image/#{ebooks_user}_avatar") + avatar64 = Base64.encode64(File.read("image/#{ebooks_user}_avatar")) + bot.twitter.update_profile_image(avatar64) + p "Updated profile image for #{ebooks_user} from #{source_user}." + else + p "#{source_user} does not have a profile image to clone." + end + if user.profile_banner_url then + Ebooks::Sync::get(user.profile_banner_url, "image/#{source_user}banner") + banner = MiniMagick::Image.open("image/#{source_user}banner") + banner.flip + banner.write("image/#{ebooks_user}_banner") + banner64 = Base64.encode64(File.read("image/#{ebooks_user}_banner")) + bot.twitter.update_profile_banner(banner64) + p "Updated cover image for #{ebooks_user} from #{source_user}." + else + p "#{source_user} does not have a cover image to clone." + end + end + + def self.get(url, destination) + File.open(destination, "wb") do |saved_file| + open(url, "rb") do |read_file| + saved_file.write(read_file.read) + end + end + end + + end +end diff --git a/lib/twitter_ebooks/version.rb b/lib/twitter_ebooks/version.rb index fe65e50..b4a936d 100644 --- a/lib/twitter_ebooks/version.rb +++ b/lib/twitter_ebooks/version.rb @@ -1,3 +1,3 @@ module Ebooks - VERSION = "3.1.0" + VERSION = "3.1.6" end diff --git a/skeleton/bots.rb b/skeleton/bots.rb index 1d1f313..12cdfa7 100644 --- a/skeleton/bots.rb +++ b/skeleton/bots.rb @@ -51,6 +51,11 @@ class MyBot < Ebooks::Bot # Follow user who just favorited bot's tweet # follow(user.screen_name) end + + def on_retweet(tweet) + # Follow user who just retweeted bot's tweet + # follow(tweet.user.screen_name) + end end # Make a MyBot and attach it to an account diff --git a/skeleton/image/.gitignore b/skeleton/image/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/data/stopwords.txt b/skeleton/stopwords.txt similarity index 100% rename from data/stopwords.txt rename to skeleton/stopwords.txt diff --git a/spec/model_spec.rb b/spec/model_spec.rb index a8a85f7..5ff9f39 100644 --- a/spec/model_spec.rb +++ b/spec/model_spec.rb @@ -36,7 +36,7 @@ describe Ebooks::Model do report2 = MemoryUsage.report do model = Ebooks::Model.load(file.path) end - expect(report2.total_memsize).to be < 3000000 + expect(report2.total_memsize).to be < 4000000 expect(model.tokens[0]).to be_a String expect(model.sentences[0][0]).to be_a Fixnum @@ -70,5 +70,19 @@ describe Ebooks::Model do file.unlink end + + it 'handles strange unicode edge-cases' do + file = Tempfile.new('unicode') + file.write("šŸ’ž\nšŸ’ž") + file.close + + model = Ebooks::Model.consume(file.path) + expect(model.mentions.count).to eq 0 + expect(model.sentences.count).to eq 2 + + file.unlink + + p model.make_statement + end end end diff --git a/twitter_ebooks.gemspec b/twitter_ebooks.gemspec index 9e2550a..3f3508c 100644 --- a/twitter_ebooks.gemspec +++ b/twitter_ebooks.gemspec @@ -1,34 +1,37 @@ -# -*- encoding: utf-8 -*- -require File.expand_path('../lib/twitter_ebooks/version', __FILE__) - -Gem::Specification.new do |gem| - gem.authors = ["Jaiden Mispy"] - gem.email = ["^_^@mispy.me"] - gem.description = %q{Markov chains for all your friends~} - gem.summary = %q{Markov chains for all your friends~} - gem.homepage = "" - - gem.files = `git ls-files`.split($\) - gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } - gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) - gem.name = "twitter_ebooks" - gem.require_paths = ["lib"] - gem.version = Ebooks::VERSION - - gem.add_development_dependency 'rspec' - gem.add_development_dependency 'rspec-mocks' - gem.add_development_dependency 'memory_profiler' - gem.add_development_dependency 'timecop' - gem.add_development_dependency 'pry-byebug' - gem.add_development_dependency 'yard' - - gem.add_runtime_dependency 'twitter', '~> 5.0' - gem.add_runtime_dependency 'rufus-scheduler' - gem.add_runtime_dependency 'gingerice' - gem.add_runtime_dependency 'htmlentities' - gem.add_runtime_dependency 'engtagger' - gem.add_runtime_dependency 'fast-stemmer' - gem.add_runtime_dependency 'highscore' - gem.add_runtime_dependency 'pry' - gem.add_runtime_dependency 'oauth' -end +# -*- encoding: utf-8 -*- +require File.expand_path('../lib/twitter_ebooks/version', __FILE__) + +Gem::Specification.new do |gem| + gem.required_ruby_version = '~> 2.1' + + gem.authors = ["Jaiden Mispy"] + gem.email = ["^_^@mispy.me"] + gem.description = %q{Markov chains for all your friends~} + gem.summary = %q{Markov chains for all your friends~} + gem.homepage = "" + + gem.files = `git ls-files`.split($\) + gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) } + gem.test_files = gem.files.grep(%r{^(test|spec|features)/}) + gem.name = "twitter_ebooks" + gem.require_paths = ["lib"] + gem.version = Ebooks::VERSION + + gem.add_development_dependency 'rspec' + gem.add_development_dependency 'rspec-mocks' + gem.add_development_dependency 'memory_profiler' + gem.add_development_dependency 'timecop' + gem.add_development_dependency 'pry-byebug' + gem.add_development_dependency 'yard' + + gem.add_runtime_dependency 'twitter', '~> 5.15' + gem.add_runtime_dependency 'rufus-scheduler' + gem.add_runtime_dependency 'gingerice' + gem.add_runtime_dependency 'htmlentities' + gem.add_runtime_dependency 'engtagger' + gem.add_runtime_dependency 'fast-stemmer' + gem.add_runtime_dependency 'highscore' + gem.add_runtime_dependency 'pry' + gem.add_runtime_dependency 'oauth' + gem.add_runtime_dependency 'mini_magick' +end