From 00f0228dd4d58789d215bfd16686643de041921e Mon Sep 17 00:00:00 2001 From: Mispy <^_^@mispy.me> Date: Thu, 14 Nov 2013 07:44:05 -0800 Subject: [PATCH] 2.0.8 -- different generation algorithm --- .gitignore | 0 Gemfile | 0 Gemfile.lock | 6 +-- LICENSE | 0 NOTES.md | 0 README.md | 2 +- Rakefile | 0 bin/ebooks | 6 +-- data/adjectives.txt | 0 data/nouns.txt | 0 data/stopwords.txt | 0 lib/twitter_ebooks.rb | 1 + lib/twitter_ebooks/archiver.rb | 0 lib/twitter_ebooks/bot.rb | 0 lib/twitter_ebooks/markov.rb | 3 +- lib/twitter_ebooks/model.rb | 43 +++++++++++------- lib/twitter_ebooks/nlp.rb | 9 +++- lib/twitter_ebooks/suffix.rb | 82 ++++++++++++++++++++++++++++++++++ lib/twitter_ebooks/version.rb | 2 +- skeleton/.gitignore | 0 skeleton/Procfile | 0 skeleton/bots.rb | 0 test/corpus/0xabad1dea.tweets | 0 twitter_ebooks.gemspec | 0 24 files changed, 127 insertions(+), 27 deletions(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 Gemfile mode change 100644 => 100755 Gemfile.lock mode change 100644 => 100755 LICENSE mode change 100644 => 100755 NOTES.md mode change 100644 => 100755 README.md mode change 100644 => 100755 Rakefile mode change 100644 => 100755 data/adjectives.txt mode change 100644 => 100755 data/nouns.txt mode change 100644 => 100755 data/stopwords.txt mode change 100644 => 100755 lib/twitter_ebooks.rb mode change 100644 => 100755 lib/twitter_ebooks/archiver.rb mode change 100644 => 100755 lib/twitter_ebooks/bot.rb mode change 100644 => 100755 lib/twitter_ebooks/markov.rb mode change 100644 => 100755 lib/twitter_ebooks/model.rb mode change 100644 => 100755 lib/twitter_ebooks/nlp.rb create mode 100755 lib/twitter_ebooks/suffix.rb mode change 100644 => 100755 lib/twitter_ebooks/version.rb mode change 100644 => 100755 skeleton/.gitignore mode change 100644 => 100755 skeleton/Procfile mode change 100644 => 100755 skeleton/bots.rb mode change 100644 => 100755 test/corpus/0xabad1dea.tweets mode change 100644 => 100755 twitter_ebooks.gemspec diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/Gemfile b/Gemfile old mode 100644 new mode 100755 diff --git a/Gemfile.lock b/Gemfile.lock old mode 100644 new mode 100755 index 99a4fc6..3c52f77 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,8 +1,7 @@ PATH remote: . specs: - twitter_ebooks (2.0.3) - bloomfilter-rb + twitter_ebooks (2.0.7) engtagger fast-stemmer gingerice @@ -19,8 +18,6 @@ GEM addressable (2.3.5) atomic (1.1.14) awesome_print (1.2.0) - bloomfilter-rb (2.1.1) - redis cookiejar (0.3.0) daemons (1.1.9) em-http-request (1.0.3) @@ -50,7 +47,6 @@ GEM minitest (5.0.8) multi_json (1.8.2) multipart-post (1.2.0) - redis (3.0.5) rufus-scheduler (3.0.2) tzinfo simple_oauth (0.2.0) diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/NOTES.md b/NOTES.md old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 6f6485a..90f584c --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# twitter\_ebooks 2.0.7 +# twitter\_ebooks 2.0.8 Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting. diff --git a/Rakefile b/Rakefile old mode 100644 new mode 100755 diff --git a/bin/ebooks b/bin/ebooks index a65d635..6b86635 100755 --- a/bin/ebooks +++ b/bin/ebooks @@ -46,9 +46,9 @@ module Ebooks def self.gen(model_path, input) model = Model.load(model_path) if input && !input.empty? - puts "@cmd " + model.markov_response(input, 135) + puts "@cmd " + model.make_response(input, 135) else - puts model.markov_statement + puts model.make_statement end end @@ -64,7 +64,7 @@ module Ebooks def self.tweet(modelpath, username) load File.join(APP_PATH, 'bots.rb') model = Model.load(modelpath) - statement = model.markov_statement + statement = model.make_statement log "@#{username}: #{statement}" bot = Bot.get(username) bot.configure diff --git a/data/adjectives.txt b/data/adjectives.txt old mode 100644 new mode 100755 diff --git a/data/nouns.txt b/data/nouns.txt old mode 100644 new mode 100755 diff --git a/data/stopwords.txt b/data/stopwords.txt old mode 100644 new mode 100755 diff --git a/lib/twitter_ebooks.rb b/lib/twitter_ebooks.rb old mode 100644 new mode 100755 index e994b17..a41c4eb --- a/lib/twitter_ebooks.rb +++ b/lib/twitter_ebooks.rb @@ -16,5 +16,6 @@ end require 'twitter_ebooks/nlp' require 'twitter_ebooks/archiver' require 'twitter_ebooks/markov' +require 'twitter_ebooks/suffix' require 'twitter_ebooks/model' require 'twitter_ebooks/bot' diff --git a/lib/twitter_ebooks/archiver.rb b/lib/twitter_ebooks/archiver.rb old mode 100644 new mode 100755 diff --git a/lib/twitter_ebooks/bot.rb b/lib/twitter_ebooks/bot.rb old mode 100644 new mode 100755 diff --git a/lib/twitter_ebooks/markov.rb b/lib/twitter_ebooks/markov.rb old mode 100644 new mode 100755 index 3607b62..ed66fad --- a/lib/twitter_ebooks/markov.rb +++ b/lib/twitter_ebooks/markov.rb @@ -54,9 +54,10 @@ module Ebooks def chain(tokens) if tokens.length == 1 - matches = @unigrams[tokens[0]] + matches = @unigrams[tokens[-1]] else matches = @bigrams[tokens[-2]][tokens[-1]] + matches = @unigrams[tokens[-1]] if matches.length < 2 end if matches.empty? diff --git a/lib/twitter_ebooks/model.rb b/lib/twitter_ebooks/model.rb old mode 100644 new mode 100755 index 4d787b9..ef07c53 --- a/lib/twitter_ebooks/model.rb +++ b/lib/twitter_ebooks/model.rb @@ -7,7 +7,7 @@ require 'digest/md5' module Ebooks class Model - attr_accessor :hash, :sentences, :markov, :keywords + attr_accessor :hash, :sentences, :generator, :keywords def self.consume(txtpath) Model.new.consume(txtpath) @@ -67,16 +67,29 @@ module Ebooks NLP.htmlentities.decode tweet end - def markov_statement(limit=140, markov=nil) - markov ||= MarkovModel.build(@sentences) + def valid_tweet?(tokens, limit) + tweet = NLP.reconstruct(tokens) + tweet.length <= limit && !NLP.unmatched_enclosers?(tweet) + end + + def make_statement(limit=140, generator=nil) + responding = !generator.nil? + generator = SuffixGenerator.build(@sentences) tweet = "" - while (tweet = markov.generate) do - next if tweet.length > limit - next if NLP.unmatched_enclosers?(tweet) - break if tweet.length > limit*0.4 || rand > 0.8 + while (tokens = generator.generate(3, :bigrams)) do + next if tokens.length <= 3 && !responding + break if valid_tweet?(tokens, limit) end + if @sentences.include?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident + while (tokens = generator.generate(3, :unigrams)) do + break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens) + end + end + + tweet = NLP.reconstruct(tokens) + fix tweet end @@ -101,19 +114,19 @@ module Ebooks end # Generates a response by looking for related sentences - # in the corpus and building a smaller markov model from these - def markov_response(input, limit=140) + # in the corpus and building a smaller generator from these + def make_response(input, limit=140) # First try relevant, slightly_relevant = relevant_sentences(input) if relevant.length >= 3 - markov = MarkovModel.new.consume(relevant) - markov_statement(limit, markov) - elsif slightly_relevant.length > 5 - markov = MarkovModel.new.consume(slightly_relevant) - markov_statement(limit, markov) + generator = SuffixGenerator.build(relevant) + make_statement(limit, generator) + elsif slightly_relevant.length >= 5 + generator = SuffixGenerator.build(slightly_relevant) + make_statement(limit, generator) else - markov_statement(limit) + make_statement(limit) end end end diff --git a/lib/twitter_ebooks/nlp.rb b/lib/twitter_ebooks/nlp.rb old mode 100644 new mode 100755 index bd8804a..b3262e7 --- a/lib/twitter_ebooks/nlp.rb +++ b/lib/twitter_ebooks/nlp.rb @@ -61,7 +61,7 @@ module Ebooks # As above, this is ad hoc because tokenization libraries # do not behave well wrt. things like emoticons and timestamps def self.tokenize(sentence) - regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/ + regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/ sentence.split(regex) end @@ -150,5 +150,12 @@ module Ebooks false end + + # Determine if a2 is a subsequence of a1 + def self.subseq?(a1, a2) + a1.each_index.find do |i| + a1[i...i+a2.length] == a2 + end + end end end diff --git a/lib/twitter_ebooks/suffix.rb b/lib/twitter_ebooks/suffix.rb new file mode 100755 index 0000000..6a84ac8 --- /dev/null +++ b/lib/twitter_ebooks/suffix.rb @@ -0,0 +1,82 @@ +module Ebooks + class SuffixGenerator + def self.build(sentences) + SuffixGenerator.new(sentences) + end + + def initialize(sentences) + @sentences = sentences.reject { |s| s.length < 2 } + @unigrams = {} + @bigrams = {} + + @sentences.each_with_index do |tokens, i| + last_token = INTERIM + tokens.each_with_index do |token, j| + @unigrams[last_token] ||= [] + @unigrams[last_token] << [i, j] + + @bigrams[last_token] ||= {} + @bigrams[last_token][token] ||= [] + + if j == tokens.length-1 # Mark sentence endings + @unigrams[token] ||= [] + @unigrams[token] << [i, INTERIM] + @bigrams[last_token][token] << [i, INTERIM] + else + @bigrams[last_token][token] << [i, j+1] + end + + last_token = token + end + end + + self + end + + def generate(passes=5, n=:unigrams) + index = rand(@sentences.length) + tokens = @sentences[index] + used = [index] # Sentences we've already used + verbatim = [tokens] # Verbatim sentences to avoid reproducing + + 0.upto(passes-1) do + puts NLP.reconstruct(tokens) + varsites = {} # Map bigram start site => next token alternatives + + tokens.each_with_index do |token, i| + next_token = tokens[i+1] + break if next_token.nil? + + alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token] + alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) } + varsites[i] = alternatives unless alternatives.empty? + end + + variant = nil + varsites.to_a.shuffle.each do |site| + start = site[0] + + site[1].shuffle.each do |alt| + start, alt = site[0], site[1].sample + verbatim << @sentences[alt[0]] + suffix = @sentences[alt[0]][alt[1]..-1] + potential = tokens[0..start+1] + suffix + + unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) } + used << alt[0] + variant = potential + break + end + end + + break if variant + end + + tokens = variant if variant + end + + + tokens + end + end +end diff --git a/lib/twitter_ebooks/version.rb b/lib/twitter_ebooks/version.rb old mode 100644 new mode 100755 index c39a21d..0f2d7b7 --- a/lib/twitter_ebooks/version.rb +++ b/lib/twitter_ebooks/version.rb @@ -1,3 +1,3 @@ module Ebooks - VERSION = "2.0.7" + VERSION = "2.0.8" end diff --git a/skeleton/.gitignore b/skeleton/.gitignore old mode 100644 new mode 100755 diff --git a/skeleton/Procfile b/skeleton/Procfile old mode 100644 new mode 100755 diff --git a/skeleton/bots.rb b/skeleton/bots.rb old mode 100644 new mode 100755 diff --git a/test/corpus/0xabad1dea.tweets b/test/corpus/0xabad1dea.tweets old mode 100644 new mode 100755 diff --git a/twitter_ebooks.gemspec b/twitter_ebooks.gemspec old mode 100644 new mode 100755