2.0.8 -- different generation algorithm

commit 00f0228dd4 · parent e87dc5862b
24 changed files with 127 additions and 27 deletions
.gitignore (vendored) · 0 changes · Normal file → Executable file
Gemfile · 0 changes · Normal file → Executable file

Gemfile.lock · 6 changes · Normal file → Executable file
@@ -1,8 +1,7 @@
 PATH
   remote: .
   specs:
-    twitter_ebooks (2.0.3)
-      bloomfilter-rb
+    twitter_ebooks (2.0.7)
       engtagger
       fast-stemmer
       gingerice
@@ -19,8 +18,6 @@ GEM
     addressable (2.3.5)
     atomic (1.1.14)
     awesome_print (1.2.0)
-    bloomfilter-rb (2.1.1)
-      redis
     cookiejar (0.3.0)
     daemons (1.1.9)
     em-http-request (1.0.3)
@@ -50,7 +47,6 @@ GEM
     minitest (5.0.8)
     multi_json (1.8.2)
     multipart-post (1.2.0)
-    redis (3.0.5)
     rufus-scheduler (3.0.2)
       tzinfo
     simple_oauth (0.2.0)
LICENSE · 0 changes · Normal file → Executable file
NOTES.md · 0 changes · Normal file → Executable file

README.md · 2 changes · Normal file → Executable file
@@ -1,4 +1,4 @@
-# twitter\_ebooks 2.0.7
+# twitter\_ebooks 2.0.8
 
 Complete rewrite of twitter\_ebooks. Allows context-sensitive responsive bots via the Twitter streaming API, along with higher-quality ngram modeling. Still needs a bit of cleaning and documenting.
 
Rakefile · 0 changes · Normal file → Executable file

bin/ebooks (inferred; the file header was lost in rendering, but these hunks are the gem's CLI entry points) · 6 changes

@@ -46,9 +46,9 @@ module Ebooks
   def self.gen(model_path, input)
     model = Model.load(model_path)
     if input && !input.empty?
-      puts "@cmd " + model.markov_response(input, 135)
+      puts "@cmd " + model.make_response(input, 135)
     else
-      puts model.markov_statement
+      puts model.make_statement
     end
   end
 
@@ -64,7 +64,7 @@ module Ebooks
   def self.tweet(modelpath, username)
     load File.join(APP_PATH, 'bots.rb')
     model = Model.load(modelpath)
-    statement = model.markov_statement
+    statement = model.make_statement
     log "@#{username}: #{statement}"
     bot = Bot.get(username)
     bot.configure
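For orientation, this is how the renamed entry points are invoked; a minimal sketch, assuming this version of the gem is installed and that "corpus.model" is a hypothetical model file previously produced by Model.consume:

    require 'twitter_ebooks'

    model = Ebooks::Model.load("corpus.model")
    puts model.make_statement(140)             # unprompted statement, as in self.tweet
    puts model.make_response("hello bot", 135) # context-seeded reply, as in self.gen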
data/adjectives.txt · 0 changes · Normal file → Executable file
data/nouns.txt · 0 changes · Normal file → Executable file
data/stopwords.txt · 0 changes · Normal file → Executable file

lib/twitter_ebooks.rb · 1 change · Normal file → Executable file
@@ -16,5 +16,6 @@ end
 require 'twitter_ebooks/nlp'
 require 'twitter_ebooks/archiver'
 require 'twitter_ebooks/markov'
+require 'twitter_ebooks/suffix'
 require 'twitter_ebooks/model'
 require 'twitter_ebooks/bot'
lib/twitter_ebooks/archiver.rb · 0 changes · Normal file → Executable file
lib/twitter_ebooks/bot.rb · 0 changes · Normal file → Executable file

lib/twitter_ebooks/markov.rb · 3 changes · Normal file → Executable file
@@ -54,9 +54,10 @@ module Ebooks
 
     def chain(tokens)
       if tokens.length == 1
-        matches = @unigrams[tokens[0]]
+        matches = @unigrams[tokens[-1]]
       else
         matches = @bigrams[tokens[-2]][tokens[-1]]
+        matches = @unigrams[tokens[-1]] if matches.length < 2
       end
 
       if matches.empty?
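The second added line is the substantive one: when the current bigram context has fewer than two continuations, chain widens the pool to all unigram continuations of the last token. A self-contained toy illustration of that rule (the hashes below are invented stand-ins for the model's @bigrams and @unigrams):

    bigrams  = { "very" => { "good" => ["dog"] } }         # sparse context: one option
    unigrams = { "good" => ["dog", "cat", "day", "idea"] } # richer unigram pool

    tokens = ["very", "good"]
    matches = bigrams[tokens[-2]][tokens[-1]]
    matches = unigrams[tokens[-1]] if matches.length < 2   # the new fallback
    puts matches.sample                                     # drawn from the wider pool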
lib/twitter_ebooks/model.rb · 43 changes · Normal file → Executable file

@@ -7,7 +7,7 @@ require 'digest/md5'
 
 module Ebooks
   class Model
-    attr_accessor :hash, :sentences, :markov, :keywords
+    attr_accessor :hash, :sentences, :generator, :keywords
 
     def self.consume(txtpath)
       Model.new.consume(txtpath)
@@ -67,16 +67,29 @@ module Ebooks
       NLP.htmlentities.decode tweet
     end
 
-    def markov_statement(limit=140, markov=nil)
-      markov ||= MarkovModel.build(@sentences)
+    def valid_tweet?(tokens, limit)
+      tweet = NLP.reconstruct(tokens)
+      tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
+    end
+
+    def make_statement(limit=140, generator=nil)
+      responding = !generator.nil?
+      generator ||= SuffixGenerator.build(@sentences)
       tweet = ""
 
-      while (tweet = markov.generate) do
-        next if tweet.length > limit
-        next if NLP.unmatched_enclosers?(tweet)
-        break if tweet.length > limit*0.4 || rand > 0.8
+      while (tokens = generator.generate(3, :bigrams)) do
+        next if tokens.length <= 3 && !responding
+        break if valid_tweet?(tokens, limit)
       end
 
+      if @sentences.include?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
+        while (tokens = generator.generate(3, :unigrams)) do
+          break if valid_tweet?(tokens, limit) && !@sentences.include?(tokens)
+        end
+      end
+
+      tweet = NLP.reconstruct(tokens)
+
       fix tweet
     end
 
@@ -101,19 +114,19 @@ module Ebooks
     end
 
     # Generates a response by looking for related sentences
-    # in the corpus and building a smaller markov model from these
-    def markov_response(input, limit=140)
+    # in the corpus and building a smaller generator from these
+    def make_response(input, limit=140)
       # First try
       relevant, slightly_relevant = relevant_sentences(input)
 
       if relevant.length >= 3
-        markov = MarkovModel.new.consume(relevant)
-        markov_statement(limit, markov)
-      elsif slightly_relevant.length > 5
-        markov = MarkovModel.new.consume(slightly_relevant)
-        markov_statement(limit, markov)
+        generator = SuffixGenerator.build(relevant)
+        make_statement(limit, generator)
+      elsif slightly_relevant.length >= 5
+        generator = SuffixGenerator.build(slightly_relevant)
+        make_statement(limit, generator)
       else
-        markov_statement(limit)
+        make_statement(limit)
       end
     end
   end
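Both generation loops share one acceptance test: a candidate is kept only if its reconstruction fits the length limit and has no unmatched enclosers. Note also the responding flag: only replies (where the caller supplied a generator) may keep results of three tokens or fewer; unprompted statements are redrawn. A stand-alone sketch of the acceptance test, with the encloser check simplified to parentheses (the real NLP.unmatched_enclosers? presumably covers more delimiter pairs):

    def valid_tweet_stub?(tweet, limit)
      tweet.length <= limit && tweet.count("(") == tweet.count(")")
    end

    p valid_tweet_stub?("fits and balanced (yes)", 140) # => true
    p valid_tweet_stub?("unbalanced (oops", 140)        # => false
    p valid_tweet_stub?("x" * 200, 140)                 # => false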
lib/twitter_ebooks/nlp.rb · 9 changes · Normal file → Executable file

@@ -61,7 +61,7 @@
     # As above, this is ad hoc because tokenization libraries
     # do not behave well wrt. things like emoticons and timestamps
     def self.tokenize(sentence)
-      regex = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
+      regex = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/
       sentence.split(regex)
    end
 
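The change adds a \s requirement to both lookaround branches, so punctuation is only split off where it borders whitespace rather than anywhere it touches a letter. A quick probe of the two patterns (PUNCTUATION here is a stand-in for the module's real constant):

    PUNCTUATION = ".?!,-"
    old_re = /\s+|(?<=[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+)/
    new_re = /\s+|(?<=[#{PUNCTUATION}]\s)(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+\s)/

    s = "this ad-hoc fix works, mostly."
    p s.split(old_re) # => ["this", "ad", "-", "hoc", "fix", "works", ",", "mostly", "."]
    p s.split(new_re) # => ["this", "ad-hoc", "fix", "works", ",", "mostly."]

The in-word hyphen now survives; the trade-off is that punctuation with no trailing whitespace (like the final period above) stays attached to its token.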
@@ -150,5 +150,12 @@
 
       false
     end
+
+    # Determine if a2 is a subsequence of a1
+    def self.subseq?(a1, a2)
+      a1.each_index.find do |i|
+        a1[i...i+a2.length] == a2
+      end
+    end
   end
 end
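Despite the comment, the helper checks for a contiguous slice: it returns the start index (truthy) where a2 occurs inside a1, or nil if it never does. Restated standalone:

    def subseq?(a1, a2)
      a1.each_index.find { |i| a1[i...i + a2.length] == a2 }
    end

    p subseq?(%w[a b c d], %w[b c]) # => 1 (start index, truthy)
    p subseq?(%w[a b c d], %w[c b]) # => nil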
lib/twitter_ebooks/suffix.rb · 82 changes · new Executable file

@@ -0,0 +1,82 @@
+module Ebooks
+  class SuffixGenerator
+    def self.build(sentences)
+      SuffixGenerator.new(sentences)
+    end
+
+    def initialize(sentences)
+      @sentences = sentences.reject { |s| s.length < 2 }
+      @unigrams = {}
+      @bigrams = {}
+
+      @sentences.each_with_index do |tokens, i|
+        last_token = INTERIM
+        tokens.each_with_index do |token, j|
+          @unigrams[last_token] ||= []
+          @unigrams[last_token] << [i, j]
+
+          @bigrams[last_token] ||= {}
+          @bigrams[last_token][token] ||= []
+
+          if j == tokens.length-1 # Mark sentence endings
+            @unigrams[token] ||= []
+            @unigrams[token] << [i, INTERIM]
+            @bigrams[last_token][token] << [i, INTERIM]
+          else
+            @bigrams[last_token][token] << [i, j+1]
+          end
+
+          last_token = token
+        end
+      end
+
+      self
+    end
+
+    def generate(passes=5, n=:unigrams)
+      index = rand(@sentences.length)
+      tokens = @sentences[index]
+      used = [index] # Sentences we've already used
+      verbatim = [tokens] # Verbatim sentences to avoid reproducing
+
+      0.upto(passes-1) do
+        puts NLP.reconstruct(tokens)
+        varsites = {} # Map bigram start site => next token alternatives
+
+        tokens.each_with_index do |token, i|
+          next_token = tokens[i+1]
+          break if next_token.nil?
+
+          alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token]
+          alternatives.reject! { |a| a[1] == INTERIM || used.include?(a[0]) }
+          varsites[i] = alternatives unless alternatives.empty?
+        end
+
+        variant = nil
+        varsites.to_a.shuffle.each do |site|
+          start = site[0]
+
+          site[1].shuffle.each do |alt|
+            start, alt = site[0], site[1].sample
+            verbatim << @sentences[alt[0]]
+            suffix = @sentences[alt[0]][alt[1]..-1]
+            potential = tokens[0..start+1] + suffix
+
+            unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
+              used << alt[0]
+              variant = potential
+              break
+            end
+          end
+
+          break if variant
+        end
+
+        tokens = variant if variant
+      end
+
+
+      tokens
+    end
+  end
+end
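The generator can be driven directly to see the new algorithm in isolation; a minimal sketch, assuming this version of the gem is loaded and using invented sentences (input is an array of token arrays, the same shape Model keeps in @sentences). Note that generate prints each intermediate pass via the debugging puts above:

    require 'twitter_ebooks'

    sentences = [
      %w[the cat sat on the mat],
      %w[the dog slept on the porch],
      %w[a bird sang on the fence]
    ]

    generator = Ebooks::SuffixGenerator.build(sentences)
    tokens = generator.generate(5, :bigrams) # five mutation passes, bigram-keyed swaps
    puts Ebooks::NLP.reconstruct(tokens)

Each pass picks a random variation site, splices in the suffix of another sentence that shares the local context, and rejects splices that reproduce any seen sentence verbatim (via the new NLP.subseq?).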
lib/twitter_ebooks/version.rb · 2 changes · Normal file → Executable file

@@ -1,3 +1,3 @@
 module Ebooks
-  VERSION = "2.0.7"
+  VERSION = "2.0.8"
 end
skeleton/.gitignore (vendored) · 0 changes · Normal file → Executable file
skeleton/Procfile · 0 changes · Normal file → Executable file
skeleton/bots.rb · 0 changes · Normal file → Executable file
test/corpus/0xabad1dea.tweets · 0 changes · Normal file → Executable file
twitter_ebooks.gemspec · 0 changes · Normal file → Executable file