Move all the letter frequency stuff to its own module

This commit is contained in:
Eryn Wells 2018-04-07 22:32:01 -07:00
parent 1e1bd0c7c1
commit 0c8ff1686e
3 changed files with 100 additions and 29 deletions

96
src/letter_frequency.rs Normal file
View file

@ -0,0 +1,96 @@
// letter_frequency.rs
// Eryn Wells <eryn@erynwells.me>
use std::collections::{HashMap, HashSet};
use std::iter::FromIterator;
/// A set of letters, each letter stored as its own `String`.
pub type LetterSet = HashSet<String>;
/// Maps a letter (stored as a `String`) to an `f32` value. This file uses it both for relative
/// frequencies (see `english_letter_freqs`) and for raw occurrence counts (see `LetterFreq`).
pub type FreqMap = HashMap<String, f32>;
/// The uppercase letters of the English alphabet, in alphabetical order.
static ENGLISH_LETTERS: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

/// Relative frequency of each letter in English text, as a fraction of all letters.
///
/// Entries are in the same order as `ENGLISH_LETTERS` (index 0 is A, index 25 is Z) and sum to
/// approximately 1.0. Values follow the table at https://en.wikipedia.org/wiki/Letter_frequency
static ENGLISH_LETTER_FREQS: &[f32] = &[
    0.08167, 0.01492, 0.02782, 0.04253, 0.12702, 0.02228, 0.02015, // A-G
    0.06094, 0.06966, 0.00153, 0.00772, 0.04025, 0.02406, 0.06749, // H-N
    0.07507, 0.01929, 0.00095, 0.05987, 0.06327, 0.09056, 0.02758, // O-U
    0.00978, 0.02360, 0.00150, 0.01974, 0.00074, // V-Z
];
/// Builds the set of English letters, one single-character uppercase `String` per letter of
/// `ENGLISH_LETTERS`.
pub fn english_letters() -> LetterSet {
    ENGLISH_LETTERS
        .chars()
        .map(|letter| letter.to_string())
        .collect()
}
/// Builds a map from each letter of `ENGLISH_LETTERS` (as a single-character `String`) to the
/// entry at the same position in `ENGLISH_LETTER_FREQS`.
pub fn english_letter_freqs() -> FreqMap {
    let pairs = ENGLISH_LETTERS
        .chars()
        .zip(ENGLISH_LETTER_FREQS.iter())
        .map(|(letter, &freq)| (letter.to_string(), freq));
    FreqMap::from_iter(pairs)
}
/// Letter-frequency analysis of a piece of text.
///
/// `lang` names the language to compare against. The implementations in this file accept only
/// `"en"` and panic on any other value.
pub trait LetterFreq {
/// Returns the number of occurrences of each letter of `lang`'s alphabet in `self`, keyed by
/// letter.
fn letter_freqs(&self, lang: &str) -> FreqMap;
/// Returns a chi-squared score comparing the letter counts of `self` with the counts expected
/// from `lang`'s letter frequencies; a smaller score means a closer match.
fn chi2_freqs(&self, lang: &str) -> f32;
}
impl<'a> LetterFreq for &'a str {
    /// Counts how many times each English letter occurs in this string.
    ///
    /// ASCII letters are matched case-insensitively and reported under their uppercase form as a
    /// single-character `String` key. All other characters are ignored, and letters that never
    /// occur have no entry in the returned map.
    ///
    /// # Panics
    ///
    /// Panics if `lang` is not `"en"`.
    fn letter_freqs(&self, lang: &str) -> FreqMap {
        assert_eq!(lang, "en", "only 'en' language is supported rn");
        let english_letters = english_letters();
        let mut freqs = FreqMap::new();
        for c in self.chars() {
            // Fold case with ASCII rules only. Full Unicode uppercasing maps characters such as
            // 'ı' (U+0131) to 'I' and 'ſ' (U+017F) to 'S', which would wrongly count them as
            // English letters.
            let c_str = c.to_ascii_uppercase().to_string();
            if english_letters.contains(&c_str) {
                *freqs.entry(c_str).or_insert(0f32) += 1f32;
            }
        }
        freqs
    }

    /// Computes the chi-squared statistic of this string's letter counts against the counts
    /// expected for English text with the same number of letters. Lower means closer to English.
    ///
    /// Returns NaN when the string contains no English letters, because every expected count is
    /// then zero (0/0); callers should check `is_nan()` before comparing scores.
    ///
    /// # Panics
    ///
    /// Panics if `lang` is not `"en"`.
    fn chi2_freqs(&self, lang: &str) -> f32 {
        assert_eq!(lang, "en", "only 'en' language is supported rn");
        // Calculate chi-squared for this string, comparing actual frequencies vs. English letter
        // frequencies.
        // https://en.wikipedia.org/wiki/Letter_frequency
        // https://crypto.stackexchange.com/questions/30209/developing-algorithm-for-detecting-plain-text-via-frequency-analysis
        let freqs = self.letter_freqs(lang);
        let num_letters = freqs.values().sum::<f32>();
        // HashMap iteration order is unspecified and f32 addition is not associative, so visit
        // the letters in sorted order to make the score reproducible from run to run.
        let mut expected: Vec<(String, f32)> = english_letter_freqs().into_iter().collect();
        expected.sort_unstable_by(|a, b| a.0.cmp(&b.0));
        expected.into_iter().fold(0f32, |acc, (letter, rel_freq)| {
            let obs = freqs.get(&letter).cloned().unwrap_or(0f32);
            // Expected count = relative English frequency * number of letters observed.
            let exp = rel_freq * num_letters;
            acc + (obs - exp).powi(2) / exp
        })
    }
}
/// `String` delegates to the `&str` implementation by borrowing its contents.
impl LetterFreq for String {
    /// See the `&str` implementation of `letter_freqs`.
    fn letter_freqs(&self, lang: &str) -> FreqMap {
        let text: &str = self.as_str();
        <&str as LetterFreq>::letter_freqs(&text, lang)
    }

    /// See the `&str` implementation of `chi2_freqs`.
    fn chi2_freqs(&self, lang: &str) -> f32 {
        let text: &str = self.as_str();
        <&str as LetterFreq>::chi2_freqs(&text, lang)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// "I am a dog" has seven letters: A twice, and I, M, D, O and G once each, with
    /// lowercase input reported under uppercase keys.
    #[test]
    fn letter_freqs() {
        let expected: FreqMap = [
            ("I", 1f32),
            ("A", 2f32),
            ("M", 1f32),
            ("D", 1f32),
            ("O", 1f32),
            ("G", 1f32),
        ]
        .iter()
        .map(|&(letter, count)| (String::from(letter), count))
        .collect();

        let text = "I am a dog";
        let actual = text.letter_freqs("en");
        assert_eq!(actual, expected);

        let total: f32 = actual.values().sum();
        assert_eq!(total, 7f32);
    }
}

View file

@ -1,3 +1,4 @@
pub mod b64; pub mod b64;
pub mod hex; pub mod hex;
pub mod xor; pub mod xor;
pub mod letter_frequency;

View file

@ -32,8 +32,10 @@ impl<'a, T> SingleByteXORable<T> for T
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use letter_frequency::LetterFreq;
use hex::{HexDecodable, HexEncodable}; use hex::{HexDecodable, HexEncodable};
use std::f32; use std::f32;
use std::ascii::AsciiExt;
#[test] #[test]
fn cryptopals() { fn cryptopals() {
@ -47,34 +49,6 @@ mod tests {
assert_eq!(output, ex_output); assert_eq!(output, ex_output);
} }
// Relative frequency of each English letter, indexed A (0) through Z (25).
// NOTE(review): the entry for I reads 0.06996, while the commonly published value is 0.06966 -- confirm against the reference table.
static ENGLISH_LETTER_FREQS: &'static [f32] = &[
0.08167, 0.01492, 0.02782, 0.04253, 0.12702, 0.02228, 0.02015, // A-G
0.06094, 0.06996, 0.00153, 0.00772, 0.04025, 0.02406, 0.06749, // H-N
0.07507, 0.01929, 0.00095, 0.05987, 0.06327, 0.09056, 0.02758, // O-U
0.00978, 0.02360, 0.00150, 0.01974, 0.00074 // V-Z
];
// Scores `input` by how closely its letter distribution matches English; lower is closer.
// Only ASCII alphabetic characters are counted, case-insensitively; everything else is ignored.
// When `input` has no ASCII letters every expected count is 0, so the result is NaN (0/0).
fn letter_freq_score(input: &str) -> f32 {
// One counter per letter: index 0 is 'A', index 25 is 'Z'.
let mut freqs: Vec<f32> = iter::repeat(0f32).take(26).collect();
let mut num_alphabetic_chars = 0f32;
for c in input.chars().filter(char::is_ascii_alphabetic) {
num_alphabetic_chars += 1f32;
let c = c.to_ascii_uppercase();
// The filter above guarantees `c` is now in 'A'..='Z', so the index is within 0..26.
freqs[c as usize - 'A' as usize] += 1f32;
// Debug trace of the running counts after each letter.
println!("char:{} freqs:{:?}", c, freqs);
}
println!("freqs:{:?}", freqs);
// Calculate chi-squared for this string, comparing actual frequencies vs. English letter
// frequencies.
// https://en.wikipedia.org/wiki/Letter_frequency
// https://crypto.stackexchange.com/questions/30209/developing-algorithm-for-detecting-plain-text-via-frequency-analysis
// Expected count per letter = relative English frequency * number of letters counted.
let expected_freqs = ENGLISH_LETTER_FREQS.iter().map(|x| x * num_alphabetic_chars);
let score = freqs.into_iter()
.zip(expected_freqs)
.fold(0f32, |acc, (obs, exp)| acc + ((obs - exp).powf(2.0) / exp));
score
}
#[test] #[test]
fn cryptopals13() { fn cryptopals13() {
let input = "1b37373331363f78151b7f2b783431333d78397828372d363c78373e783a393b3736"; let input = "1b37373331363f78151b7f2b783431333d78397828372d363c78373e783a393b3736";
@ -86,7 +60,7 @@ mod tests {
.single_byte_xor(key) .single_byte_xor(key)
.map(char::from) .map(char::from)
.collect::<String>(); .collect::<String>();
let score = letter_freq_score(&possible_output); let score = possible_output.chi2_freqs("en");
println!("{}: {:?} -> {}", key, possible_output, score); println!("{}: {:?} -> {}", key, possible_output, score);
if !score.is_nan() && score < best_score { if !score.is_nan() && score < best_score {
best_score = score; best_score = score;