From 0c8ff1686eb238a0ef2f0979dad380a5f62908c4 Mon Sep 17 00:00:00 2001
From: Eryn Wells
Date: Sat, 7 Apr 2018 22:32:01 -0700
Subject: [PATCH] Move all the letter frequency stuff to its own module

---
 src/letter_frequency.rs | 96 +++++++++++++++++++++++++++++++++++++++++
 src/lib.rs              |  1 +
 src/xor.rs              | 32 ++------------
 3 files changed, 100 insertions(+), 29 deletions(-)
 create mode 100644 src/letter_frequency.rs

diff --git a/src/letter_frequency.rs b/src/letter_frequency.rs
new file mode 100644
index 0000000..cdb6f71
--- /dev/null
+++ b/src/letter_frequency.rs
@@ -0,0 +1,96 @@
+// letter_frequency.rs
+// Eryn Wells
+
+use std::collections::{HashMap, HashSet};
+use std::iter::FromIterator;
+
+pub type LetterSet = HashSet<String>;
+pub type FreqMap = HashMap<String, f32>;
+
+static ENGLISH_LETTERS: &'static str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+static ENGLISH_LETTER_FREQS: &'static [f32] = &[
+    0.08167, 0.01492, 0.02782, 0.04253, 0.12702, 0.02228, 0.02015, // A-G
+    0.06094, 0.06996, 0.00153, 0.00772, 0.04025, 0.02406, 0.06749, // H-N
+    0.07507, 0.01929, 0.00095, 0.05987, 0.06327, 0.09056, 0.02758, // O-U
+    0.00978, 0.02360, 0.00150, 0.01974, 0.00074 // V-Z
+];
+
+pub fn english_letters() -> LetterSet {
+    LetterSet::from_iter(ENGLISH_LETTERS.chars().map(|c| c.to_string()))
+}
+
+pub fn english_letter_freqs() -> FreqMap {
+    let char_strings = ENGLISH_LETTERS.chars().map(|c| c.to_string());
+    FreqMap::from_iter(char_strings.zip(ENGLISH_LETTER_FREQS.iter().map(|x| *x)))
+}
+
+pub trait LetterFreq {
+    fn letter_freqs(&self, lang: &str) -> FreqMap;
+    fn chi2_freqs(&self, lang: &str) -> f32;
+}
+
+impl<'a> LetterFreq for &'a str {
+    fn letter_freqs(&self, lang: &str) -> FreqMap {
+        assert_eq!(lang, "en", "only 'en' language is supported rn");
+        let english_letters = english_letters();
+        let mut freqs = FreqMap::new();
+        for c in self.chars() {
+            let c_str = c.to_uppercase().to_string();
+            if english_letters.contains(&c_str) {
+                *freqs.entry(c_str).or_insert(0f32) += 1f32;
+            }
+            println!("{:?}: {:?}", c, freqs);
+        }
+        freqs
+    }
+
+    fn chi2_freqs(&self, lang: &str) -> f32 {
+        assert_eq!(lang, "en", "only 'en' language is supported rn");
+        // Calculate chi-squared for this string, comparing actual frequencies vs. English letter frequencies.
+        // https://en.wikipedia.org/wiki/Letter_frequency
+        // https://crypto.stackexchange.com/questions/30209/developing-algorithm-for-detecting-plain-text-via-frequency-analysis
+        let freqs = self.letter_freqs(lang);
+        let english_freqs = english_letter_freqs();
+        let num_letters = freqs.values().sum::<f32>();
+        println!("freqs:{:?}, num:{}", freqs, num_letters);
+        let score = english_freqs.into_iter()
+            .map(|(c, sc)| (freqs.get(&c).map_or(0f32, |c| *c), sc * num_letters))
+            .inspect(|c| println!("{:?}", c))
+            .fold(0f32, |acc, (obs, exp)| acc + ((obs - exp).powf(2.0) / exp));
+        println!("chi2 -> {}", score);
+        score
+    }
+}
+
+impl LetterFreq for String {
+    fn letter_freqs(&self, lang: &str) -> FreqMap {
+        self.as_str().letter_freqs(lang)
+    }
+
+    fn chi2_freqs(&self, lang: &str) -> f32 {
+        self.as_str().chi2_freqs(lang)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn letter_freqs() {
+        let input = "I am a dog";
+        let mut expected_freqs = FreqMap::new();
+        expected_freqs.insert(String::from("I"), 1f32);
+        expected_freqs.insert(String::from("A"), 2f32);
+        expected_freqs.insert(String::from("M"), 1f32);
+        expected_freqs.insert(String::from("D"), 1f32);
+        expected_freqs.insert(String::from("O"), 1f32);
+        expected_freqs.insert(String::from("G"), 1f32);
+
+        let computed_freqs = input.letter_freqs("en");
+        assert_eq!(computed_freqs, expected_freqs);
+
+        let letters = computed_freqs.values().sum::<f32>();
+        assert_eq!(letters, 7f32);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index b5dba51..4ddcf34 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,4 @@
 pub mod b64;
 pub mod hex;
 pub mod xor;
+pub mod letter_frequency;
diff --git a/src/xor.rs b/src/xor.rs
index 63bf42c..36ac23f 100644
--- a/src/xor.rs
+++ b/src/xor.rs
@@ -32,8 +32,10 @@ impl<'a, T> SingleByteXORable for T
 #[cfg(test)]
 mod tests {
     use super::*;
+    use letter_frequency::LetterFreq;
     use hex::{HexDecodable, HexEncodable};
     use std::f32;
+    use std::ascii::AsciiExt;
 
     #[test]
     fn cryptopals() {
@@ -47,34 +49,6 @@ mod tests {
         assert_eq!(output, ex_output);
     }
 
-    static ENGLISH_LETTER_FREQS: &'static [f32] = &[
-        0.08167, 0.01492, 0.02782, 0.04253, 0.12702, 0.02228, 0.02015, // A-G
-        0.06094, 0.06996, 0.00153, 0.00772, 0.04025, 0.02406, 0.06749, // H-N
-        0.07507, 0.01929, 0.00095, 0.05987, 0.06327, 0.09056, 0.02758, // O-U
-        0.00978, 0.02360, 0.00150, 0.01974, 0.00074 // V-Z
-    ];
-
-    fn letter_freq_score(input: &str) -> f32 {
-        let mut freqs: Vec<f32> = iter::repeat(0f32).take(26).collect();
-        let mut num_alphabetic_chars = 0f32;
-        for c in input.chars().filter(char::is_ascii_alphabetic) {
-            num_alphabetic_chars += 1f32;
-            let c = c.to_ascii_uppercase();
-            freqs[c as usize - 'A' as usize] += 1f32;
-            println!("char:{} freqs:{:?}", c, freqs);
-        }
-        println!("freqs:{:?}", freqs);
-        // Calculate chi-squared for this string, comparing actual frequencies vs. English letter
-        // frequencies.
-        // https://en.wikipedia.org/wiki/Letter_frequency
-        // https://crypto.stackexchange.com/questions/30209/developing-algorithm-for-detecting-plain-text-via-frequency-analysis
-        let expected_freqs = ENGLISH_LETTER_FREQS.iter().map(|x| x * num_alphabetic_chars);
-        let score = freqs.into_iter()
-            .zip(expected_freqs)
-            .fold(0f32, |acc, (obs, exp)| acc + ((obs - exp).powf(2.0) / exp));
-        score
-    }
-
     #[test]
     fn cryptopals13() {
         let input = "1b37373331363f78151b7f2b783431333d78397828372d363c78373e783a393b3736";
@@ -86,7 +60,7 @@ mod tests {
                 .single_byte_xor(key)
                 .map(char::from)
                 .collect::<String>();
-            let score = letter_freq_score(&possible_output);
+            let score = possible_output.chi2_freqs("en");
             println!("{}: {:?} -> {}", key, possible_output, score);
             if !score.is_nan() && score < best_score {
                 best_score = score;