-
-
Save newtonmwai/3f7a6c3d620babcf2f7e605d7cc868d8 to your computer and use it in GitHub Desktop.
Files for cracking substitution ciphers.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/pypy | |
# Change the above #! if you want to use python (slower) rather than pypy | |
import random, re, sys, math | |
from segment import segment | |
# Usage: ./crack.py ciphertext | |
# where ciphertext is (you guessed it) a plaintext file containing the | |
# ciphertext | |
# Currently uses a hill-climbing algorithm, which performs surprisingly well. | |
# TODO In future, simulated annealing is supposed to work a lot better | |
# Code based upon (basically a fork of) | |
# http://practicalcryptography.com/media/cryptanalysis/files/ngram_score_1.py | |
ALPHABET = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' | |
ASCII_OFFSET = ord('A') | |
MAX_ITERATIONS = 1000 | |
# Read ciphertext using filename from cli argument | |
ciphertext = open(sys.argv[1]).read() | |
# Bigrams list, trigram, etc | |
class NGrams(object): | |
# Load and parse ngrams list | |
# File format should be | |
# aard 5 | |
# etc | |
def __init__(self, filename): | |
self.ngrams = {} | |
for line in open(filename): | |
char, count = line.split(" ") | |
self.ngrams[char] = int(count) | |
# The n in ngram | |
self.L = len(char) | |
# Total number of occurences | |
self.N = sum(self.ngrams.itervalues()) | |
# Turn count into a probability, then log it. Done to avoid rounding | |
# errors with small floating point numbers. | |
for char in self.ngrams.keys(): | |
self.ngrams[char] = math.log10(float(self.ngrams[char])/self.N) | |
# Don't want prob to be -infty | |
self.floor = math.log10(0.01/self.N) | |
# Score a given text by comparing it's ngrams | |
def score_text(self, text): | |
score = 0 | |
for i in range(len(text)-self.L+1): | |
# For each segment of the same length as the ngrams | |
# Add it's probability if it's in the list of ngrams | |
# Otherwise add the lowest prob | |
if text[i:i+self.L] in self.ngrams: | |
score += self.ngrams[text[i:i+self.L]] | |
else: score += self.floor | |
return score | |
# Given a ciphertext and possible key, decipher the text to return a possible | |
# plaintext | |
def decipher(text, key): | |
# Find an inverse to the key. e.g. A->B, C->D goes to B->A, D->C | |
inverse = [ALPHABET[key.index(i)%26] for i in ALPHABET] | |
plaintext = '' | |
for char in text: | |
# Only want to decode letters | |
# ord(char.upper())-ASCII_OFFSET just gives us 0..25 value of letter | |
if char.isalpha(): plaintext += inverse[ord(char.upper())-ASCII_OFFSET] | |
else: plaintext += char | |
return plaintext | |
# For the moment, just use quadgrams to score as that seems to work really well | |
# TODO Would adding in lower-n grams, or just using quintgrams give better | |
# performance (at a guess, quintgrams would, but be much slower)? | |
quadgrams = NGrams("quadgrams.txt") | |
def score_key(ciphertext, key): | |
# Decipher and remove anything that's not a letter | |
plaintext = decipher(ciphertext, key) | |
return quadgrams.score_text(re.sub('[^A-Z]', '', plaintext.upper())) | |
# Ready a plaintext for output | |
def format_plaintext(plaintext): | |
# If there are no spaces in the plaintext, segment it into words using | |
# j2kun's library | |
# Not 100% accurate at word segmentation, but a lot better than no | |
# segmentation at all | |
if not (" " in plaintext): | |
words = [] | |
for section in plaintext.upper().split("AND"): | |
words.append(" ".join(segment(section))) | |
plaintext = " and ".join(words) | |
return plaintext | |
# Run the whole decryption process multiple times, as it's a non-deterministic | |
# process | |
best_key = None | |
best_score = -99e99 | |
while 1: | |
# Start with an initial key and score it | |
key = list(ALPHABET) | |
random.shuffle(key) | |
score = score_key(ciphertext, key) | |
# Only change the key a 1000 times with no score improvement before stopping | |
count = 0 | |
while count < 1000: | |
# Create a new key by randomly swapping two positions | |
a = random.randint(0,25) | |
b = random.randint(0,25) | |
new_key = key[:] | |
new_key[a], new_key[b] = new_key[b], new_key[a] | |
# Score the new key | |
new_score = score_key(ciphertext, new_key) | |
# If the new key was better, replace the old one with it | |
if new_score > score: | |
score, key = new_score, new_key[:] | |
count = 0 # Restart the counter for detecting no score improvement | |
count += 1 | |
# Only use this key if it was better than the last algo iteration | |
if score > best_score: | |
best_score = score | |
best_key = key | |
print "=====================================================" | |
print format_plaintext(decipher(ciphertext, best_key)) | |
print "=====================================================" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Download from http://norvig.com/ngrams/count_1w.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Download from http://practicalcryptography.com/media/cryptanalysis/files/english_quadgrams.txt.zip |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From https://github.com/j2kun/substitution | |
#!/usr/bin/python | |
import functools, math | |
class OneGramDist(dict): | |
def __init__(self, filename): | |
self.gramCount = 0 | |
for line in open(filename): | |
(word, count) = line[:-1].split('\t') | |
self[word] = int(count) | |
self.gramCount += self[word] | |
def __call__(self, key): | |
if key in self: | |
return float(self[key]) / self.gramCount | |
else: | |
return 1.0 / (self.gramCount * 10**(len(key)-2)) | |
singleWordProb = OneGramDist('one-grams.txt') | |
def wordSeqFitness(words): | |
return sum(math.log10(singleWordProb(w)) for w in words) | |
def memoize(f): | |
cache = {} | |
def memoizedFunction(*args): | |
if args not in cache: | |
cache[args] = f(*args) | |
return cache[args] | |
memoizedFunction.cache = cache | |
return memoizedFunction | |
@memoize | |
def segment(word): | |
if not word: return [] | |
word = word.lower() # change to lower case | |
allSegmentations = [[first] + segment(rest) for (first,rest) in splitPairs(word)] | |
return max(allSegmentations, key = wordSeqFitness) | |
def splitPairs(word, maxLen=20): | |
return [(word[:i+1], word[i+1:]) for i in range(max(len(word), maxLen))] | |
@memoize | |
def segmentWithProb(word): | |
segmented = segment(word) | |
return (wordSeqFitness(segmented), segmented) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment