Last active
February 5, 2022 21:41
-
-
Save SCOTT-HAMILTON/568e619115343d17cd5045b36b3b3900 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from anytree import Node, RenderTree | |
from functools import reduce | |
from itertools import groupby | |
import numpy as np | |
import os | |
import struct | |
class InvalidFileHeader(Exception): | |
pass | |
def make_occurrences_table(cleartext): | |
return sorted( | |
map(lambda g: (g[0], len(list(g[1]))), groupby(sorted(cleartext), lambda x: x)), | |
key=lambda g: g[1], | |
) | |
def make_bin_tree(left, right): | |
n = Node("") | |
if left: | |
left.parent = n | |
if right: | |
right.parent = n | |
return n | |
def tree_2_path_dict(huff_tree): | |
def node_2_bin_path(node, orig=None, p=0, n=0): | |
if orig == None: | |
orig = node.name | |
bro = node.parent.children[0] | |
if node != bro: | |
b = p | |
p = p | (1 << n) | |
n += 1 | |
parent = node.parent | |
if parent and len(parent.siblings) > 0: | |
return node_2_bin_path(parent, orig, p, n) | |
else: | |
p = p | (1 << n) | |
return (orig, p) | |
return dict(map(node_2_bin_path, huff_tree.leaves)) | |
def encode(cleartext, path_dict): | |
def concat_bits(a, b): | |
return (a << (b.bit_length() - 1)) | (b & (2 ** (b.bit_length() - 1) - 1)) | |
return reduce(concat_bits, map(lambda c: path_dict[c], cleartext)) | |
def left(node): | |
if len(node.children) >= 1: | |
return node.children[0] | |
else: | |
return None | |
def right(node): | |
if len(node.children) >= 2: | |
return node.children[1] | |
else: | |
return None | |
def make_huffman_tree_2(oc_table): | |
def make_huff_tree_iter(nl): | |
if len(nl) % 2 == 0: | |
cnl = nl | |
last = None | |
else: | |
last = nl[-1] | |
cnl = nl[: len(nl) - 1] | |
n = list( | |
map( | |
lambda x: make_bin_tree(x[0], x[1]), | |
np.asarray(cnl).reshape(int(len(cnl) / 2), 2).tolist(), | |
) | |
) | |
if last: | |
n += [last] | |
return n | |
def make_complete_tree(nl): | |
while len(nl) > 1: | |
nl = make_huff_tree_iter(nl) | |
return nl[0] | |
def group2tree(g): | |
return make_complete_tree(list(map(lambda x: Node(x[0]), g))) | |
return make_complete_tree( | |
[group2tree(g) for _, g in groupby(oc_table, lambda x: x[1])] | |
) | |
def encode_to_file(text, out_file): | |
oc_table = make_occurrences_table(text) | |
huff_tree = make_huffman_tree_2(oc_table) | |
path_dict = tree_2_path_dict(huff_tree) | |
encoded = encode(text, path_dict) | |
l = ( | |
np.array( | |
list(map(lambda item: [ord(item[0]), item[1]], oc_table)) + [[0, 0]], | |
dtype="int32", | |
) | |
.flatten() | |
.tolist() | |
) | |
b = b"".join(list(map(lambda n: struct.pack("I", n), l))) | |
with open(out_file, "wb") as file: | |
file.write(b) | |
size = int(encoded.bit_length() / 8) + 1 | |
file.write(encoded.to_bytes(size, "little")) | |
def decode_from_file(in_file): | |
def read_oc_table(file): | |
prev = 0x01 | |
l = [] | |
while True: | |
try: | |
one = file.read(2) | |
two = file.read(2) | |
except OSError: | |
raise InvalidFileHeader | |
if one == b"\x00\x00" and two == b"\x00\x00": | |
break | |
else: | |
l += [struct.unpack("I", one + two)[0]] | |
if len(l) % 2 != 0: | |
raise InvalidFileHeader | |
else: | |
nchunks = int(len(l) / 2) | |
return list( | |
map( | |
lambda g: (chr(g[0]), g[1]), | |
(np.asarray(l).reshape(nchunks, 2).tolist()), | |
) | |
) | |
def read_data(file): | |
try: | |
while file.read(1) == b"\x00": | |
pass | |
file.seek(file.tell() - 1) | |
data = file.read() | |
return data | |
except OSError: | |
raise InvalidFileHeader | |
with open(in_file, "rb") as file: | |
oc_table = read_oc_table(file) | |
data = int.from_bytes(read_data(file), "little") | |
huff_tree = make_huffman_tree_2(oc_table) | |
cnode = huff_tree | |
decoded = "" | |
for c in range(2, data.bit_length() + 1): | |
def bit(n, a): | |
return (a >> (a.bit_length() - n)) & 1 | |
cb = bit(c, data) | |
p1 = cnode.path | |
cnode = right(cnode) if cb else left(cnode) | |
if cnode.is_leaf: | |
decoded += cnode.name | |
cnode = huff_tree | |
return decoded | |
text = """" | |
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Amet tellus cras adipiscing enim eu turpis egestas pretium. In ante metus dictum at tempor commodo ullamcorper. Nulla pharetra diam sit amet nisl suscipit adipiscing bibendum. Turpis nunc eget lorem dolor sed viverra. Egestas dui id ornare arcu odio ut sem. Commodo quis imperdiet massa tincidunt nunc pulvinar sapien et. Nunc non blandit massa enim. Morbi tincidunt ornare massa eget. Sed adipiscing diam donec adipiscing tristique risus nec feugiat in. Interdum varius sit amet mattis vulputate enim. In hendrerit gravida rutrum quisque non. Sed blandit libero volutpat sed cras. Posuere ac ut consequat semper viverra. Et ultrices neque ornare aenean euismod elementum nisi quis eleifend. Imperdiet sed euismod nisi porta lorem mollis aliquam. Adipiscing vitae proin sagittis nisl rhoncus. Vulputate eu scelerisque felis imperdiet proin fermentum leo vel orci. | |
""" | |
file = "out.bin" | |
encode_to_file(text, file) | |
decoded = decode_from_file(file) | |
print(f"decoded text=`{decoded}`") | |
fs = os.path.getsize(file) | |
pc = int(fs/len(text)*100) | |
print(f"{len(text)} bytes compressed to {fs} bytes ({pc}%).") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment