description + simplified implementation of the (cursed) compression scheme of the popular lz-string library

lz-string is a very popular library that compresses UTF-16 strings using a variation of the LZ78 algorithm in which literals are only encoded once (and referenced through dictionary indexes afterwards).

Despite its impressive 10 million downloads per week at the time of this writing, there is no official documentation of the wire format implemented by this library, and the horrible code quality makes the format hard to deduce from the code itself. Furthermore, the code contains many bugs / gotchas, some of which are enumerated below. This is probably why many of the numerous ports of the library blindly copy its code, doing little more than adapting syntax.

This gist provides a description of the format, as well as a simplified (but unoptimized) reference implementation in Python.

Wire format

The bytes of a compressed blob are interpreted as a bitstream in big-endian order (meaning, MSB-first). This bitstream contains fixed-size integers in little-endian (LSB-first) order; they are "fixed-size" in the sense that their sizes are known to the code rather than signalled in the bitstream, but different integers have different sizes (see below).
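As a sketch of what this means in code (the helper name read_bits_le is mine, just for illustration; the reference implementation below does the same job with its BitReader class):

def read_bits_le(data: bytes, bit_offset: int, size: int) -> int:
    ''' read a `size`-bit LSB-first integer starting at `bit_offset` of an MSB-first bitstream '''
    result = 0
    for i in range(size):
        byte, bit = divmod(bit_offset + i, 8)
        stream_bit = (data[byte] >> (7 - bit)) & 1  # bits are consumed MSB-first within each byte...
        result |= stream_bit << i                   # ...but assembled into the integer LSB-first
    return result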

The bitstream consists of a sequence of packets, each of which begins with an opcode integer determining its type and payload:

  • Opcodes 0 and 1 are followed by a literal integer of 8 or 16 bits respectively. They are emitted when the encoder finds a UTF-16 code unit it hasn't seen before.

  • Opcode 2 marks the end of the stream, which is then zero-padded as needed to complete the last byte.

  • Opcodes 3 and above refer to an entry in the dictionary, whose index is given by opcode - 3. They do not carry any payload.

The size of each opcode is as large as needed so that all valid opcodes at that point can be represented. In other words, an opcode is encoded in $\lceil \log_2 (M+1) \rceil$ bits, where $M$ is the highest value that this particular opcode is allowed to contain. This depends on the size of the dictionary at that point, which couples the wire format to the compression algorithm (explained below).

To decouple the two, we can say that $M = 2$ when reading the first opcode (since there are no entries in the dictionary yet), and that $M$ increases by 2 for every literal packet consumed and by 1 for every dictionary index packet consumed. This is what the reference implementation does.

Example: compression of the string "ABBAB".
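Traced by hand from the rules above and the reference implementation below (so treat the exact numbers as unofficial), the example works out as follows:

  • literal 'A' (0x41): opcode 0 in 2 bits ($M = 2$), followed by the 8-bit literal. The dictionary gains entry 0 = "A".
  • literal 'B' (0x42): opcode 0 in 3 bits ($M = 4$), followed by the 8-bit literal. The dictionary gains entries 1 = "B" and 2 = "AB".
  • dictionary index 1 ("B"): opcode 4 in 3 bits ($M = 6$). The dictionary gains entry 3 = "BB".
  • dictionary index 2 ("AB"): opcode 5 in 3 bits ($M = 7$). The dictionary gains entry 4 = "BA".
  • end of stream: opcode 2 in 4 bits ($M = 8$), plus 1 bit of zero padding to complete the last byte.

That is 31 bits of payload, which (if I have packed them correctly) serialize to the 4 bytes 20 82 11 A8.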

Compression scheme

Because it was written in JS (and more than a decade ago, when TextEncoder wasn't a thing), this scheme compresses UTF-16 strings rather than binary data. More specifically, it compresses a list of UTF-16 code units (i.e. 16-bit integers) into packets, which are then formatted into a bitstream / byte buffer as described above.
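To make "a list of UTF-16 code units" concrete, here is one way to obtain it from a Python string (the helper name to_code_units is mine; the "Main API" section of the reference implementation does the equivalent with array('H')). Note that characters outside the BMP become surrogate pairs:

def to_code_units(s: str) -> list[int]:
    ''' turn a Python string into the list of UTF-16 code units that gets compressed '''
    raw = s.encode('utf-16-le')  # no BOM
    return [int.from_bytes(raw[i:i+2], 'little') for i in range(0, len(raw), 2)]

to_code_units('ABBAB')  # [0x41, 0x42, 0x42, 0x41, 0x42]
to_code_units('🙂')     # [0xD83D, 0xDE42] (a surrogate pair)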

The decoder starts with an empty dictionary. When a packet (that isn't the EOF marker) is received, a chunk of code units is emitted to the output. This chunk depends on the packet:

  • If the packet is a literal, then the chunk consists of a single code unit (the literal). In this case, the next available index in the dictionary is assigned to this chunk.

  • If the packet is a dictionary index, then the chunk consists of the dictionary entry given by that index.

    • Special case: if the index is equal to the dictionary size (i.e. the first unassigned index) then the chunk is equal to the last emitted chunk plus the first code unit of that same chunk. (Such an index is only valid after the first packet, when there is a "last emitted chunk".)

Before emitting the chunk to the output, the decoder assigns the next available index in the dictionary to the last emitted chunk plus the first code unit of the chunk it's about to emit. (This is not done for the first chunk, when there is no "last emitted chunk".)

Thus, a literal packet (except the first one) results in 2 new entries assigned to the dictionary: first, one containing just the literal, then another containing the previously emitted chunk plus the literal. A dictionary index packet (which can never be the first one) results in only 1 new entry in the dictionary.

The "special case" above allows the compressor to produce a bitstream that never causes duplicate entries to be added to the dictionary.

Gotchas / bugs

The following bugs are of special relevance when interoperating with the library and its ports:

  • The library (though I believe this is fixed in newer versions) has methods to compress directly to Base64. However, if these are used, it generates invalid Base64 in some cases, because the last group of four Base64 characters has only 1 character, which makes no sense (those 6 bits don't form a complete byte, nor do they contribute to completing a previous byte).

    • This is because the author decided to combine the Base64 generation with the compression itself by having compress() take an arbitrary output word width (which is 6 in the case of Base64), zero-padding only to the end of the last word rather than to a complete group of four characters.

    Base64 decoders will generally either fail, or discard the last 6 bits, removing part of the EOF marker from the bitstream. It can be fixed by appending an A to the base64 (before the padding, if present) to make the Base64 decoder happy.

  • The library has methods to compress to a Uint8Array, but these (for historical reasons) zero-pad to 2 bytes, not 1.

  • The library has methods to compress to "an encoded URI component", which suggests a URL-safe value requiring no percent-encoding. This is not true: rather than using urlsafe Base64 (characters -_) this library uses a custom alphabet (+-) that still uses +, causing a need for percent-encoding in query strings.

Reference implementation

The provided reference implementation separates the wire format from the compression logic itself. You likely want to call the functions under "Main API" at the end. It should be equivalent to the JS library except for the following differences, which mainly concern zero-padding:

  • Unlike the JS implementation, this one verifies that the bitstream contains no junk after the EOF marker (only padding). For the original behavior of accepting any input even if it has trailing junk, remove the assert for padding in decode_packets.

  • Functions only produce legal Base64, which can differ from the Base64 produced by (old versions of) the JS library by one or two extra trailing A characters.

  • Functions that produce a byte buffer pad only to 1 byte.

Despite these differences, AFAIK it should be interoperable with all versions of the JS library (in both directions), though I have yet to verify that using the official test suite.

from typing import Iterator, Iterable

# GENERIC BITSTREAM UTILITIES
# these take/produce the bits in BE (MSB-first) order

class BitReader:
    def __init__(self, bytes: Iterable[int]):
        self.bits = ( (x >> i) & 1 for x in bytes for i in reversed(range(8)) )

    def read_int_le(self, size: int) -> int:
        ''' consume a fixed-size LE (LSB-first) integer '''
        result = 0
        for i in range(size):
            result |= next(self.bits) << i
        return result

class BitWriter:
    def __init__(self):
        self.bytes = bytearray()
        self.pos = 0

    def write_bit(self, bit: int):
        if self.pos == 0:
            self.bytes.append(0)
            self.pos = 8
        self.pos -= 1
        self.bytes[-1] |= bit << self.pos

    def write_int_le(self, x: int, size: int):
        ''' write a fixed-size LE (LSB-first) integer '''
        for i in range(size):
            self.write_bit((x >> i) & 1)
        assert not x >> size
# DECOMPRESSION

def decode_packets(data: bytes):
    ''' decode the packets from a byte buffer '''
    bits = BitReader(data)
    max_opcode = 2
    while (opcode := bits.read_int_le(max_opcode.bit_length())) != 2:
        if opcode < 2:
            yield "literal", bits.read_int_le([8, 16][opcode])
            max_opcode += 2
        else:
            yield "dictionary index", opcode - 3
            max_opcode += 1
    padding = list(bits.bits)
    assert len(padding) < 24 and not any(padding)

def decompress(data: bytes):
    ''' decompress a buffer of bytes into a stream of UTF-16 code units '''
    dictionary = []
    last_entry = None
    for kind, value in decode_packets(data):
        if kind == "literal":
            entry = [value]
            dictionary.append(entry)
        elif value == len(dictionary):
            # special case: index of the entry we're about to assign below
            entry = last_entry + last_entry[:1]
        else:
            entry = dictionary[value]
        yield from entry
        if last_entry:
            dictionary.append(last_entry + entry[:1])
        last_entry = entry
# COMPRESSION

def compress(data: list[int]) -> bytes:
    ''' compress a sequence of UTF-16 code units into a byte buffer '''
    bits = BitWriter()
    max_opcode = 2
    write_opcode = lambda x: bits.write_int_le(x, max_opcode.bit_length())
    for kind, value in compress_to_packets(data):
        if kind == 'literal':
            write_opcode(opcode := 0 if value.bit_length() <= 8 else 1)
            bits.write_int_le(value, [8, 16][opcode])
            max_opcode += 2
        else:
            write_opcode(value + 3)
            max_opcode += 1
    write_opcode(2)  # EOF marker; the rest of the last byte stays zero-padded
    return bytes(bits.bytes)

def compress_to_packets(data: list[int]):
    ''' compress a sequence of UTF-16 code units into packets '''
    dictionary = {}
    last_entry = None
    while data:
        # greedily take the longest prefix of `data` present in the dictionary
        entry = ()
        while len(entry) < len(data) and entry + (data[len(entry)],) in dictionary:
            entry += (data[len(entry)],)
        if entry == last_entry and len(entry) < len(data) and data[len(entry)] == entry[0]:
            # special case: refer to the entry that gets assigned at the end of this iteration
            entry += (data[len(entry)],)
            yield 'dictionary index', len(dictionary)
        elif entry:
            yield 'dictionary index', dictionary[entry]
        else:
            # unseen code unit: emit it as a literal and give it a dictionary entry
            entry += (data[len(entry)],)
            dictionary[entry] = len(dictionary)
            yield 'literal', data[0]
        data = data[len(entry):]
        if last_entry:
            dictionary[last_entry + entry[:1]] = len(dictionary)
        last_entry = entry
# MAIN API

from array import array

def decompress_to_string(data: bytes) -> str:
    code_units = array('H', [0xFEFF])  # BOM, so the decoder picks the machine's byte order
    code_units.extend(decompress(data))
    return str(code_units, 'utf-16')

def compress_from_string(data: str) -> bytes:
    code_units = array('H', data.encode('utf-16'))
    return compress(code_units[1:])  # skip the BOM

from base64 import b64decode, b64encode

def decompress_uricomponent(data: str) -> str:
    ''' emulates decompressFromEncodedURIComponent '''
    assert all(x.isalnum() or x in '+-' for x in data)
    # the original implementation (old versions) contains a bug that causes incomplete
    # Base64 to be emitted, so correct for that or b64decode will fail:
    data += 'A' * ((-len(data)) % 4)
    # it doesn't use the standard urlsafe alphabet, and the custom alphabet
    # still keeps the (URL-sensitive) '+', which defeats the purpose :)
    return decompress_to_string(b64decode(data, '+-'))

def compress_uricomponent(data: str) -> str:
    ''' emulates compressToEncodedURIComponent '''
    b64 = b64encode(compress_from_string(data), b'+-').decode('ascii')
    return b64.rstrip('=')  # remove padding
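A quick round trip through the Main API, for reference:

if __name__ == '__main__':
    original = 'ABBAB'
    blob = compress_from_string(original)
    print(blob.hex())                  # the compressed bytes
    print(list(decode_packets(blob)))  # the packets described above
    assert decompress_to_string(blob) == original
    assert decompress_uricomponent(compress_uricomponent(original)) == original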