Last active
November 26, 2023 11:08
-
-
Save Toparvion/8c79ef0553caf0f3108a4bdbad6bb6d0 to your computer and use it in GitHub Desktop.
A simple benchmark for comparing compression ratios of various compressing algorithms applied to a natural text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package pro.toparvion.stegotext.compress; | |
import org.apache.commons.compress.compressors.CompressorException; | |
import org.apache.commons.compress.compressors.CompressorStreamFactory; | |
import org.apache.commons.compress.utils.IOUtils; | |
import org.junit.jupiter.api.AfterAll; | |
import org.junit.jupiter.api.BeforeAll; | |
import org.junit.jupiter.api.DisplayName; | |
import org.junit.jupiter.api.TestInstance; | |
import org.junit.jupiter.params.ParameterizedTest; | |
import org.junit.jupiter.params.provider.ValueSource; | |
import org.slf4j.Logger; | |
import org.slf4j.LoggerFactory; | |
import java.io.*; | |
import java.nio.file.Files; | |
import java.nio.file.Path; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.List; | |
import static java.nio.file.StandardOpenOption.*; | |
import static org.junit.jupiter.api.Assertions.assertArrayEquals; | |
import static org.junit.jupiter.api.TestInstance.Lifecycle.PER_CLASS; | |
/** | |
* A simple benchmark for comparing compression ratios of various compressing algorithms applied to a natural text | |
* | |
* @author Toparvion | |
*/ | |
@TestInstance(PER_CLASS) | |
public class CompressTest { | |
private static final Logger log = LoggerFactory.getLogger(CompressTest.class); | |
private static final List<Integer> INPUT_SIZES = | |
List.of(20, 50, 100, 200, 300, 500, 1000, 2000, 5000, 10_000, 50_000, 100_000); | |
private static final CompressorStreamFactory COMPRESSOR_FACTORY = new CompressorStreamFactory(); | |
private static final Path SOURCE_TEXT_PATH = Path.of("sandbox/doyle-return-388.txt"); | |
// private static final Path SOURCE_TEXT_PATH = Path.of("sandbox/pepko.txt"); | |
private final List<List<String>> csvOut = new ArrayList<>(); | |
private byte[] textSampleBytes; | |
@BeforeAll | |
void beforeAll() throws IOException { | |
List<String> header = new ArrayList<>(INPUT_SIZES.size() + 1); | |
header.add("Algo"); | |
INPUT_SIZES.stream() | |
.map(String::valueOf) | |
.forEach(header::add); | |
csvOut.add(header); | |
textSampleBytes = Files.readAllBytes(SOURCE_TEXT_PATH); | |
} | |
@AfterAll | |
void afterAll() throws IOException { | |
List<String> csvLines = csvOut.stream() | |
.map(line -> String.join(",", line)) | |
.toList(); | |
Path csvFilePath = Path.of("compress.csv"); | |
Files.write(csvFilePath, csvLines, CREATE, WRITE, TRUNCATE_EXISTING); | |
log.info("Written {} lines to '{}'", csvLines.size(), csvFilePath); | |
} | |
@ParameterizedTest(name = "Algorithm: {0}") | |
@ValueSource(strings = { | |
// CompressorStreamFactory.BROTLI, // read-only | |
CompressorStreamFactory.BZIP2, | |
CompressorStreamFactory.DEFLATE, | |
CompressorStreamFactory.GZIP, | |
CompressorStreamFactory.LZMA, | |
CompressorStreamFactory.LZ4_BLOCK, | |
CompressorStreamFactory.LZ4_FRAMED, | |
CompressorStreamFactory.SNAPPY_FRAMED, | |
// CompressorStreamFactory.SNAPPY_RAW, // Compressor: snappy-raw not found. | |
CompressorStreamFactory.XZ, | |
// CompressorStreamFactory.Z, // read-only | |
CompressorStreamFactory.ZSTANDARD | |
}) | |
@DisplayName("Compression test suite for Apache Commons Compress") | |
void testCompression(String algo) throws CompressorException, IOException { | |
List<String> csvLine = new ArrayList<>(INPUT_SIZES.size() + 1); | |
csvLine.add(algo); | |
for (int inputSize : INPUT_SIZES) { | |
// given | |
byte[] sourceBytes = Arrays.copyOfRange(textSampleBytes, 31_810, (31_810 + inputSize)); | |
// when | |
var compressedSourceBytes = compress(sourceBytes, algo); | |
var decompressedSourceBytes = decompress(compressedSourceBytes, algo); | |
//then | |
int sourceLength = sourceBytes.length; | |
int resultLength = compressedSourceBytes.length; | |
double delta = ((sourceLength - resultLength) / (double) sourceLength) * 100.00; | |
log.info("Algo: {}, source size: {}, compressed size: {}, delta: {}", algo, sourceLength, resultLength, delta); | |
assertArrayEquals(sourceBytes, decompressedSourceBytes); | |
csvLine.add(String.valueOf(resultLength)); | |
} | |
csvOut.add(csvLine); | |
} | |
private byte[] compress(byte[] sourceBytes, String algo) throws CompressorException, IOException { | |
var inStream = new ByteArrayInputStream(sourceBytes); | |
var outStream = new ByteArrayOutputStream(); | |
var bufOutStream = new BufferedOutputStream(outStream); | |
try (var compressStream = COMPRESSOR_FACTORY.createCompressorOutputStream(algo, bufOutStream)) { | |
IOUtils.copy(inStream, compressStream); | |
} | |
return outStream.toByteArray(); | |
} | |
private byte[] decompress(byte[] compressed, String algo) throws CompressorException, IOException { | |
var inStream = new BufferedInputStream(new ByteArrayInputStream(compressed)); | |
var outStream = new ByteArrayOutputStream(); | |
try (var compressStream = COMPRESSOR_FACTORY.createCompressorInputStream(algo, inStream)) { | |
IOUtils.copy(compressStream, outStream); | |
} | |
return outStream.toByteArray(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Dependencies
In Gradle Kotlin DSL format:
Input
`sandbox` directory (see the `SOURCE_TEXT_PATH` constant)
Output
`compress.csv` file with all the gathered data (the lengths of the result byte arrays)
Sample output
The CSV output may look like
It can be converted to compression ratios the way the `delta` variable is computed:
`delta = (sourceLength - resultLength) / sourceLength * 100`
This allows the results to be presented in a more readable way, e.g. as a table:
or as a chart: