/*
 * GitHub Gist by @alexshpilkin (a86a34f23744303678e9db140f152680),
 * created July 22, 2024 17:45.
 *
 * Pack sparse files into Zstandard-compatible byte streams
 */
/* SPDX-License-Identifier: CC0-1.0 */
#define _GNU_SOURCE /* for copy_file_range */
#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>
/*
 * Print a diagnostic to standard error and terminate with exit status r.
 *
 * The variable arguments form a NULL-terminated list of C strings. Each
 * one is written followed by ": "; the strerror() text for the value errno
 * had on entry is written last, followed by a newline. Never returns.
 */
static void die(int r, ...) {
	const int saved_errno = errno; /* captured before any call below can change it */
	const char *text;
	va_list args;

	va_start(args, r);
	for (;;) {
		text = va_arg(args, const char *);
		if (!text)
			break;
		write(STDERR_FILENO, text, strlen(text));
		write(STDERR_FILENO, ": ", 2);
	}
	va_end(args);

	text = strerror(saved_errno);
	write(STDERR_FILENO, text, strlen(text));
	write(STDERR_FILENO, "\n", 1);
	exit(r);
}
/* Appended to the input file's name to form the name of the output file. */
#define SUFFIX ".zst"
/* Program name shown in the usage message and in the malloc failure report;
 * main() updates it from argv[0]. */
static const char *progname = "<unknown>";
int main(int argc, char **argv) {
static struct {
unsigned char magic[4];
unsigned char fdesc;
unsigned char wdesc;
unsigned char size[8];
} header = {
{ 0x28, 0xB5, 0x2F, 0xFD },
};
char *filename = NULL;
int ifd, ofd;
off_t off, len, nxt = 0;
size_t n;
if (argc > 0) {
const char *p = argv[0];
while (*p) if (*p++ == '/') progname = p;
}
if (argc > 1 && (argv[1][0] != '-' || argv[1][1] != 0))
filename = argv[1];
if (argc > 2) {
static const char prefix[7] = "usage: ";
static const char suffix[8] = " [FILE]\n";
write(STDERR_FILENO, prefix, sizeof prefix);
write(STDERR_FILENO, progname, strlen(progname));
write(STDERR_FILENO, suffix, sizeof suffix);
return EX_USAGE;
}
/* Open input and output files */
if (!filename) {
filename = "<stdin>";
ifd = STDIN_FILENO;
ofd = STDOUT_FILENO;
} else {
size_t slash = -1;
int dfd = AT_FDCWD;
char c, *basename;
for (n = 0; filename[n]; n++) if (filename[n] == '/') slash = n;
if (slash != -1) {
c = filename[slash+1]; filename[slash+1] = 0;
#if defined O_SEARCH /* POSIX */
dfd = open(filename, O_SEARCH | O_DIRECTORY);
#elif defined O_PATH /* Linux/Glibc */
dfd = open(filename, O_PATH | O_DIRECTORY);
#else
#error "no *at() functions"
#endif
if (dfd < 0)
die(EX_NOINPUT, filename, "open", NULL);
filename[slash+1] = c;
}
filename += slash + 1; n -= slash + 1;
if ((ifd = openat(dfd, filename, O_RDONLY)) < 0)
die(EX_NOINPUT, filename, "openat", NULL);
basename = malloc(n + sizeof SUFFIX);
if (!basename)
die(EX_OSERR, progname, "malloc", NULL);
memcpy(basename, filename, n);
memcpy(basename + n, SUFFIX, sizeof SUFFIX);
if ((ofd = openat(dfd, basename, O_WRONLY | O_CREAT | O_EXCL, 0666)) < 0)
die(EX_CANTCREAT, basename, "openat", NULL);
}
/* Write Zstandard header */
if ((off = len = lseek(ifd, 0, SEEK_END)) < 0)
die(EX_IOERR, filename, "lseek", NULL);
n = (char *)&header.size[0] - (char *)&header; // FIXME
if (off < 0x100) {
header.fdesc = 1 << 5; /* Single_Segment_Flag */
header.wdesc = off; /* 1-byte length instead of Window_Size */
} else if (len <= 0x10000) {
off -= 0x100; /* 2-byte lengths (only) are biased */
}
if (off >> 8) {
n += 2;
header.fdesc = 0x40; /* Frame_Content_Size_Flag */
header.wdesc = (16 - 10) << 3; /* Window_Size = 2**16 */
header.size[0] = off & 0xFF;
header.size[1] = off >> 8 & 0xFF;
}
if (off >> 16) {
n += 2;
header.fdesc += 0x40;
header.wdesc = (17 - 10) << 3; /* Window_Size = 2**17 */
header.size[2] = off >> 16 & 0xFF;
header.size[3] = off >> 24 & 0xFF;
}
if (off >> 32) {
n += 4;
header.fdesc += 0x40;
header.size[4] = off >> 32 & 0xFF;
header.size[5] = off >> 40 & 0xFF;
header.size[6] = off >> 48 & 0xFF;
header.size[7] = off >> 56 & 0xFF;
}
if (write(ofd, &header, n) != n)
die(EX_IOERR, "write", NULL); // FIXME basename?
/* Write alternating data and holes */
if ((off = lseek(ifd, 0, SEEK_SET)) < 0)
die(EX_IOERR, filename, "lseek", NULL);
if (off == len)
goto writedata; /* write a single zero-length data chunk */
for (;;) {
long chunk;
unsigned long hdata;
unsigned char hbyte[4];
/* Write raw block until start of hole */
nxt = lseek(ifd, 0, SEEK_HOLE);
if (nxt < 0)
die(EX_IOERR, filename, "lseek", NULL);
while (nxt != off) writedata: {
chunk = nxt - off < 1L << 17 ? nxt - off : 1L << 17;
hdata = chunk << 3;
if (off + chunk == len) hdata |= 1;
hbyte[0] = hdata & 0xFF;
hbyte[1] = hdata >> 8 & 0xFF;
hbyte[2] = hdata >> 16 & 0xFF;
if (write(ofd, hbyte, 3) != 3)
die(EX_IOERR, "write", NULL);
if (lseek(ifd, off, SEEK_SET) < 0)
die(EX_IOERR, filename, "lseek", NULL);
if (copy_file_range(ifd, NULL, ofd, NULL, chunk, 0) != chunk)
die(EX_IOERR, "copy_file_range", NULL);
off += chunk;
}
if (off == len) break;
/* Write RLE block until end of hole */
nxt = lseek(ifd, 0, SEEK_DATA);
if (nxt < 0) {
if (errno != ENXIO)
die(EX_IOERR, filename, "lseek", NULL);
nxt = len;
}
while (nxt != off) {
chunk = nxt - off < 1L << 17 ? nxt - off : 1L << 17;
hdata = chunk << 3 | 2;
if (off + chunk == len) hdata |= 1;
hbyte[0] = hdata & 0xFF;
hbyte[1] = hdata >> 8 & 0xFF;
hbyte[2] = hdata >> 16 & 0xFF;
hbyte[3] = 0; /* RLE payload */
if (write(ofd, hbyte, 4) != 4)
die(EX_IOERR, "write", NULL);
off += chunk;
}
if (off == len) break;
}
return 0;
}
/* (GitHub page footer: sign in on GitHub to comment on this gist.) */