Skip to content

Instantly share code, notes, and snippets.

@asiellb
Forked from bennadel/data.ndjson
Created July 10, 2022 18:44
Show Gist options
  • Save asiellb/62d6b88be18dc4a8a6ad56d7978bdff8 to your computer and use it in GitHub Desktop.
Save asiellb/62d6b88be18dc4a8a6ad56d7978bdff8 to your computer and use it in GitHub Desktop.
Parsing And Serializing Large Datasets Using Newline-Delimited JSON In Node.js
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
// Require the core node module (fs) and the third-party modules (chalk, ndjson).
var chalk = require( "chalk" );
var fileSystem = require( "fs" );
var ndjson = require( "ndjson" );
// ----------------------------------------------------------------------------------- //
// ----------------------------------------------------------------------------------- //
// Imagine that we are performing some sort of data migration and we have to move data
// from one database to flat files; then transport those flat files elsewhere; then,
// import those flat files into a different database.
// Sample record-set standing in for the rows pulled from the source database. Each
// entry is a plain object with a numeric "id" and a String "name".
var records = [
	{ id: 1, name: "O Brother, Where Art Thou?" },
	{ id: 2, name: "Home for the Holidays" },
	{ id: 3, name: "The Firm" },
	{ id: 4, name: "Broadcast News" },
	{ id: 5, name: "Raising Arizona" }
	// .... hundreds of thousands of records ....
];
// Traditionally, we might store ONE JSON document PER FILE. However, this has some
// serious implications once we move out of the local development environment and into
// production. As the JSON documents grow in size, we run the risk of running out of
// memory (during the serialization and parsing process). To get around this, we can
// use a slightly different storage format in which our data file is not ONE JSON
// document PER FILE, but rather ONE JSON document PER LINE. This is known as "ndjson"
// or "Newline-Delimited JSON". To use this format, we're going to create an ndjson
// Transform stream (aka "through" stream) that takes each JavaScript object and
// writes it as a newline-delimited String to the output stream (which will be a
// file-output stream in our case).
// --
// NOTE: We're using .ndjson - NOT .json - for this storage format.
// Create the ndjson TRANSFORM stream that serializes each JavaScript object written
// to it as one newline-delimited JSON String.
var transformStream = ndjson.stringify();
// Pipe the ndjson serialized output to the file-system.
var outputStream = transformStream.pipe( fileSystem.createWriteStream( __dirname + "/data.ndjson" ) );
// Write EACH record to the TRANSFORM stream individually. Each one of these records
// will become a line in the output file.
// --
// NOTE: We have to respect back-pressure here. When .write() returns false, the
// stream's internal buffer is full; if we kept writing anyway, the entire record-set
// would end up buffered in memory at once - which is exactly the problem this format
// is meant to avoid. Instead, we pause and pick up where we left off once the stream
// emits "drain".
(function writeRecordsFrom( index ) {
	while ( index < records.length ) {
		if ( ! transformStream.write( records[ index++ ] ) ) {
			transformStream.once(
				"drain",
				function handleDrain() {
					writeRecordsFrom( index );
				}
			);
			return;
		}
	}
	// Once we've written each record in the record-set, we have to end the stream so
	// that the TRANSFORM stream knows to flush and close the file output stream.
	transformStream.end();
})( 0 );
// Announce completion once the file output stream reports ("finish") that all of the
// serialized data has been flushed to it.
outputStream.on( "finish", logSerializationComplete );

// I log the "serialization complete" banner followed by a divider line.
function logSerializationComplete() {
	var banner = chalk.green( "ndjson serialization complete!" );
	console.log( banner );
	console.log( "- - - - - - - - - - - - - - - - - - - - - - -" );
}
// ----------------------------------------------------------------------------------- //
// ----------------------------------------------------------------------------------- //
// Stream actions are event-driven (and asynchronous), so reading the file back in has
// to wait until the output stream has emitted its "finish" event.
outputStream.on( "finish", parseDataFile );

// I read the ndjson file back in and log each record as it is parsed.
function parseDataFile() {
	// As the file content streams in, ndjson buffers it and splits it on the newline
	// character; each newline-delimited value is parsed as a JSON object and emitted
	// from the TRANSFORM stream.
	var fileStream = fileSystem.createReadStream( __dirname + "/data.ndjson" );
	var parser = fileStream.pipe( ndjson.parse() );

	parser.on( "data", logRecord );
	parser.on( "end", logParsingComplete );

	// I log one item from the original record-set (one call per "data" event).
	function logRecord( data ) {
		console.log( chalk.red( "Record (event):" ), data );
	}

	// I log the divider line and the "parsing complete" banner once ndjson has parsed
	// all of the input.
	function logParsingComplete() {
		console.log( "- - - - - - - - - - - - - - - - - - - - - - -" );
		console.log( chalk.green( "ndjson parsing complete!" ) );
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment