@FennNaten
Forked from sararob/image-queue.js
Last active July 10, 2017 01:09
Updating https://gist.github.com/sararob/275b252d1eda3a5baa27d6464d2f2198 to add an example of reducing memory usage by using file streams
'use strict';
/**
 * This is a fork of https://gist.github.com/sararob/275b252d1eda3a5baa27d6464d2f2198
 * with some code added to show how reading from and writing to the big source and destination files
 * can be done with streams to reduce memory pressure.
 * The added code is there as an example to illustrate the principles involved and is not tested, so it may contain bugs.
 */
// Dependencies
const gcloud = require('google-cloud')({
  projectId: 'sara-bigquery',
  keyFilename: 'keyfile.json'
});
const vision = gcloud.vision();
const fs = require('fs');
const async = require('async');
//to avoid having to write our own stream that emits data line by line, let's use https://www.npmjs.com/package/line-by-line
const LineByLineReader = require('line-by-line');
//this is the file writer into which we'll dump data as soon as we get it, so we don't keep it in memory
//see https://nodejs.org/api/stream.html#stream_writable_streams
const targetStream = fs.createWriteStream('./responses.json');
//since the final format is a JSON array of objects and we'll write objects one by one, start by writing the character that opens the array
targetStream.write('[');
//objects in the array must be separated by commas, so keep track of whether we've already written one
let isFirstRecord = true;
const features = [
  { "type": "FACE_DETECTION" },
  { "type": "LABEL_DETECTION" },
  { "type": "LANDMARK_DETECTION" },
  { "type": "WEB_DETECTION" },
  { "type": "IMAGE_PROPERTIES" }
];
// Call the Vision API with our GCS image URL
function callVision(task, callback) {
  console.log(`processing ${task.object_id}`);
  let visionReq = {
    "image": {
      "source": {
        "imageUri": `gs://gcs-storage-bucket/${task.id}.jpg`
      }
    },
    "features": features
  };
  vision.annotate(visionReq, function(err, annotations, resp) {
    if (err) {
      return callback(new Error(err));
    }
    let imgMetadata = annotations[0];
    if (!imgMetadata['imagePropertiesAnnotation']) {
      return callback(new Error('no data'));
    }
    imgMetadata['object_id'] = task.name;
    //send the stringified object to the target file immediately, no need to keep it in memory
    //(objects after the first one are prefixed with a comma so the resulting file is a valid JSON array)
    //here I use a global targetStream; if I were to make this a library, the target stream would probably be injected into callVision instead (see the sketch right after this function)
    targetStream.write((isFirstRecord ? '' : ',') + JSON.stringify(imgMetadata));
    isFirstRecord = false;
    return callback();
  });
}
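//Illustrative sketch (untested; makeAnnotationWorker is a hypothetical helper, nothing above uses it):
//the comment above mentions injecting the target stream. One way to do that is a factory that closes
//over the injected stream and returns a worker with the (task, callback) signature async.queue expects:
function makeAnnotationWorker(visionClient, outStream) {
  let first = true;
  return function (task, callback) {
    let req = {
      "image": { "source": { "imageUri": `gs://gcs-storage-bucket/${task.id}.jpg` } },
      "features": features
    };
    visionClient.annotate(req, function (err, annotations) {
      if (err) {
        return callback(new Error(err));
      }
      //write straight to the injected stream, comma-separated so the result stays a valid JSON array
      outStream.write((first ? '' : ',') + JSON.stringify(annotations[0]));
      first = false;
      return callback();
    });
  };
}
//usage would then look like: async.queue(makeAnnotationWorker(vision, someWriteStream), 20)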
// The queue will send the image IDs to callVision()
let q = async.queue(callVision, 20);
function done() {
  q.drain = null;
  //when all data is processed, let's end the write stream by closing the JSON array
  //depending on the way everything is implemented, there could be a bug here if the queue drains faster than we fill it
  //(i.e. drain fires while lines are still being read and pushed). To avoid this, we could add some bookkeeping:
  //a counter incremented on each line read, a boolean set once reading ends, and a counter incremented on each write;
  //we would then only end the stream once drain has fired, reading is finished, and the number of written items
  //equals the number of read lines (see the sketch at the end of this file)
  targetStream.end(']');
}
// Will only be executed when the queue is drained
q.drain = done;
//now let's create our file stream to read data from the file line by line
const imageIdsStream = new LineByLineReader('./image-ids.json');
//launch the read process. Each time a line is read, it is parsed and fed to the queue
//here each line is an imageID
imageIdsStream.on('line', function (line) {
  q.push(JSON.parse(line), function (err) {
    if (err) {
      console.log(err);
    }
  });
});
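/*
 Untested sketch of the bookkeeping mentioned in done() above (all names here are illustrative and
 nothing below is wired into the code above): the idea is to close the output stream only once every
 line has been read AND every queued task has finished writing its result.

   let linesRead = 0;     // incremented in the 'line' handler, just before q.push
   let itemsWritten = 0;  // incremented in callVision, right after targetStream.write
   let allRead = false;   // set to true when imageIdsStream emits 'end'
   function maybeEndStream() {
     if (allRead && itemsWritten === linesRead) {
       targetStream.end(']');
     }
   }
   //maybeEndStream() would then be called both from q.drain and from imageIdsStream.on('end', ...),
   //so the stream is closed no matter which of the two events fires last
*/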