traviskaufman · October 31, 2019 01:50
diff --git a/scrape-timestamps.js b/scrape-timestamps.js
 const fs = require('fs');
 const fetch = require('node-fetch');
 const cheerio = require('cheerio');

 // Script entry point: run the scraper and report any unhandled failure.
 main().catch((err) => {
   console.error(err);
   // Make the failure visible to the calling shell/CI instead of exiting 0.
   process.exitCode = 1;
 });

 /**
  * Builds the jump-scare timestamp dataset.
  *
  * Reads the movie table from `./notebooks/movielist.html`, fetches every
  * linked movie page in sequence, and collects each paragraph that matches
  * `SCARE_RE`. Extracted rows are written to `movietimestamps.json`. Movies
  * whose extracted timestamp count differs from the count listed in the table,
  * or whose page returned a non-2xx HTTP status, are written to `errors.json`
  * so they can be checked manually.
  *
  * @returns {Promise<void>} Resolves once both JSON files have been written.
  *   Rejects if reading the list, a network request, or a file write throws.
  */
 async function main() {
   // Matches paragraphs shaped like "<m:ss | h:mm:ss> ... – <description>".
   // Group 1 is the whole timestamp; group 5 is the text after the en dash.
   const SCARE_RE = /^((\d{1,2}):(\d{1,2})(?::(\d{1,2}))?).*– (.+)$/;
   const $list = cheerio.load(fs.readFileSync('./notebooks/movielist.html', 'utf8'));
   const $movieNames = $list('td.column-1 > a');
   const $directors = $list('td.column-2');
   const $jumpCounts = $list('td.column-4');
   // The director and jump-count cells are looked up by the same index as the
   // title link. `.toArray()` yields a plain array that can be used with for...of.
   const movies = $movieNames.map((i, el) => {
     const $el = $list(el);
     return {
       title: $el.text().trim().replace(/\s+/gm, ' '),
       director: $directors.eq(i).text().trim(),
       link: $el.attr('href'),
       numScares: parseInt($jumpCounts.eq(i).text().trim(), 10),
     };
   }).toArray();

   const csv = [];
   const errors = [];
   for (const movie of movies) {
     if (movie.numScares === 0) {
       console.debug(`Skipping ${movie.title} b/c it has no jump scares`);
       continue;
     }

     console.debug('Fetching jump scare info for', movie.title);
     const response = await fetch(movie.link);
     if (!response.ok) {
       // Do not scrape an HTTP error page as if it were the movie page; record
       // the movie for manual review using the same shape as count mismatches.
       console.warn('NOTE: for movie', movie.title, `request to ${movie.link} failed with HTTP status ${response.status}. Go check this manually`);
       errors.push({
         'Movie Name': movie.title,
         'Director': movie.director,
         'Number of timestamps found': 0,
         'Number of timestamps specified': movie.numScares,
       });
       continue;
     }

     const $page = cheerio.load(await response.text());
     const paragraphs = $page('p')
       .map((_, el) => $page(el).text().trim())
       .toArray();
     // Keep only the paragraphs that matched the timestamp pattern.
     const timestampMatches = paragraphs.map((text) => text.match(SCARE_RE)).filter(Boolean);
     if (timestampMatches.length !== movie.numScares) {
       console.warn('NOTE: for movie', movie.title, `number of timestamps found (${timestampMatches.length}) did not match number of scares specified (${movie.numScares}). Go check this manually`);
       errors.push({
         'Movie Name': movie.title,
         'Director': movie.director,
         'Number of timestamps found': timestampMatches.length,
         'Number of timestamps specified': movie.numScares,
       });
     }
     for (const match of timestampMatches) {
       const ts = formatTimestamp(match[1]);
       csv.push({
         'Movie Name': movie.title,
         'Director': movie.director,
         'Timestamp': ts,
         'Timestamp Seconds': formattedTimestampToSeconds(ts),
         'Description': match[5],
       });
     }
     console.log('Added', timestampMatches.length, 'timestamps from', movie.title);
   }
   fs.writeFileSync('movietimestamps.json', JSON.stringify(csv, null, 2), 'utf-8');
   fs.writeFileSync('errors.json', JSON.stringify(errors, null, 2), 'utf-8');
 }

 /**
  * Normalizes a scraped timestamp to a zero-padded `hh:mm:ss` string.
  *
  * Input is expected to look like 'm:ss', 'mm:ss', or 'h:mm:ss'. When no hour
  * component is present, the hour defaults to '0'.
  *
  * @param {string} ts - Colon-separated timestamp.
  * @returns {string} The timestamp with each field left-padded to two characters.
  */
 function formatTimestamp(ts) {
   const pieces = ts.split(':');
   // More than two pieces means the leading piece is the hour.
   const hasHour = pieces.length > 2;
   const fields = hasHour
     ? [pieces[0], pieces[1], pieces[2]]
     : ['0', pieces[0], pieces[1]];

   const padded = [];
   for (const field of fields) {
     // Single-character fields get one leading zero; longer ones pass through.
     padded.push(field.length < 2 ? '0' + field : field);
   }
   return padded.join(':');
 }

 /**
  * Converts an `hh:mm:ss` timestamp into a total number of seconds.
  *
  * @param {string} ts - Timestamp with three colon-separated numeric fields.
  * @returns {number} hours * 3600 + minutes * 60 + seconds (NaN if a field is
  *   missing or not numeric).
  */
 function formattedTimestampToSeconds(ts) {
   const SECONDS_PER_MINUTE = 60;
   const SECONDS_PER_HOUR = 60 * 60;

   const fields = ts.split(':');
   const hours = parseInt(fields[0], 10);
   const minutes = parseInt(fields[1], 10);
   const seconds = parseInt(fields[2], 10);

   return hours * SECONDS_PER_HOUR + minutes * SECONDS_PER_MINUTE + seconds;
 }
	const fs = require('fs');
	const fetch = require('node-fetch');
	const cheerio = require('cheerio');

	// Script entry point: run the scraper and report any unhandled failure.
	main().catch((err) => {
	  console.error(err);
	  // Make the failure visible to the calling shell/CI instead of exiting 0.
	  process.exitCode = 1;
	});

	/**
	 * Builds the jump-scare timestamp dataset.
	 *
	 * Reads the movie table from `./notebooks/movielist.html`, fetches every
	 * linked movie page in sequence, and collects each paragraph that matches
	 * `SCARE_RE`. Extracted rows are written to `movietimestamps.json`. Movies
	 * whose extracted timestamp count differs from the count listed in the table,
	 * or whose page returned a non-2xx HTTP status, are written to `errors.json`
	 * so they can be checked manually.
	 *
	 * @returns {Promise<void>} Resolves once both JSON files have been written.
	 *   Rejects if reading the list, a network request, or a file write throws.
	 */
	async function main() {
	  // Matches paragraphs shaped like "<m:ss | h:mm:ss> ... – <description>".
	  // Group 1 is the whole timestamp; group 5 is the text after the en dash.
	  const SCARE_RE = /^((\d{1,2}):(\d{1,2})(?::(\d{1,2}))?).*– (.+)$/;
	  const $list = cheerio.load(fs.readFileSync('./notebooks/movielist.html', 'utf8'));
	  const $movieNames = $list('td.column-1 > a');
	  const $directors = $list('td.column-2');
	  const $jumpCounts = $list('td.column-4');
	  // The director and jump-count cells are looked up by the same index as the
	  // title link. `.toArray()` yields a plain array that can be used with for...of.
	  const movies = $movieNames.map((i, el) => {
	    const $el = $list(el);
	    return {
	      title: $el.text().trim().replace(/\s+/gm, ' '),
	      director: $directors.eq(i).text().trim(),
	      link: $el.attr('href'),
	      numScares: parseInt($jumpCounts.eq(i).text().trim(), 10),
	    };
	  }).toArray();

	  const csv = [];
	  const errors = [];
	  for (const movie of movies) {
	    if (movie.numScares === 0) {
	      console.debug(`Skipping ${movie.title} b/c it has no jump scares`);
	      continue;
	    }

	    console.debug('Fetching jump scare info for', movie.title);
	    const response = await fetch(movie.link);
	    if (!response.ok) {
	      // Do not scrape an HTTP error page as if it were the movie page; record
	      // the movie for manual review using the same shape as count mismatches.
	      console.warn('NOTE: for movie', movie.title, `request to ${movie.link} failed with HTTP status ${response.status}. Go check this manually`);
	      errors.push({
	        'Movie Name': movie.title,
	        'Director': movie.director,
	        'Number of timestamps found': 0,
	        'Number of timestamps specified': movie.numScares,
	      });
	      continue;
	    }

	    const $page = cheerio.load(await response.text());
	    const paragraphs = $page('p')
	      .map((_, el) => $page(el).text().trim())
	      .toArray();
	    // Keep only the paragraphs that matched the timestamp pattern.
	    const timestampMatches = paragraphs.map((text) => text.match(SCARE_RE)).filter(Boolean);
	    if (timestampMatches.length !== movie.numScares) {
	      console.warn('NOTE: for movie', movie.title, `number of timestamps found (${timestampMatches.length}) did not match number of scares specified (${movie.numScares}). Go check this manually`);
	      errors.push({
	        'Movie Name': movie.title,
	        'Director': movie.director,
	        'Number of timestamps found': timestampMatches.length,
	        'Number of timestamps specified': movie.numScares,
	      });
	    }
	    for (const match of timestampMatches) {
	      const ts = formatTimestamp(match[1]);
	      csv.push({
	        'Movie Name': movie.title,
	        'Director': movie.director,
	        'Timestamp': ts,
	        'Timestamp Seconds': formattedTimestampToSeconds(ts),
	        'Description': match[5],
	      });
	    }
	    console.log('Added', timestampMatches.length, 'timestamps from', movie.title);
	  }
	  fs.writeFileSync('movietimestamps.json', JSON.stringify(csv, null, 2), 'utf-8');
	  fs.writeFileSync('errors.json', JSON.stringify(errors, null, 2), 'utf-8');
	}

	/**
	 * Normalizes a scraped timestamp to a zero-padded `hh:mm:ss` string.
	 *
	 * Input is expected to look like 'm:ss', 'mm:ss', or 'h:mm:ss'. When no hour
	 * component is present, the hour defaults to '0'.
	 *
	 * @param {string} ts - Colon-separated timestamp.
	 * @returns {string} The timestamp with each field left-padded to two characters.
	 */
	function formatTimestamp(ts) {
	  const pieces = ts.split(':');
	  // More than two pieces means the leading piece is the hour.
	  const hasHour = pieces.length > 2;
	  const fields = hasHour
	    ? [pieces[0], pieces[1], pieces[2]]
	    : ['0', pieces[0], pieces[1]];

	  const padded = [];
	  for (const field of fields) {
	    // Single-character fields get one leading zero; longer ones pass through.
	    padded.push(field.length < 2 ? '0' + field : field);
	  }
	  return padded.join(':');
	}

	/**
	 * Converts an `hh:mm:ss` timestamp into a total number of seconds.
	 *
	 * @param {string} ts - Timestamp with three colon-separated numeric fields.
	 * @returns {number} hours * 3600 + minutes * 60 + seconds (NaN if a field is
	 *   missing or not numeric).
	 */
	function formattedTimestampToSeconds(ts) {
	  const SECONDS_PER_MINUTE = 60;
	  const SECONDS_PER_HOUR = 60 * 60;

	  const fields = ts.split(':');
	  const hours = parseInt(fields[0], 10);
	  const minutes = parseInt(fields[1], 10);
	  const seconds = parseInt(fields[2], 10);

	  return hours * SECONDS_PER_HOUR + minutes * SECONDS_PER_MINUTE + seconds;
	}