Created
October 31, 2019 01:50
-
-
Save traviskaufman/d17c1e9b901b9e91da46185de596c219 to your computer and use it in GitHub Desktop.
Timestamp scraping script used for r/dataisbeautiful October 2019 Challenge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs'); | |
const fetch = require('node-fetch'); | |
const cheerio = require('cheerio'); | |
// Entry point: run the scraper. On failure, log the error and set a non-zero
// exit code so that shells and CI jobs can tell the run did not succeed.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
/**
 * Scrapes jump-scare timestamps for each movie listed in a local HTML table.
 *
 * Reads `./notebooks/movielist.html`, taking the anchors in `td.column-1` as
 * the movie title/link, `td.column-2` as the director and `td.column-4` as the
 * listed jump-scare count. For every movie whose count is not zero, the linked
 * page is fetched and each `<p>` whose text matches the timestamp pattern
 * becomes one output row.
 *
 * Writes `movietimestamps.json` (one row per matched timestamp) and
 * `errors.json` (movies whose matched count differs from the listed count).
 *
 * @returns {Promise<void>} Resolves once both JSON files have been written.
 *   Rejects if reading the list, fetching a page, or writing a file fails.
 */
async function main() {
  // Group 1 is the whole timestamp ('m:ss', 'mm:ss' or 'h:mm:ss');
  // group 5 is the text following the '– ' separator.
  const SCARE_RE = /^((\d{1,2}):(\d{1,2})(?::(\d{1,2}))?).*– (.+)$/;

  const $ = cheerio.load(fs.readFileSync('./notebooks/movielist.html', 'utf8'));
  const $movieNames = $('td.column-1 > a');
  const $directors = $('td.column-2');
  const $jumpCounts = $('td.column-4');

  // Cells are paired by position: the i-th title anchor goes with the i-th
  // director cell and the i-th jump-count cell.
  const movies = $movieNames
    .map(function (i) {
      const $el = $(this);
      return {
        title: $el.text().trim().replace(/\s+/gm, ' '),
        director: $directors.eq(i).text().trim(),
        link: $el.attr('href'),
        numScares: parseInt($jumpCounts.eq(i).text().trim(), 10),
      };
    })
    .toArray();

  const rows = [];
  const errors = [];

  // Pages are fetched one at a time, in list order.
  for (const movie of movies) {
    if (movie.numScares === 0) {
      console.debug(`Skipping ${movie.title} b/c it has no jump scares`);
      continue;
    }

    console.debug('Fetching jump scare info for', movie.title);
    const response = await fetch(movie.link);
    const html = await response.text();

    // Use a distinct name so the movie-list handle `$` is not shadowed.
    const $page = cheerio.load(html);
    const paragraphTexts = $page('p')
      .map(function () {
        return $page(this).text().trim();
      })
      .toArray();
    const timestampMatches = paragraphTexts
      .map((text) => text.match(SCARE_RE))
      .filter(Boolean);

    // A mismatch is recorded for manual follow-up; the rows that were found
    // are still emitted below.
    if (timestampMatches.length !== movie.numScares) {
      console.warn('NOTE: for movie', movie.title, `number of timestamps found (${timestampMatches.length}) did not match number of scares specified (${movie.numScares}). Go check this manually`);
      errors.push({
        'Movie Name': movie.title,
        'Director': movie.director,
        'Number of timestamps found': timestampMatches.length,
        'Number of timestamps specified': movie.numScares,
      });
    }

    for (const match of timestampMatches) {
      const ts = formatTimestamp(match[1]);
      rows.push({
        'Movie Name': movie.title,
        'Director': movie.director,
        'Timestamp': ts,
        'Timestamp Seconds': formattedTimestampToSeconds(ts),
        'Description': match[5],
      });
    }
    console.log('Added', timestampMatches.length, 'timestamps from', movie.title);
  }

  fs.writeFileSync('movietimestamps.json', JSON.stringify(rows, null, 2), 'utf-8');
  fs.writeFileSync('errors.json', JSON.stringify(errors, null, 2), 'utf-8');
}
/**
 * Normalizes a scraped timestamp to zero-padded 'hh:mm:ss'.
 *
 * Input is expected as 'm:ss', 'mm:ss' or 'h:mm:ss'. With two fields the hour
 * is taken to be 0; with three or more, the first three fields are used as
 * hour, minute and second.
 *
 * @param {string} ts - Colon-separated timestamp.
 * @returns {string} The timestamp as 'hh:mm:ss', each field left-padded with
 *   '0' to at least two characters.
 * @throws {TypeError} If `ts` contains no ':' separator.
 */
function formatTimestamp(ts) {
  const parts = ts.split(':');
  if (parts.length < 2) {
    throw new TypeError(`Expected a timestamp like 'm:ss' or 'h:mm:ss', got '${ts}'`);
  }
  // Two fields means minutes and seconds only, so supply a zero hour.
  const [hour, min, sec] = parts.length > 2 ? parts : ['0', ...parts];
  return [hour, min, sec].map((field) => field.padStart(2, '0')).join(':');
}
/**
 * Converts a colon-separated timestamp into a total number of seconds.
 *
 * The usual input is 'hh:mm:ss'. If fewer than three fields are supplied, the
 * missing leading fields are treated as zero (so 'mm:ss' also works) instead
 * of producing NaN. Only the first three fields are read.
 *
 * @param {string} ts - Timestamp such as '01:02:03'.
 * @returns {number} hours * 3600 + minutes * 60 + seconds; NaN if a field is
 *   not numeric.
 */
function formattedTimestampToSeconds(ts) {
  const fields = ts.split(':').slice(0, 3);
  // Right-align the fields so the last one is always the seconds.
  while (fields.length < 3) {
    fields.unshift('0');
  }
  const [hr, min, sec] = fields.map((field) => Number.parseInt(field, 10));
  return hr * 60 * 60 + min * 60 + sec;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment