
@sueszli
Last active January 12, 2024 14:22
kijiji.com scraper
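A small Node.js (ESM) script that walks through paginated kijiji search results, collects every listing link along the way, and then opens them all in your default browser. Assuming the file is saved as scraper.mjs (filename hypothetical, not from the gist), run it with the search url as the only argument: node scraper.mjs '<kijiji search url>'.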
import axios from 'axios'
import * as cheerio from 'cheerio'
import { strict as assert } from 'node:assert' // console's assert only logs, it never throws
import open from 'open'
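// deps (assuming npm): npm install axios cheerio open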
const main = async () => {
    // expect exactly one argument: the kijiji search url to start from
    assert(process.argv.length === 3, 'illegal number of arguments')
    let url = process.argv[2]
    assert(url, 'missing url as argument')
    const links = []
    while (true) {
        const htmlStr = await axios.get(url).then((r) => r.data)
        const $ = cheerio.load(htmlStr)

        // collect listing links from the current page
        const newLinks = []
        $('a[data-testid="listing-link"]').each((_, a) => {
            const href = $(a).attr('href')
            if (href) {
                newLinks.push(href)
            }
        })
        console.log(`found ${newLinks.length} links`)
        links.push(...newLinks)
        // follow the pagination "next" button; stop when there is none
        const nextButton = $('li[data-testid="pagination-next-link"] a')
        if (!nextButton.length) {
            console.log('reached last page')
            break
        }
        const nextButtonHref = nextButton.attr('href')
        if (!nextButtonHref) {
            console.error('next button has no href')
            process.exit(1)
        }
        // hrefs are relative, so resolve them against the current page url
        url = new URL(nextButtonHref, url).href
    }
    console.log(`press enter to open the ${links.length} scraped links in your default browser`)
    await new Promise((resolve) => process.stdin.once('data', resolve))

    // open each scraped link in the default browser
    for (const l of links) {
        const lurl = new URL(l, url).href
        console.log(`opening: ${lurl}`)
        await open(lurl)
    }
    process.exit(0)
}

main()
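The pagination loop above fires requests back-to-back. If kijiji throttles you, a minimal sketch of a polite delay that could be awaited after each page fetch (the sleep helper and the 1000 ms interval are illustrative assumptions, not part of the original gist):

// minimal sketch: polite delay between page fetches (helper name and
// interval are illustrative, not part of the original script)
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms))

// inside the while loop, right after fetching a page:
//     await sleep(1000)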
// a second file in the gist (presumably .puppeteerrc.cjs): Puppeteer config
// that pins the browser download cache to the project directory.
// it is not used by the scraper above.
const { join } = require("path");

module.exports = {
    cacheDirectory: join(__dirname, ".cache", "puppeteer"),
};