Created
July 31, 2024 07:23
-
-
Save KtanPatel/573c9b838ca5f3b27f6e6fc1479cf099 to your computer and use it in GitHub Desktop.
Product list scraping through all the pages (pagination), saving it to a JSON file, and downloading the images: Node.js + Playwright
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// npm i axios playwright playwright-core | |
const playwright = require("playwright"); | |
const fs = require("fs"); | |
const path = require("path"); | |
const axios = require("axios"); | |
/**
 * Scrape product cards from a listing page, follow numeric pagination
 * (/page/N/), and download each card's image into ./download/images.
 *
 * @param {string} url - URL of the first listing page (without /page/N/).
 * @returns {Promise<Array<{title: string, content: string, img: string,
 *   designTitle: string, href: string}>>} One entry per scraped card.
 */
async function scrapeItems(url) {
  const browser = await playwright.chromium.launch({
    headless: process.env.NODE_ENV === "production",
  });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "domcontentloaded" });

    const items = [];

    // Extract every product card on the current page and download its image.
    async function extractItemsFromPage() {
      const itemElements = await page.$$(".work-list-col"); // Replace with actual selector
      for (const itemElement of itemElements) {
        try {
          const title = await itemElement.$eval(
            "h2.color-dark-blue", // Replace with actual selector
            (el) => el.textContent
          );
          const description = await itemElement.$eval("p", (el) => el.textContent);
          const imageSrc = await itemElement.$eval(
            ".col-md-8 img", // Replace with actual selector
            (el) => el.dataset.lazySrc || el.dataset.lazySrcset || el.src
          );
          console.log({ title, description, imageSrc });

          try {
            // Create the images directory if it doesn't exist (no-op otherwise).
            const imagesDir = path.join(__dirname, "download", "images");
            fs.mkdirSync(imagesDir, { recursive: true });
            // Strip query string / hash so the filename is filesystem-safe.
            const filename = path.basename(new URL(imageSrc).pathname);
            const filePath = path.join(imagesDir, filename);
            // Download the image bytes and write them to disk.
            const response = await axios.get(imageSrc, {
              responseType: "arraybuffer",
            });
            fs.writeFileSync(filePath, Buffer.from(response.data));
            console.log(`Image saved: ${filePath}`);
          } catch (error) {
            // Best-effort download: a broken image must not abort the scrape.
            console.error(`Error downloading image: ${error}`);
          }

          items.push({
            title,
            content: description,
            img: imageSrc,
            designTitle: "Website and Mobile App",
            href: "/contact",
          });
        } catch (error) {
          // A card missing one of the selectors is skipped, not fatal.
          console.error(`Error extracting item: ${error}`);
        }
      }
    }

    // Initial page extraction
    await extractItemsFromPage();

    // Find the highest page number among the pagination links, if any.
    // A single-page listing may have no .pagination element at all.
    let lastPage = 1;
    const paginationContainer = await page.$(".pagination"); // Replace with actual selector
    if (paginationContainer) {
      const pageLinks = await paginationContainer.$$("a");
      for (const pageLink of pageLinks) {
        const pageUrl = await pageLink.evaluate((el) => el.href);
        const pageNumberMatch = pageUrl.match(/page\/(\d+)\/$/); // Replace with actual pagination route regex
        if (pageNumberMatch) {
          lastPage = Math.max(lastPage, Number.parseInt(pageNumberMatch[1], 10));
        }
      }
    }
    console.log({ lastPage });

    // Loop through the remaining pages (2..lastPage).
    const baseUrl = url.replace(/\/$/, ""); // avoid "//page/N/" when url has a trailing slash
    for (let currentPage = 2; currentPage <= lastPage; currentPage++) {
      const nextPageUrl = `${baseUrl}/page/${currentPage}/`; // Replace with actual pagination route
      await page.goto(nextPageUrl, { waitUntil: "domcontentloaded" });
      await extractItemsFromPage();
    }

    return items;
  } finally {
    // Always release the browser process, even when scraping throws.
    await browser.close();
  }
}
// Example usage
const website = "https://www.example.com/products";

// Run the scraper and persist the results as JSON under ./download.
(async () => {
  try {
    const items = await scrapeItems(website);
    console.log(items);
    // Save items to a file, database, etc. I'm saving it to a json file
    const downloadDirectory = "./download";
    fs.mkdirSync(downloadDirectory, { recursive: true }); // Create the directory if it doesn't exist
    const jsonData = JSON.stringify(items);
    fs.writeFileSync(path.join(downloadDirectory, "data.json"), jsonData); // save in data.json file
    // downloadImages(items, downloadDirectory);
  } catch (error) {
    console.error(error);
  }
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment