Skip to content

Instantly share code, notes, and snippets.

@KtanPatel
Created July 31, 2024 07:23
Show Gist options
  • Save KtanPatel/573c9b838ca5f3b27f6e6fc1479cf099 to your computer and use it in GitHub Desktop.
Save KtanPatel/573c9b838ca5f3b27f6e6fc1479cf099 to your computer and use it in GitHub Desktop.
Product list scraping through all the pages (pagination), saving it to a JSON file, and downloading the images: Node.js + Playwright
// npm i axios playwright playwright-core
const playwright = require("playwright");
const fs = require("fs");
const path = require("path");
const axios = require("axios");
/**
 * Scrape all product items (title, description, image URL) from a paginated
 * listing page, downloading each image into ./download/images.
 *
 * Fixes over the original:
 *  - browser is now closed in a finally block (was leaked on any error),
 *  - missing pagination no longer crashes (was an unguarded null deref),
 *  - image filenames are derived from the URL path so query strings
 *    ("photo.jpg?v=2") don't leak into filenames,
 *  - parseInt gets an explicit radix, images dir is created once up front.
 *
 * @param {string} url - Base URL of the first listing page.
 * @returns {Promise<Array<{title: string, content: string, img: string,
 *   designTitle: string, href: string}>>} scraped items
 */
async function scrapeItems(url) {
  const browser = await playwright.chromium.launch({
    headless: process.env.NODE_ENV === "production",
  });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "domcontentloaded" });

    const items = [];

    // Create the images directory once, up front (was re-checked per item).
    const imagesDir = path.join(__dirname, "download", "images");
    fs.mkdirSync(imagesDir, { recursive: true }); // no-op when it already exists

    // Extract items (and download their images) from the currently loaded page.
    async function extractItemsFromPage() {
      const itemElements = await page.$$(".work-list-col"); // Replace with actual selector
      for (const itemElement of itemElements) {
        const title = await itemElement.$eval(
          "h2.color-dark-blue", // Replace with actual selector
          (el) => el.textContent
        );
        const description = await itemElement.$eval("p", (el) => el.textContent);
        // Prefer lazy-load data attributes, fall back to the plain src.
        const imageSrc = await itemElement.$eval(
          ".col-md-8 img", // Replace with actual selector
          (el) => el.dataset.lazySrc || el.dataset.lazySrcset || el.src
        );
        console.log({ title, description, imageSrc });
        try {
          // Use the URL path component so "photo.jpg?v=2" saves as "photo.jpg".
          const filename = path.basename(new URL(imageSrc).pathname);
          const filePath = path.join(imagesDir, filename);
          const response = await axios.get(imageSrc, {
            responseType: "arraybuffer",
          });
          fs.writeFileSync(filePath, Buffer.from(response.data));
          console.log(`Image saved: ${filePath}`);
        } catch (error) {
          // Best-effort download: a failed image must not abort the scrape.
          console.error(`Error downloading image: ${error}`);
        }
        items.push({
          title,
          content: description,
          img: imageSrc,
          designTitle: "Website and Mobile App",
          href: "/contact",
        });
      }
    }

    // First page.
    await extractItemsFromPage();

    // Determine the highest page number from the pagination links.
    // Guard against listings that have no pagination at all.
    let lastPage = 1;
    const paginationContainer = await page.$(".pagination"); // Replace with actual selector
    if (paginationContainer) {
      const pageLinks = await paginationContainer.$$("a");
      for (const pageLink of pageLinks) {
        const pageUrl = await pageLink.evaluate((el) => el.href);
        const pageNumberMatch = pageUrl.match(/page\/(\d+)\/$/); // Replace with actual pagination route regex
        if (pageNumberMatch) {
          lastPage = Math.max(lastPage, Number.parseInt(pageNumberMatch[1], 10));
        }
      }
    }

    // Remaining pages (page 1 was already scraped above).
    for (let currentPage = 2; currentPage <= lastPage; currentPage++) {
      const nextPageUrl = `${url}/page/${currentPage}/`; // Replace with actual pagination route
      await page.goto(nextPageUrl, { waitUntil: "domcontentloaded" });
      await extractItemsFromPage();
    }

    return items;
  } finally {
    // Always release the browser, even when scraping throws mid-run.
    await browser.close();
  }
}
// Example usage: scrape, then persist the results to ./download/data.json.
const website = "https://www.example.com/products";
scrapeItems(website)
  .then((items) => {
    console.log(items);
    // Pretty-print (2-space indent) so the saved JSON is human-readable.
    const jsonData = JSON.stringify(items, null, 2);
    const downloadDirectory = "./download";
    fs.mkdirSync(downloadDirectory, { recursive: true }); // Create the directory if it doesn't exist
    fs.writeFileSync(path.join(downloadDirectory, "data.json"), jsonData); // save in data.json file
  })
  .catch((error) => {
    console.error(error);
    // Surface the failure to callers/CI: the original exited 0 even on error.
    process.exitCode = 1;
  });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment