Created
July 31, 2024 07:23
-
-
Save KtanPatel/573c9b838ca5f3b27f6e6fc1479cf099 to your computer and use it in GitHub Desktop.
Product list scraping through all the pages (pagination), saving it to a JSON file, and downloading the images: Node.js + Playwright
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// npm i axios playwright playwright-core | |
const playwright = require("playwright"); | |
const fs = require("fs"); | |
const path = require("path"); | |
const axios = require("axios"); | |
/**
 * Scrape product cards from a listing page, follow numeric pagination
 * (/page/N/), and download each card's image into ./download/images.
 *
 * @param {string} url - URL of the first listing page (without /page/N/).
 * @returns {Promise<Array<{title: string, content: string, img: string,
 *   designTitle: string, href: string}>>} One entry per scraped card.
 */
async function scrapeItems(url) {
  const browser = await playwright.chromium.launch({
    headless: process.env.NODE_ENV === "production",
  });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: "domcontentloaded" });

    const items = [];

    // Extract every product card on the current page and download its image.
    async function extractItemsFromPage() {
      const itemElements = await page.$$(".work-list-col"); // Replace with actual selector
      for (const itemElement of itemElements) {
        try {
          const title = await itemElement.$eval(
            "h2.color-dark-blue", // Replace with actual selector
            (el) => el.textContent
          );
          const description = await itemElement.$eval("p", (el) => el.textContent);
          const imageSrc = await itemElement.$eval(
            ".col-md-8 img", // Replace with actual selector
            (el) => el.dataset.lazySrc || el.dataset.lazySrcset || el.src
          );
          console.log({ title, description, imageSrc });

          try {
            // Create the images directory if it doesn't exist (no-op otherwise).
            const imagesDir = path.join(__dirname, "download", "images");
            fs.mkdirSync(imagesDir, { recursive: true });
            // Strip query string / hash so the filename is filesystem-safe.
            const filename = path.basename(new URL(imageSrc).pathname);
            const filePath = path.join(imagesDir, filename);
            // Download the image bytes and write them to disk.
            const response = await axios.get(imageSrc, {
              responseType: "arraybuffer",
            });
            fs.writeFileSync(filePath, Buffer.from(response.data));
            console.log(`Image saved: ${filePath}`);
          } catch (error) {
            // Best-effort download: a broken image must not abort the scrape.
            console.error(`Error downloading image: ${error}`);
          }

          items.push({
            title,
            content: description,
            img: imageSrc,
            designTitle: "Website and Mobile App",
            href: "/contact",
          });
        } catch (error) {
          // A card missing one of the selectors is skipped, not fatal.
          console.error(`Error extracting item: ${error}`);
        }
      }
    }

    // Initial page extraction
    await extractItemsFromPage();

    // Find the highest page number among the pagination links, if any.
    // A single-page listing may have no .pagination element at all.
    let lastPage = 1;
    const paginationContainer = await page.$(".pagination"); // Replace with actual selector
    if (paginationContainer) {
      const pageLinks = await paginationContainer.$$("a");
      for (const pageLink of pageLinks) {
        const pageUrl = await pageLink.evaluate((el) => el.href);
        const pageNumberMatch = pageUrl.match(/page\/(\d+)\/$/); // Replace with actual pagination route regex
        if (pageNumberMatch) {
          lastPage = Math.max(lastPage, Number.parseInt(pageNumberMatch[1], 10));
        }
      }
    }
    console.log({ lastPage });

    // Loop through the remaining pages (2..lastPage).
    const baseUrl = url.replace(/\/$/, ""); // avoid "//page/N/" when url has a trailing slash
    for (let currentPage = 2; currentPage <= lastPage; currentPage++) {
      const nextPageUrl = `${baseUrl}/page/${currentPage}/`; // Replace with actual pagination route
      await page.goto(nextPageUrl, { waitUntil: "domcontentloaded" });
      await extractItemsFromPage();
    }

    return items;
  } finally {
    // Always release the browser process, even when scraping throws.
    await browser.close();
  }
}
// Example usage
const website = "https://www.example.com/products";

// Run the scraper and persist the results as JSON under ./download.
(async () => {
  try {
    const items = await scrapeItems(website);
    console.log(items);
    // Save items to a file, database, etc. I'm saving it to a json file
    const downloadDirectory = "./download";
    fs.mkdirSync(downloadDirectory, { recursive: true }); // Create the directory if it doesn't exist
    const jsonData = JSON.stringify(items);
    fs.writeFileSync(path.join(downloadDirectory, "data.json"), jsonData); // save in data.json file
    // downloadImages(items, downloadDirectory);
  } catch (error) {
    console.error(error);
  }
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment