Puppeteer sitemap
This example demonstrates how to use PuppeteerCrawler to crawl a list of web pages specified in a sitemap. For each page, the crawler extracts the URL, the page title, and the full HTML, and stores them as a record in the default dataset. In local configuration, the results are stored as JSON files in ./apify_storage/datasets/default.
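
For illustration, a single stored record might look like the following. This is only a sketch: the URL and title shown here are hypothetical, and the html value is truncated.

{
    "url": "https://apify.com/store",
    "title": "Apify Store",
    "html": "<!DOCTYPE html><html>..."
}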
To run this example on the Apify Platform, select the Node.js 12 + Chrome on Debian (apify/actor-node-chrome) base image on the Source tab when configuring the actor.
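
If you manage the actor's image yourself instead of selecting it on the Source tab, a minimal Dockerfile built on the same base image could look like this. This is a sketch; the file layout and the npm install step are assumptions about your project, and the base image provides the default start command.

FROM apify/actor-node-chrome

# Install production dependencies first to make use of Docker layer caching.
COPY package*.json ./
RUN npm install --only=prod

# Copy the rest of the actor's source code.
COPY . ./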
const Apify = require('apify');

Apify.main(async () => {
    // Create a request list from the sitemap. Every URL found in the
    // sitemap XML becomes one request in the crawling queue.
    const requestList = new Apify.RequestList({
        sources: [{ requestsFromUrl: 'https://apify.com/sitemaps.xml' }],
    });
    await requestList.initialize();

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        // Stop after 10 pages so the example finishes quickly.
        maxRequestsPerCrawl: 10,
        // Called once for every page the crawler opens in Puppeteer.
        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);
            // Store the URL, title and HTML of the page to the default dataset.
            await Apify.pushData({
                url: request.url,
                title: await page.title(),
                html: await page.content(),
            });
        },
    });

    await crawler.run();
    console.log('Done.');
});
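
One way to run the example locally is with the Apify CLI. This assumes the CLI is installed and the code lives in an actor project (for example, one created with apify create); the --purge flag clears previous local storage before the run.

npm install -g apify-cli   # install the Apify CLI, if you don't have it yet
apify run --purge          # run the actor locally with a clean ./apify_storage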