Skip to main content
Version: 1.3

Crawl a sitemap

This example downloads and crawls the URLs from a sitemap.


Using CheerioCrawler:

const Apify = require('apify');

Apify.main(async () => {
    // Build the RequestList from the URLs found in the remote sitemap
    const requestList = await Apify.openRequestList('start-urls', [
        { requestsFromUrl: 'https://apify.com/sitemap.xml' },
    ]);

    // Set up a Cheerio-based crawler over that list
    const crawler = new Apify.CheerioCrawler({
        requestList,
        // Called once for each URL; here it only logs the URL
        handlePageFunction: async ({ request }) => {
            const { url } = request;
            console.log(url);
        },
        // Demo cap of 10 requests; remove it to crawl the whole sitemap
        maxRequestsPerCrawl: 10,
    });

    // Start crawling and wait until it finishes
    await crawler.run();
});


Using PuppeteerCrawler:

To run this example on the Apify Platform, select the apify/actor-node-puppeteer-chrome image for your Dockerfile.

const Apify = require('apify');

Apify.main(async () => {
    // Build the RequestList from the URLs found in the remote sitemap
    const requestList = await Apify.openRequestList('start-urls', [
        { requestsFromUrl: 'https://apify.com/sitemap.xml' },
    ]);

    // Set up a Puppeteer-based crawler over that list
    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        // Called once for each URL; here it only logs the URL
        handlePageFunction: async ({ request }) => {
            const { url } = request;
            console.log(url);
        },
        // Demo cap of 10 requests; remove it to crawl the whole sitemap
        maxRequestsPerCrawl: 10,
    });

    // Start crawling and wait until it finishes
    await crawler.run();
});


Using PlaywrightCrawler:

To run this example on the Apify Platform, select the apify/actor-node-playwright-chrome image for your Dockerfile.

const Apify = require('apify');

Apify.main(async () => {
    // Build the RequestList from the URLs found in the remote sitemap
    const requestList = await Apify.openRequestList('start-urls', [
        { requestsFromUrl: 'https://apify.com/sitemap.xml' },
    ]);

    // Set up a Playwright-based crawler over that list
    const crawler = new Apify.PlaywrightCrawler({
        requestList,
        // Called once for each URL; here it only logs the URL
        handlePageFunction: async ({ request }) => {
            const { url } = request;
            console.log(url);
        },
        // Demo cap of 10 requests; remove it to crawl the whole sitemap
        maxRequestsPerCrawl: 10,
    });

    // Start crawling and wait until it finishes
    await crawler.run();
});