Using Beautiful Soup
Beautiful Soup is a Python library for pulling data out of HTML and XML files. It provides simple methods and Pythonic idioms for navigating, searching, and modifying the parse tree, letting you quickly extract the data you need.
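To get a quick feel for the API, here is a minimal, self-contained sketch (it only assumes the beautifulsoup4 package is installed) that parses an HTML string and pulls out the page title and link targets:

from bs4 import BeautifulSoup

html = '<html><head><title>Example</title></head><body><a href="https://apify.com">Apify</a></body></html>'
soup = BeautifulSoup(html, 'html.parser')

# Access the <title> element and list the href attributes of all <a> elements
print(soup.title.string)                              # Example
print([a.get('href') for a in soup.find_all('a')])    # ['https://apify.com']

The same find_all and attribute-access patterns are what the full Actor example below relies on.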
Using BeautifulSoup in Actors
To create Actors that use Beautiful Soup, start from the BeautifulSoup & Python Actor template.
This template comes with the BeautifulSoup library preinstalled, so you can start using it right away.
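If you prefer working locally, one way to scaffold such an Actor is with the Apify CLI. The command below is only a sketch (the Actor name is an example, and the CLI will prompt you to pick the BeautifulSoup & Python template from a list):

apify create my-bs4-actor

After the project is generated, the scraping logic lives in src/main.py, the file shown in the example below.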
Example Actor
This is a simple Actor that recursively scrapes titles from all linked websites, up to a maximum depth, starting from URLs in the Actor input.
It uses the requests library to fetch the pages, and BeautifulSoup to parse their content and extract the page title and links to other pages.
src/main.py
from urllib.parse import urljoin

import requests
from apify import Actor
from bs4 import BeautifulSoup


async def main():
    async with Actor:
        # Read the Actor input
        actor_input = await Actor.get_input() or {}
        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
        max_depth = actor_input.get('max_depth', 1)

        if not start_urls:
            Actor.log.info('No start URLs specified in Actor input, exiting...')
            await Actor.exit()

        # Enqueue the starting URLs in the default request queue
        default_queue = await Actor.open_request_queue()
        for start_url in start_urls:
            url = start_url.get('url')
            Actor.log.info(f'Enqueuing {url}...')
            await default_queue.add_request({'url': url, 'userData': {'depth': 0}})

        # Process the requests in the queue one by one
        while request := await default_queue.fetch_next_request():
            url = request['url']
            depth = request['userData']['depth']
            Actor.log.info(f'Scraping {url}...')

            try:
                # Fetch the URL using `requests` and parse it using `BeautifulSoup`
                response = requests.get(url)
                soup = BeautifulSoup(response.content, 'html.parser')

                # If we haven't reached the max depth,
                # look for nested links and enqueue their targets
                if depth < max_depth:
                    for link in soup.find_all('a'):
                        link_href = link.get('href')
                        link_url = urljoin(url, link_href)
                        if link_url.startswith(('http://', 'https://')):
                            Actor.log.info(f'Enqueuing {link_url}...')
                            await default_queue.add_request({
                                'url': link_url,
                                'userData': {'depth': depth + 1},
                            })

                # Push the title of the page into the default dataset
                title = soup.title.string if soup.title else None
                await Actor.push_data({'url': url, 'title': title})
            except Exception:
                Actor.log.exception(f'Cannot extract data from {url}.')
            finally:
                # Mark the request as handled so it's not processed again
                await default_queue.mark_request_as_handled(request)
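When you run the Actor, you can pass it an input like the following sketch, whose fields match what the code above reads (start_urls and max_depth). With max_depth set to 1, the Actor scrapes the start URL and every page it links to:

{
    "start_urls": [{ "url": "https://apify.com" }],
    "max_depth": 1
}

For each page it visits, the Actor pushes a record with the page's url and title into the default dataset.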