
feat: add respect_robots_txt_file option #1162


Merged (21 commits) on Apr 24, 2025
27 changes: 27 additions & 0 deletions docs/examples/code_examples/respect_robots_txt_file.py
@@ -0,0 +1,27 @@
import asyncio

from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    # Initialize the crawler with robots.txt compliance enabled
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # Start the crawler with the specified URLs
    # The crawler will check the robots.txt file before making requests
    # In this example, 'https://news.ycombinator.com/login' will be skipped
    # because it's disallowed in the site's robots.txt file
    await crawler.run(
        ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']
    )


if __name__ == '__main__':
    asyncio.run(main())
21 changes: 21 additions & 0 deletions docs/examples/respect_robots_txt_file.mdx
@@ -0,0 +1,21 @@
---
id: respect-robots-txt-file
title: Respect robots.txt file
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py';

This example demonstrates how to configure your crawler to respect the rules that websites establish for crawlers in their [robots.txt](https://www.robotstxt.org/robotstxt.html) file.

To configure `Crawlee` to follow the `robots.txt` file, set the parameter `respect_robots_txt_file=True` in <ApiLink to="class/BasicCrawlerOptions">`BasicCrawlerOptions`</ApiLink>. In this case, `Crawlee` will skip any URLs forbidden in the website's robots.txt file.

As an example, let's look at the website `https://news.ycombinator.com/` and its corresponding [robots.txt](https://news.ycombinator.com/robots.txt) file. Since the file has a rule `Disallow: /login`, the URL `https://news.ycombinator.com/login` will be automatically skipped.

The code below demonstrates this behavior using the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>:

<RunnableCodeBlock className="language-python" language="python">
{RespectRobotsTxt}
</RunnableCodeBlock>
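
The `respect_robots_txt_file` option lives in the shared `BasicCrawlerOptions`, so the other crawler classes accept it as well. Below is a minimal, illustrative sketch using `PlaywrightCrawler`; it assumes Playwright and its browsers are installed locally, and the behavior should mirror the `BeautifulSoupCrawler` example above:

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # The keyword is accepted by any crawler built on top of BasicCrawler.
    crawler = PlaywrightCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # The disallowed /login URL should be skipped, as in the BeautifulSoup example above.
    await crawler.run(
        ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']
    )


if __name__ == '__main__':
    asyncio.run(main())
```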
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -40,6 +40,7 @@ dependencies = [
"eval-type-backport>=0.2.0",
"httpx[brotli,http2,zstd]>=0.27.0",
"more-itertools>=10.2.0",
"protego>=0.4.0",
Collaborator

It's fun to see another scrapy project here, but I guess that it guarantees some stability, so... all good.

Collaborator Author

Yes, I was planning to use RobotFileParser, but it doesn't support Google's specification. 😞
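
For reference, a small illustrative sketch of the Protego calls that the `RobotsTxtFile` helper added in this PR relies on (`Protego.parse`, `can_fetch`, `crawl_delay`, `sitemaps`); the wildcard rule is the kind of Google-spec pattern that `RobotFileParser` does not handle. The results noted in the comments are expected values, not verified output:

```python
from protego import Protego

ROBOTS_TXT = """
User-agent: *
Disallow: /login
Disallow: /*.pdf$
Crawl-delay: 2
Sitemap: https://example.com/sitemap.xml
"""

rp = Protego.parse(ROBOTS_TXT)

# Same call order as in RobotsTxtFile.is_allowed(): URL first, then user agent.
print(rp.can_fetch('https://example.com/login', '*'))       # expected: False (Disallow: /login)
print(rp.can_fetch('https://example.com/report.pdf', '*'))  # expected: False (wildcard rule)
print(rp.can_fetch('https://example.com/jobs', '*'))        # expected: True

print(rp.crawl_delay('*'))  # expected: 2 (numeric; the wrapper casts it to int)
print(list(rp.sitemaps))    # expected: ['https://example.com/sitemap.xml']
```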

"psutil>=6.0.0",
"pydantic-settings>=2.2.0,<2.7.0",
"pydantic>=2.8.0,!=2.10.0,!=2.10.1,!=2.10.2",
@@ -236,7 +237,9 @@ module = [
"functions_framework", # Example code shows deploy on Google Cloud.
"jaro", # Untyped and stubs not available
"loguru", # Example code shows integration of loguru and crawlee for JSON logging.
"protego", # Untyped and stubs not available
"sklearn.linear_model", # Untyped and stubs not available
"sortedcollections", # Untyped and stubs not available
"cookiecutter.*", # Untyped and stubs not available
"inquirer.*", # Untyped and stubs not available
]
85 changes: 85 additions & 0 deletions src/crawlee/_utils/robots.py
@@ -0,0 +1,85 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from protego import Protego
from yarl import URL

from crawlee._utils.web import is_status_code_client_error

if TYPE_CHECKING:
    from typing_extensions import Self

    from crawlee.http_clients import HttpClient
    from crawlee.proxy_configuration import ProxyInfo


class RobotsTxtFile:
    def __init__(self, url: str, robots: Protego) -> None:
        self._robots = robots
        self._original_url = URL(url).origin()

    @classmethod
    async def from_content(cls, url: str, content: str) -> Self:
        """Create a `RobotsTxtFile` instance from the given content.

        Args:
            url: The URL associated with the robots.txt file.
            content: The raw string content of the robots.txt file to be parsed.
        """
        robots = Protego.parse(content)
        return cls(url, robots)

    @classmethod
    async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
        """Determine the location of a robots.txt file for a URL and fetch it.

        Args:
            url: The URL whose domain will be used to find the corresponding robots.txt file.
            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
        robots_url = URL(url).with_path('/robots.txt')
        return await cls.load(str(robots_url), http_client, proxy_info)

    @classmethod
    async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
        """Load the robots.txt file for a given URL.

        Args:
            url: The direct URL of the robots.txt file to be loaded.
            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
        response = await http_client.send_request(url, proxy_info=proxy_info)
        body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read()

        robots = Protego.parse(body.decode('utf-8'))

        return cls(url, robots)

    def is_allowed(self, url: str, user_agent: str = '*') -> bool:
        """Check if the given URL is allowed for the given user agent.

        Args:
            url: The URL to check against the robots.txt rules.
            user_agent: The user-agent string to check permissions for. Defaults to '*' which matches any user-agent.
        """
        check_url = URL(url)
        if check_url.origin() != self._original_url:
            return True
        return bool(self._robots.can_fetch(str(check_url), user_agent))

    def get_sitemaps(self) -> list[str]:
        """Get the list of sitemap URLs from the robots.txt file."""
        return list(self._robots.sitemaps)

    def get_crawl_delay(self, user_agent: str = '*') -> int | None:
        """Get the crawl delay for the given user agent.

        Args:
            user_agent: The user-agent string to check the crawl delay for. Defaults to '*' which matches any
                user-agent.
        """
        crawl_delay = self._robots.crawl_delay(user_agent)
        return int(crawl_delay) if crawl_delay is not None else None
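
For orientation, a hedged usage sketch that exercises only the methods defined above; the robots.txt body and URLs are invented for illustration, and `crawlee._utils.robots` is an internal module, so the import path may change:

```python
import asyncio

from crawlee._utils.robots import RobotsTxtFile

ROBOTS_TXT = """
User-agent: *
Disallow: /admin
Crawl-delay: 5
Sitemap: https://example.com/sitemap.xml
"""


async def main() -> None:
    # Build the wrapper directly from raw content, so no network access is needed.
    robots = await RobotsTxtFile.from_content('https://example.com/robots.txt', ROBOTS_TXT)

    print(robots.is_allowed('https://example.com/products'))     # expected: True
    print(robots.is_allowed('https://example.com/admin'))        # expected: False
    # URLs on a different origin are always reported as allowed.
    print(robots.is_allowed('https://other.example.org/admin'))  # expected: True

    print(robots.get_crawl_delay())  # expected: 5
    print(robots.get_sitemaps())     # expected: ['https://example.com/sitemap.xml']


if __name__ == '__main__':
    asyncio.run(main())
```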
7 changes: 7 additions & 0 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -159,12 +159,19 @@ async def extract_links(
            requests = list[Request]()
            base_user_data = user_data or {}

            robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

            for link in self._parser.find_links(parsed_content, selector=selector):
                url = link
                if not is_url_absolute(url):
                    base_url = context.request.loaded_url or context.request.url
                    url = convert_to_absolute_url(base_url, url)

                if robots_txt_file and not robots_txt_file.is_allowed(url):
                    # TODO: https://github.com/apify/crawlee-python/issues/1160
                    # add processing with on_skipped_request hook
                    continue

                request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)

                if transform_request_function:
82 changes: 81 additions & 1 deletion src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -17,8 +17,10 @@
from urllib.parse import ParseResult, urlparse
from weakref import WeakKeyDictionary

from cachetools import LRUCache
from tldextract import TLDExtract
from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never
from yarl import URL

from crawlee import EnqueueStrategy, Glob, service_locator
from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
@@ -32,6 +34,7 @@
    SendRequestFunction,
)
from crawlee._utils.docs import docs_group
from crawlee._utils.robots import RobotsTxtFile
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee._utils.wait import wait_for
from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error
@@ -158,6 +161,10 @@ class _BasicCrawlerOptions(TypedDict):
"""A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by
subclasses rather than direct instantiation of `BasicCrawler`."""

respect_robots_txt_file: NotRequired[bool]
"""If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,
and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`."""


class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):
"""Generic options the `BasicCrawler` constructor."""
@@ -238,6 +245,7 @@ def __init__(
        keep_alive: bool = False,
        configure_logging: bool = True,
        statistics_log_format: Literal['table', 'inline'] = 'table',
        respect_robots_txt_file: bool = False,
        _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
        _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
        _logger: logging.Logger | None = None,
@@ -280,6 +288,9 @@ def __init__(
            configure_logging: If True, the crawler will set up logging infrastructure automatically.
            statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline',
                outputs statistics as plain text log messages.
            respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file
                for each domain, and skip URLs that are not allowed. This also prevents disallowed URLs from being
                added via `EnqueueLinksFunction`.
            _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
            _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
@@ -335,6 +346,7 @@ def __init__(
        self._max_requests_per_crawl = max_requests_per_crawl
        self._max_session_rotations = max_session_rotations
        self._max_crawl_depth = max_crawl_depth
        self._respect_robots_txt_file = respect_robots_txt_file

        # Timeouts
        self._request_handler_timeout = request_handler_timeout
@@ -371,6 +383,8 @@ def __init__(
        self._additional_context_managers = _additional_context_managers or []

        # Internal, not explicitly configurable components
        self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000)
        self._robots_txt_lock = asyncio.Lock()
        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
        self._snapshotter = Snapshotter.from_config(config)
        self._autoscaled_pool = AutoscaledPool(
@@ -645,10 +659,25 @@ async def add_requests(
            wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning.
            wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added.
        """
        allowed_requests = []
        skipped = []

        for request in requests:
            check_url = request.url if isinstance(request, Request) else request
            if await self._is_allowed_based_on_robots_txt_file(check_url):
                allowed_requests.append(request)
            else:
                skipped.append(request)

        if skipped:
            # TODO: https://github.com/apify/crawlee-python/issues/1160
            # add processing with on_skipped_request hook
            self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file')

        request_manager = await self.get_request_manager()

        await request_manager.add_requests_batched(
            requests=requests,
            requests=allowed_requests,
            batch_size=batch_size,
            wait_time_between_batches=wait_time_between_batches,
            wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added,
@@ -1080,6 +1109,22 @@ async def __run_task_function(self) -> None:
        if request is None:
            return

        if not (await self._is_allowed_based_on_robots_txt_file(request.url)):
            self._logger.warning(
                f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt'
            )
            await wait_for(
                lambda: request_manager.mark_request_as_handled(request),
                timeout=self._internal_timeout,
                timeout_message='Marking request as handled timed out after '
                f'{self._internal_timeout.total_seconds()} seconds',
                logger=self._logger,
                max_retries=3,
            )
            # TODO: https://github.com/apify/crawlee-python/issues/1160
            # add processing with on_skipped_request hook
            return

        if request.session_id:
            session = await self._get_session_by_id(request.session_id)
        else:
@@ -1263,3 +1308,38 @@ def _check_request_collision(self, request: Request, session: Session | None) ->
            raise RequestCollisionError(
                f'The Session (id: {request.session_id}) bound to the Request is no longer available in SessionPool'
            )

    async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool:
        """Check if the URL is allowed based on the robots.txt file.

        Args:
            url: The URL to verify against robots.txt rules.

        Returns:
            True if crawling the URL is permitted, False otherwise.
        """
        if not self._respect_robots_txt_file:
            return True
        robots_txt_file = await self._get_robots_txt_file_for_url(url)
        return not robots_txt_file or robots_txt_file.is_allowed(url)

    async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None:
        """Get the RobotsTxtFile for a given URL.

        Args:
            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
        """
        if not self._respect_robots_txt_file:
            return None
        origin_url = str(URL(url).origin())
        robots_txt_file = self._robots_txt_file_cache.get(origin_url)
        if robots_txt_file:
            return robots_txt_file

        async with self._robots_txt_lock:
            # Check again if the robots.txt file is already cached after acquiring the lock
            robots_txt_file = self._robots_txt_file_cache.get(origin_url)
            if robots_txt_file:
                return robots_txt_file

            # If not cached, fetch the robots.txt file
            robots_txt_file = await RobotsTxtFile.find(url, self._http_client)
            self._robots_txt_file_cache[origin_url] = robots_txt_file
            return robots_txt_file
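
The per-origin cache above uses a double-checked lookup around an `asyncio.Lock`, so concurrent requests to the same origin trigger only a single robots.txt fetch. Below is a distilled, generic sketch of that pattern; all names are illustrative and not part of the crawler API:

```python
from __future__ import annotations

import asyncio

from cachetools import LRUCache

_cache: LRUCache[str, str] = LRUCache(maxsize=1000)
_lock = asyncio.Lock()


async def fetch_robots_body(origin: str) -> str:
    # Stand-in for the real network fetch performed by RobotsTxtFile.find().
    await asyncio.sleep(0.1)
    return f'robots.txt for {origin}'


async def get_cached(origin: str) -> str:
    # Fast path: no lock needed when the value is already cached.
    if (cached := _cache.get(origin)) is not None:
        return cached

    async with _lock:
        # Re-check after acquiring the lock: another task may have
        # populated the cache while this one was waiting.
        if (cached := _cache.get(origin)) is not None:
            return cached

        result = await fetch_robots_body(origin)
        _cache[origin] = result
        return result


async def main() -> None:
    # Ten concurrent callers, but fetch_robots_body() runs only once for this origin.
    bodies = await asyncio.gather(*(get_cached('https://example.com') for _ in range(10)))
    print(len(set(bodies)))  # expected: 1


if __name__ == '__main__':
    asyncio.run(main())
```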
7 changes: 7 additions & 0 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -290,6 +290,8 @@ async def extract_links(

            elements = await context.page.query_selector_all(selector)

            robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

            for element in elements:
                url = await element.get_attribute('href')

@@ -300,6 +302,11 @@
                    base_url = context.request.loaded_url or context.request.url
                    url = convert_to_absolute_url(base_url, url)

                if robots_txt_file and not robots_txt_file.is_allowed(url):
                    # TODO: https://github.com/apify/crawlee-python/issues/1160
                    # add processing with on_skipped_request hook
                    continue

                request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})

                if transform_request_function:
@@ -8,7 +8,7 @@
from logging import getLogger
from typing import TYPE_CHECKING

from sortedcollections import ValueSortedDict # type: ignore[import-untyped]
from sortedcollections import ValueSortedDict
from typing_extensions import override

from crawlee._types import StorageTypes