
feat: add respect_robots_txt_file option #1162


Merged (21 commits) on Apr 24, 2025
27 changes: 27 additions & 0 deletions docs/examples/code_examples/respect_robots_txt_file.py
@@ -0,0 +1,27 @@
import asyncio

from crawlee.crawlers import (
    BeautifulSoupCrawler,
    BeautifulSoupCrawlingContext,
)


async def main() -> None:
    # Initialize the crawler with robots.txt compliance enabled
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # Start the crawler with the specified URLs
    # The crawler will check the robots.txt file before making requests
    # In this example, 'https://news.ycombinator.com/login' will be skipped
    # because it's disallowed in the site's robots.txt file
    await crawler.run(
        ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']
    )


if __name__ == '__main__':
    asyncio.run(main())
21 changes: 21 additions & 0 deletions docs/examples/respect_robots_txt_file.mdx
@@ -0,0 +1,21 @@
---
id: respect-robots-txt-file
title: Respect robots.txt file
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py';

This example demonstrates how to configure your crawler to respect the rules that websites establish for crawlers in their [robots.txt](https://www.robotstxt.org/robotstxt.html) file.

To configure `Crawlee` to follow the `robots.txt` file, set the parameter `respect_robots_txt_file=True` in <ApiLink to="class/BasicCrawlerOptions">`BasicCrawlerOptions`</ApiLink>. In this case, `Crawlee` will skip any URLs forbidden in the website's robots.txt file.

As an example, let's look at the website `https://news.ycombinator.com/` and its corresponding [robots.txt](https://news.ycombinator.com/robots.txt) file. Since the file has a rule `Disallow: /login`, the URL `https://news.ycombinator.com/login` will be automatically skipped.

The code below demonstrates this behavior using the <ApiLink to="class/BeautifulSoupCrawler">`BeautifulSoupCrawler`</ApiLink>:

<RunnableCodeBlock className="language-python" language="python">
{RespectRobotsTxt}
</RunnableCodeBlock>
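
The `respect_robots_txt_file` option lives in the shared `BasicCrawlerOptions`, so the other crawler classes accept it as well. Below is a minimal, illustrative sketch using `PlaywrightCrawler`; it assumes Playwright and its browsers are installed locally, and the behavior should mirror the `BeautifulSoupCrawler` example above:

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    # The keyword is accepted by any crawler built on top of BasicCrawler.
    crawler = PlaywrightCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # The disallowed /login URL should be skipped, as in the BeautifulSoup example above.
    await crawler.run(
        ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']
    )


if __name__ == '__main__':
    asyncio.run(main())
```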
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -40,6 +40,7 @@ dependencies = [
"eval-type-backport>=0.2.0",
"httpx[brotli,http2,zstd]>=0.27.0",
"more-itertools>=10.2.0",
"protego>=0.4.0",
Collaborator

It's fun to see another scrapy project here, but I guess that it guarantees some stability, so... all good.

Collaborator Author

Yes, I was planning to use RobotFileParser, but it doesn't support Google's specification. 😞
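
For reference, a small illustrative sketch of the Protego calls that the `RobotsTxtFile` helper added in this PR relies on (`Protego.parse`, `can_fetch`, `crawl_delay`, `sitemaps`); the wildcard rule is the kind of Google-spec pattern that `RobotFileParser` does not handle. The results noted in the comments are expected values, not verified output:

```python
from protego import Protego

ROBOTS_TXT = """
User-agent: *
Disallow: /login
Disallow: /*.pdf$
Crawl-delay: 2
Sitemap: https://example.com/sitemap.xml
"""

rp = Protego.parse(ROBOTS_TXT)

# Same call order as in RobotsTxtFile.is_allowed(): URL first, then user agent.
print(rp.can_fetch('https://example.com/login', '*'))       # expected: False (Disallow: /login)
print(rp.can_fetch('https://example.com/report.pdf', '*'))  # expected: False (wildcard rule)
print(rp.can_fetch('https://example.com/jobs', '*'))        # expected: True

print(rp.crawl_delay('*'))  # expected: 2 (numeric; the wrapper casts it to int)
print(list(rp.sitemaps))    # expected: ['https://example.com/sitemap.xml']
```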

"psutil>=6.0.0",
"pydantic-settings>=2.2.0,<2.7.0",
"pydantic>=2.8.0,!=2.10.0,!=2.10.1,!=2.10.2",
@@ -236,7 +237,9 @@ module = [
"functions_framework", # Example code shows deploy on Google Cloud.
"jaro", # Untyped and stubs not available
"loguru", # Example code shows integration of loguru and crawlee for JSON logging.
"protego", # Untyped and stubs not available
"sklearn.linear_model", # Untyped and stubs not available
"sortedcollections", # Untyped and stubs not available
"cookiecutter.*", # Untyped and stubs not available
"inquirer.*", # Untyped and stubs not available
]
85 changes: 85 additions & 0 deletions src/crawlee/_utils/robots.py
@@ -0,0 +1,85 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from protego import Protego
from yarl import URL

from crawlee._utils.web import is_status_code_client_error

if TYPE_CHECKING:
    from typing_extensions import Self

    from crawlee.http_clients import HttpClient
    from crawlee.proxy_configuration import ProxyInfo


class RobotsTxtFile:
    def __init__(self, url: str, robots: Protego) -> None:
        self._robots = robots
        self._original_url = URL(url).origin()

    @classmethod
    async def from_content(cls, url: str, content: str) -> Self:
        """Create a `RobotsTxtFile` instance from the given content.

        Args:
            url: The URL associated with the robots.txt file.
            content: The raw string content of the robots.txt file to be parsed.
        """
        robots = Protego.parse(content)
        return cls(url, robots)

    @classmethod
    async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
        """Determine the location of a robots.txt file for a URL and fetch it.

        Args:
            url: The URL whose domain will be used to find the corresponding robots.txt file.
            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
        robots_url = URL(url).with_path('/robots.txt')
        return await cls.load(str(robots_url), http_client, proxy_info)

    @classmethod
    async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self:
        """Load the robots.txt file for a given URL.

        Args:
            url: The direct URL of the robots.txt file to be loaded.
            http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file.
            proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
        """
        response = await http_client.send_request(url, proxy_info=proxy_info)
        body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read()

        robots = Protego.parse(body.decode('utf-8'))

        return cls(url, robots)

    def is_allowed(self, url: str, user_agent: str = '*') -> bool:
        """Check if the given URL is allowed for the given user agent.

        Args:
            url: The URL to check against the robots.txt rules.
            user_agent: The user-agent string to check permissions for. Defaults to '*' which matches any user-agent.
        """
        check_url = URL(url)
        if check_url.origin() != self._original_url:
            return True
        return bool(self._robots.can_fetch(str(check_url), user_agent))

    def get_sitemaps(self) -> list[str]:
        """Get the list of sitemap URLs from the robots.txt file."""
        return list(self._robots.sitemaps)

    def get_crawl_delay(self, user_agent: str = '*') -> int | None:
        """Get the crawl delay for the given user agent.

        Args:
            user_agent: The user-agent string to check the crawl delay for. Defaults to '*' which matches any
                user-agent.
        """
        crawl_delay = self._robots.crawl_delay(user_agent)
        return int(crawl_delay) if crawl_delay is not None else None
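
For orientation, a hedged usage sketch that exercises only the methods defined above; the robots.txt body and URLs are invented for illustration, and `crawlee._utils.robots` is an internal module, so the import path may change:

```python
import asyncio

from crawlee._utils.robots import RobotsTxtFile

ROBOTS_TXT = """
User-agent: *
Disallow: /admin
Crawl-delay: 5
Sitemap: https://example.com/sitemap.xml
"""


async def main() -> None:
    # Build the wrapper directly from raw content, so no network access is needed.
    robots = await RobotsTxtFile.from_content('https://example.com/robots.txt', ROBOTS_TXT)

    print(robots.is_allowed('https://example.com/products'))     # expected: True
    print(robots.is_allowed('https://example.com/admin'))        # expected: False
    # URLs on a different origin are always reported as allowed.
    print(robots.is_allowed('https://other.example.org/admin'))  # expected: True

    print(robots.get_crawl_delay())  # expected: 5
    print(robots.get_sitemaps())     # expected: ['https://example.com/sitemap.xml']


if __name__ == '__main__':
    asyncio.run(main())
```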
7 changes: 7 additions & 0 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -159,12 +159,19 @@ async def extract_links(
            requests = list[Request]()
            base_user_data = user_data or {}

            robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

            for link in self._parser.find_links(parsed_content, selector=selector):
                url = link
                if not is_url_absolute(url):
                    base_url = context.request.loaded_url or context.request.url
                    url = convert_to_absolute_url(base_url, url)

                if robots_txt_file and not robots_txt_file.is_allowed(url):
                    # TODO: https://github.com/apify/crawlee-python/issues/1160
                    # add processing with on_skipped_request hook
                    continue

                request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)

                if transform_request_function:
82 changes: 81 additions & 1 deletion src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -17,8 +17,10 @@
from urllib.parse import ParseResult, urlparse
from weakref import WeakKeyDictionary

from cachetools import LRUCache
from tldextract import TLDExtract
from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never
from yarl import URL

from crawlee import EnqueueStrategy, Glob, service_locator
from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus
@@ -32,6 +34,7 @@
    SendRequestFunction,
)
from crawlee._utils.docs import docs_group
from crawlee._utils.robots import RobotsTxtFile
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee._utils.wait import wait_for
from crawlee._utils.web import is_status_code_client_error, is_status_code_server_error
@@ -158,6 +161,10 @@ class _BasicCrawlerOptions(TypedDict):
"""A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by
subclasses rather than direct instantiation of `BasicCrawler`."""

respect_robots_txt_file: NotRequired[bool]
"""If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain,
and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`."""


class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict):
"""Generic options the `BasicCrawler` constructor."""
@@ -238,6 +245,7 @@ def __init__(
        keep_alive: bool = False,
        configure_logging: bool = True,
        statistics_log_format: Literal['table', 'inline'] = 'table',
        respect_robots_txt_file: bool = False,
        _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
        _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
        _logger: logging.Logger | None = None,
@@ -280,6 +288,9 @@ def __init__(
            configure_logging: If True, the crawler will set up logging infrastructure automatically.
            statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline',
                outputs statistics as plain text log messages.
            respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file
                for each domain, and skip URLs that are not allowed. This also prevents disallowed URLs from being
                added via `EnqueueLinksFunction`.
            _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
            _additional_context_managers: Additional context managers used throughout the crawler lifecycle.
@@ -335,6 +346,7 @@ def __init__(
        self._max_requests_per_crawl = max_requests_per_crawl
        self._max_session_rotations = max_session_rotations
        self._max_crawl_depth = max_crawl_depth
        self._respect_robots_txt_file = respect_robots_txt_file

        # Timeouts
        self._request_handler_timeout = request_handler_timeout
@@ -371,6 +383,8 @@ def __init__(
        self._additional_context_managers = _additional_context_managers or []

        # Internal, not explicitly configurable components
        self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000)
        self._robots_txt_lock = asyncio.Lock()
        self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
        self._snapshotter = Snapshotter.from_config(config)
        self._autoscaled_pool = AutoscaledPool(
@@ -645,10 +659,25 @@ async def add_requests(
            wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning.
            wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added.
        """
        allowed_requests = []
        skipped = []

        for request in requests:
            check_url = request.url if isinstance(request, Request) else request
            if await self._is_allowed_based_on_robots_txt_file(check_url):
                allowed_requests.append(request)
            else:
                skipped.append(request)

        if skipped:
            # TODO: https://github.com/apify/crawlee-python/issues/1160
            # add processing with on_skipped_request hook
            self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file')

        request_manager = await self.get_request_manager()

        await request_manager.add_requests_batched(
            requests=requests,
            requests=allowed_requests,
            batch_size=batch_size,
            wait_time_between_batches=wait_time_between_batches,
            wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added,
@@ -1080,6 +1109,22 @@ async def __run_task_function(self) -> None:
        if request is None:
            return

        if not (await self._is_allowed_based_on_robots_txt_file(request.url)):
            self._logger.warning(
                f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt'
            )
            await wait_for(
                lambda: request_manager.mark_request_as_handled(request),
                timeout=self._internal_timeout,
                timeout_message='Marking request as handled timed out after '
                f'{self._internal_timeout.total_seconds()} seconds',
                logger=self._logger,
                max_retries=3,
            )
            # TODO: https://github.com/apify/crawlee-python/issues/1160
            # add processing with on_skipped_request hook
            return

        if request.session_id:
            session = await self._get_session_by_id(request.session_id)
        else:
@@ -1263,3 +1308,38 @@ def _check_request_collision(self, request: Request, session: Session | None) ->
            raise RequestCollisionError(
                f'The Session (id: {request.session_id}) bound to the Request is no longer available in SessionPool'
            )

    async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool:
        """Check if the URL is allowed based on the robots.txt file.

        Args:
            url: The URL to verify against robots.txt rules.

        Returns:
            True if crawling the URL is permitted, False otherwise.
        """
        if not self._respect_robots_txt_file:
            return True
        robots_txt_file = await self._get_robots_txt_file_for_url(url)
        return not robots_txt_file or robots_txt_file.is_allowed(url)

    async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None:
        """Get the RobotsTxtFile for a given URL.

        Args:
            url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file.
        """
        if not self._respect_robots_txt_file:
            return None
        origin_url = str(URL(url).origin())
        robots_txt_file = self._robots_txt_file_cache.get(origin_url)
        if robots_txt_file:
            return robots_txt_file

        async with self._robots_txt_lock:
            # Check again if the robots.txt file is already cached after acquiring the lock
            robots_txt_file = self._robots_txt_file_cache.get(origin_url)
            if robots_txt_file:
                return robots_txt_file

            # If not cached, fetch the robots.txt file
            robots_txt_file = await RobotsTxtFile.find(url, self._http_client)
            self._robots_txt_file_cache[origin_url] = robots_txt_file
            return robots_txt_file
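
The per-origin cache above uses a double-checked lookup around an `asyncio.Lock`, so concurrent requests to the same origin trigger only a single robots.txt fetch. Below is a distilled, generic sketch of that pattern; all names are illustrative and not part of the crawler API:

```python
from __future__ import annotations

import asyncio

from cachetools import LRUCache

_cache: LRUCache[str, str] = LRUCache(maxsize=1000)
_lock = asyncio.Lock()


async def fetch_robots_body(origin: str) -> str:
    # Stand-in for the real network fetch performed by RobotsTxtFile.find().
    await asyncio.sleep(0.1)
    return f'robots.txt for {origin}'


async def get_cached(origin: str) -> str:
    # Fast path: no lock needed when the value is already cached.
    if (cached := _cache.get(origin)) is not None:
        return cached

    async with _lock:
        # Re-check after acquiring the lock: another task may have
        # populated the cache while this one was waiting.
        if (cached := _cache.get(origin)) is not None:
            return cached

        result = await fetch_robots_body(origin)
        _cache[origin] = result
        return result


async def main() -> None:
    # Ten concurrent callers, but fetch_robots_body() runs only once for this origin.
    bodies = await asyncio.gather(*(get_cached('https://example.com') for _ in range(10)))
    print(len(set(bodies)))  # expected: 1


if __name__ == '__main__':
    asyncio.run(main())
```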
7 changes: 7 additions & 0 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -290,6 +290,8 @@ async def extract_links(

            elements = await context.page.query_selector_all(selector)

            robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)

            for element in elements:
                url = await element.get_attribute('href')

@@ -300,6 +302,11 @@
                    base_url = context.request.loaded_url or context.request.url
                    url = convert_to_absolute_url(base_url, url)

                if robots_txt_file and not robots_txt_file.is_allowed(url):
                    # TODO: https://github.com/apify/crawlee-python/issues/1160
                    # add processing with on_skipped_request hook
                    continue

                request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})

                if transform_request_function:
@@ -8,7 +8,7 @@
from logging import getLogger
from typing import TYPE_CHECKING

from sortedcollections import ValueSortedDict # type: ignore[import-untyped]
from sortedcollections import ValueSortedDict
from typing_extensions import override

from crawlee._types import StorageTypes