From 427b00a4adc05b2170269128c12c9612b5553c2f Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 15:43:43 +0000 Subject: [PATCH 01/20] basic_robots_allow --- src/crawlee/_utils/robots.py | 59 +++++++++++++++++++ src/crawlee/crawlers/_basic/_basic_crawler.py | 29 +++++++++ 2 files changed, 88 insertions(+) create mode 100644 src/crawlee/_utils/robots.py diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py new file mode 100644 index 0000000000..d04ae83a0c --- /dev/null +++ b/src/crawlee/_utils/robots.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from asyncio import to_thread +from typing import TYPE_CHECKING +from urllib.robotparser import RobotFileParser + +from yarl import URL + +from crawlee._utils.web import is_status_code_client_error + +if TYPE_CHECKING: + from crawlee.http_clients import HttpClient + from crawlee.proxy_configuration import ProxyInfo + + +class RobotsTxtFile: + def __init__(self, robots: RobotFileParser) -> None: + self._robots = robots + + @staticmethod + async def from_content(url: str, content: str) -> RobotsTxtFile: + robots = RobotFileParser(url=url) + robots.parse(content.splitlines()) + return RobotsTxtFile(robots) + + @staticmethod + async def find( + url: str, proxy_info: ProxyInfo | None = None, http_client: HttpClient | None = None + ) -> RobotsTxtFile: + """Find the robots.txt file for a given URL.""" + robots_url = URL(url).with_path('/robots.txt') + return await RobotsTxtFile.load(str(robots_url), proxy_info, http_client) + + @staticmethod + async def load( + url: str, proxy_info: ProxyInfo | None = None, http_client: HttpClient | None = None + ) -> RobotsTxtFile: + """Load the robots.txt file for a given URL.""" + robots = RobotFileParser(url=url) + if http_client is None: + await to_thread(robots.read) + else: + response = await http_client.send_request(url, proxy_info=proxy_info) + if is_status_code_client_error(response.status_code): + robots.allow_all = True # type: ignore[attr-defined] # allow_all is a valid RobotFileParser + return RobotsTxtFile(robots) + + def is_allowed(self, url: str, user_agent: str = '*') -> bool: + """Check if the given URL is allowed for the given user agent.""" + return self._robots.can_fetch(user_agent, url) + + def get_sitemaps(self) -> list[str]: + """Get the list of sitemaps from the robots.txt file.""" + return self._robots.site_maps() or [] + + def get_crawl_delay(self, user_agent: str = '*') -> int | None: + """Get the crawl delay for the given user agent.""" + crawl_delay = self._robots.crawl_delay(user_agent) + return int(crawl_delay) if crawl_delay is not None else None diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index a196f5e251..295c959f8a 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -17,8 +17,10 @@ from urllib.parse import ParseResult, urlparse from weakref import WeakKeyDictionary +from cachetools import LRUCache from tldextract import TLDExtract from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never +from yarl import URL from crawlee import EnqueueStrategy, Glob, service_locator from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus @@ -32,6 +34,7 @@ SendRequestFunction, ) from crawlee._utils.docs import docs_group +from crawlee._utils.robots import RobotsTxtFile from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute from crawlee._utils.wait import wait_for from 
crawlee._utils.web import is_status_code_client_error, is_status_code_server_error @@ -158,6 +161,9 @@ class _BasicCrawlerOptions(TypedDict): """A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.""" + respect_robots_txt_file: NotRequired[bool] + """""" + class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict): """Generic options the `BasicCrawler` constructor.""" @@ -238,6 +244,7 @@ def __init__( keep_alive: bool = False, configure_logging: bool = True, statistics_log_format: Literal['table', 'inline'] = 'table', + respect_robots_txt_file: bool = False, _context_pipeline: ContextPipeline[TCrawlingContext] | None = None, _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None, _logger: logging.Logger | None = None, @@ -280,6 +287,7 @@ def __init__( configure_logging: If True, the crawler will set up logging infrastructure automatically. statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain text log messages. + respect_robots_txt_file: If True, the crawler will respect the robots.txt file of the target website. _context_pipeline: Enables extending the request lifecycle and modifying the crawling context. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`. _additional_context_managers: Additional context managers used throughout the crawler lifecycle. @@ -335,6 +343,7 @@ def __init__( self._max_requests_per_crawl = max_requests_per_crawl self._max_session_rotations = max_session_rotations self._max_crawl_depth = max_crawl_depth + self._respect_robots_txt_file = respect_robots_txt_file # Timeouts self._request_handler_timeout = request_handler_timeout @@ -371,6 +380,7 @@ def __init__( self._additional_context_managers = _additional_context_managers or [] # Internal, not explicitly configurable components + self.robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000) self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name) self._snapshotter = Snapshotter.from_config(config) self._autoscaled_pool = AutoscaledPool( @@ -1265,3 +1275,22 @@ def _check_request_collision(self, request: Request, session: Session | None) -> raise RequestCollisionError( f'The Session (id: {request.session_id}) bound to the Request is no longer available in SessionPool' ) + + async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool: + """Check if the URL is allowed based on the robots.txt file.""" + if not self._respect_robots_txt_file: + return True + robots_txt_file = await self._get_robots_txt_file_for_url(url) + return not robots_txt_file or robots_txt_file.is_allowed(url) + + async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: + """Get the RobotsTxtFile for a given URL.""" + if not self._respect_robots_txt_file: + return None + origin_url = str(URL(url).origin()) + robots_txt_file = self.robots_txt_file_cache[origin_url] + if robots_txt_file: + return robots_txt_file + robots_txt_file = await RobotsTxtFile.find(url, None, self._http_client) + self.robots_txt_file_cache[origin_url] = robots_txt_file + return robots_txt_file From 638b5be245851b91fd578dacd06ebb6c9d92e826 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 18:59:27 +0000 Subject: [PATCH 02/20] add respect robots_txt_file --- .../_abstract_http/_abstract_http_crawler.py | 7 
+++++ src/crawlee/crawlers/_basic/_basic_crawler.py | 31 ++++++++++++++++++- .../_playwright/_playwright_crawler.py | 7 +++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 04e16683f6..1ca3600542 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -159,12 +159,19 @@ async def extract_links( requests = list[Request]() base_user_data = user_data or {} + robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url) + for link in self._parser.find_links(parsed_content, selector=selector): url = link if not is_url_absolute(url): base_url = context.request.loaded_url or context.request.url url = convert_to_absolute_url(base_url, url) + if robots_txt_file and not robots_txt_file.is_allowed(url): + # add processing with on_skiped_request callback or handler? + context.log.warning(f'Skipping URL "{url}" due to robots.txt rules.') + continue + request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label) if transform_request_function: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 295c959f8a..89c96d8ca5 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -655,10 +655,24 @@ async def add_requests( wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning. wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added. """ + allowed_requests = [] + skipped = [] + + for request in requests: + check_url = request.url if isinstance(request, Request) else request + if await self._is_allowed_based_on_robots_txt_file(check_url): + allowed_requests.append(request) + else: + skipped.append(request) + + if skipped: + # add processing with on_skiped_request callback or handler? + self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file') + request_manager = await self.get_request_manager() await request_manager.add_requests_batched( - requests=requests, + requests=allowed_requests, batch_size=batch_size, wait_time_between_batches=wait_time_between_batches, wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added, @@ -1090,6 +1104,21 @@ async def __run_task_function(self) -> None: if request is None: return + if not (await self._is_allowed_based_on_robots_txt_file(request.url)): + self._logger.warning( + f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt' + ) + await wait_for( + lambda: request_manager.mark_request_as_handled(request), + timeout=self._internal_timeout, + timeout_message='Marking request as handled timed out after ' + f'{self._internal_timeout.total_seconds()} seconds', + logger=self._logger, + max_retries=3, + ) + # add processing with on_skiped_request callback or handler? 
+ return + if request.session_id: session = await self._get_session_by_id(request.session_id) else: diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 468ce01e02..1f2920eb57 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -290,6 +290,8 @@ async def extract_links( elements = await context.page.query_selector_all(selector) + robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url) + for element in elements: url = await element.get_attribute('href') @@ -300,6 +302,11 @@ async def extract_links( base_url = context.request.loaded_url or context.request.url url = convert_to_absolute_url(base_url, url) + if robots_txt_file and not robots_txt_file.is_allowed(url): + # add processing with on_skiped_request callback or handler? + context.log.warning(f'Skipping URL "{url}" due to robots.txt rules.') + continue + request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label}) if transform_request_function: From 33be1c843bd0e5f9a65388e35f6bbf57dfd3ee28 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 19:31:15 +0000 Subject: [PATCH 03/20] update load --- src/crawlee/_utils/robots.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index d04ae83a0c..076bf31202 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -43,6 +43,8 @@ async def load( response = await http_client.send_request(url, proxy_info=proxy_info) if is_status_code_client_error(response.status_code): robots.allow_all = True # type: ignore[attr-defined] # allow_all is a valid RobotFileParser + body = response.read() + robots.parse(body.decode('utf-8').splitlines()) return RobotsTxtFile(robots) def is_allowed(self, url: str, user_agent: str = '*') -> bool: From a44dff13e941909002cc07675f1338f809bfb627 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 22:12:48 +0000 Subject: [PATCH 04/20] change `RobotFileParser` to `Protego` --- pyproject.toml | 1 + src/crawlee/_utils/robots.py | 44 ++++++++----------- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- uv.lock | 11 +++++ 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 96db0b8cd0..c87a9dc4b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ "eval-type-backport>=0.2.0", "httpx[brotli,http2,zstd]>=0.27.0", "more-itertools>=10.2.0", + "protego>=0.4.0", "psutil>=6.0.0", "pydantic-settings>=2.2.0,<2.7.0", "pydantic>=2.8.0,!=2.10.0,!=2.10.1,!=2.10.2", diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index 076bf31202..a3ccdd3810 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -1,9 +1,8 @@ from __future__ import annotations -from asyncio import to_thread from typing import TYPE_CHECKING -from urllib.robotparser import RobotFileParser +from protego import Protego # type: ignore[import-untyped] from yarl import URL from crawlee._utils.web import is_status_code_client_error @@ -14,46 +13,41 @@ class RobotsTxtFile: - def __init__(self, robots: RobotFileParser) -> None: + def __init__(self, url: str, robots: Protego) -> None: self._robots = robots + self._original_url = URL(url).origin() @staticmethod async def from_content(url: str, content: str) -> RobotsTxtFile: - robots = RobotFileParser(url=url) - 
robots.parse(content.splitlines()) - return RobotsTxtFile(robots) + robots = Protego.parse(content) + return RobotsTxtFile(url, robots) @staticmethod - async def find( - url: str, proxy_info: ProxyInfo | None = None, http_client: HttpClient | None = None - ) -> RobotsTxtFile: + async def find(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> RobotsTxtFile: """Find the robots.txt file for a given URL.""" robots_url = URL(url).with_path('/robots.txt') - return await RobotsTxtFile.load(str(robots_url), proxy_info, http_client) + return await RobotsTxtFile.load(str(robots_url), http_client, proxy_info) @staticmethod - async def load( - url: str, proxy_info: ProxyInfo | None = None, http_client: HttpClient | None = None - ) -> RobotsTxtFile: + async def load(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> RobotsTxtFile: """Load the robots.txt file for a given URL.""" - robots = RobotFileParser(url=url) - if http_client is None: - await to_thread(robots.read) - else: - response = await http_client.send_request(url, proxy_info=proxy_info) - if is_status_code_client_error(response.status_code): - robots.allow_all = True # type: ignore[attr-defined] # allow_all is a valid RobotFileParser - body = response.read() - robots.parse(body.decode('utf-8').splitlines()) - return RobotsTxtFile(robots) + response = await http_client.send_request(url, proxy_info=proxy_info) + body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read() + + robots = Protego.parse(body.decode('utf-8')) + + return RobotsTxtFile(url, robots) def is_allowed(self, url: str, user_agent: str = '*') -> bool: """Check if the given URL is allowed for the given user agent.""" - return self._robots.can_fetch(user_agent, url) + check_url = URL(url) + if check_url.origin() != self._original_url: + return True + return bool(self._robots.can_fetch(str(check_url), user_agent)) def get_sitemaps(self) -> list[str]: """Get the list of sitemaps from the robots.txt file.""" - return self._robots.site_maps() or [] + return list(self._robots.sitemaps) def get_crawl_delay(self, user_agent: str = '*') -> int | None: """Get the crawl delay for the given user agent.""" diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 89c96d8ca5..03fb78bb54 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1320,6 +1320,6 @@ async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: robots_txt_file = self.robots_txt_file_cache[origin_url] if robots_txt_file: return robots_txt_file - robots_txt_file = await RobotsTxtFile.find(url, None, self._http_client) + robots_txt_file = await RobotsTxtFile.find(url, self._http_client) self.robots_txt_file_cache[origin_url] = robots_txt_file return robots_txt_file diff --git a/uv.lock b/uv.lock index c28568f5ae..a29bda7509 100644 --- a/uv.lock +++ b/uv.lock @@ -610,6 +610,7 @@ dependencies = [ { name = "eval-type-backport" }, { name = "httpx", extra = ["brotli", "http2", "zstd"] }, { name = "more-itertools" }, + { name = "protego" }, { name = "psutil" }, { name = "pydantic" }, { name = "pydantic-settings" }, @@ -711,6 +712,7 @@ requires-dist = [ { name = "playwright", marker = "extra == 'adaptive-crawler'", specifier = ">=1.27.0" }, { name = "playwright", marker = "extra == 'all'", specifier = ">=1.27.0" }, { name = "playwright", marker = "extra == 'playwright'", specifier = ">=1.27.0" }, + { 
name = "protego", specifier = ">=0.4.0" }, { name = "psutil", specifier = ">=6.0.0" }, { name = "pydantic", specifier = ">=2.8.0,!=2.10.0,!=2.10.1,!=2.10.2" }, { name = "pydantic-settings", specifier = ">=2.2.0,<2.7.0" }, @@ -1938,6 +1940,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b8/d3/c3cb8f1d6ae3b37f83e1de806713a9b3642c5895f0215a62e1a4bd6e5e34/propcache-0.3.1-py3-none-any.whl", hash = "sha256:9a8ecf38de50a7f518c21568c80f985e776397b902f1ce0b01f799aba1608b40", size = 12376 }, ] +[[package]] +name = "protego" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4e/6b/84e878d0567dfc11538bad6ce2595cee7ae0c47cf6bf7293683c9ec78ef8/protego-0.4.0.tar.gz", hash = "sha256:93a5e662b61399a0e1f208a324f2c6ea95b23ee39e6cbf2c96246da4a656c2f6", size = 3246425 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/fd/8d84d75832b0983cecf3aff7ae48362fe96fc8ab6ebca9dcf3cefd87e79c/Protego-0.4.0-py2.py3-none-any.whl", hash = "sha256:37640bc0ebe37572d624453a21381d05e9d86e44f89ff1e81794d185a0491666", size = 8553 }, +] + [[package]] name = "proxy-py" version = "2.4.10" From 538672e386af059cd3492dfd92f733664a6da985 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 22:13:15 +0000 Subject: [PATCH 05/20] add tests --- tests/unit/_utils/test_robots.py | 49 ++++++++++++++++++++++++++++++++ tests/unit/server.py | 24 ++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 tests/unit/_utils/test_robots.py diff --git a/tests/unit/_utils/test_robots.py b/tests/unit/_utils/test_robots.py new file mode 100644 index 0000000000..3e410b503c --- /dev/null +++ b/tests/unit/_utils/test_robots.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from crawlee._utils.robots import RobotsTxtFile + +if TYPE_CHECKING: + from yarl import URL + + from crawlee.http_clients._base import HttpClient + + +async def test_generation_robots_txt_url(server_url: URL, http_client: HttpClient) -> None: + robots_file = await RobotsTxtFile.find(str(server_url), http_client) + assert len(robots_file.get_sitemaps()) > 0 + + +async def test_allow_disallow_robots_txt(server_url: URL, http_client: HttpClient) -> None: + robots = await RobotsTxtFile.find(str(server_url), http_client) + assert robots.is_allowed('https://crawlee.dev') + assert robots.is_allowed(str(server_url / 'something/page.html')) + assert robots.is_allowed(str(server_url / 'deny_googlebot/page.html')) + assert not robots.is_allowed(str(server_url / 'deny_all/page.html')) + + +async def test_extract_sitemaps_urls(server_url: URL, http_client: HttpClient) -> None: + robots = await RobotsTxtFile.find(str(server_url), http_client) + assert len(robots.get_sitemaps()) == 2 + assert set(robots.get_sitemaps()) == {'http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml'} + + +async def test_parse_from_context() -> None: + content = """User-agent: * + Disallow: *deny_all/ + crawl-delay: 10 + User-agent: Googlebot + Disallow: *deny_googlebot/""" + robots = await RobotsTxtFile.from_content('http://not-exists.com/robots.txt', content) + assert robots.is_allowed('http://not-exists.com/something/page.html') + assert robots.is_allowed('http://not-exists.com/deny_googlebot/page.html') + assert not robots.is_allowed('http://not-exists.com/deny_googlebot/page.html', 'Googlebot') + assert not robots.is_allowed('http://not-exists.com/deny_all/page.html') + + +async def test_bind_robots_txt_url() -> None: + 
content = 'User-agent: *\nDisallow: /' + robots = await RobotsTxtFile.from_content('http://check.com/robots.txt', content) + assert not robots.is_allowed('http://check.com/test.html') + assert robots.is_allowed('http://othercheck.com/robots.txt') diff --git a/tests/unit/server.py b/tests/unit/server.py index 29e789d013..c180bc3fc0 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -120,6 +120,8 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None: await hello_world_json(send) elif path.startswith('/xml'): await hello_world_xml(send) + elif path.startswith('/robots.txt'): + await robots_txt(send) else: await hello_world(send) @@ -366,6 +368,28 @@ async def dynamic_content(scope: dict[str, Any], send: Send) -> None: await send_html_response(send, html_content=content.encode()) +async def robots_txt(send: Send) -> None: + """Handle requests for the robots.txt file.""" + body = b'\n'.join( + [ + b'User-agent: *', + b'Disallow: *deny_all/', + b'crawl-delay: 10', + b'', + b'User-agent: Googlebot', + b'Disallow: *deny_googlebot/', + b'crawl-delay: 1', + b'', + b'user-agent: Mozilla', + b'crawl-delay: 2', + b'', + b'sitemap: http://not-exists.com/sitemap_1.xml', + b'sitemap: http://not-exists.com/sitemap_2.xml', + ] + ) + await send_html_response(send, body) + + class TestServer(Server): """A test HTTP server implementation based on Uvicorn Server.""" From b9b35be035a440824e38a099f87237c48de3d990 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 22:51:14 +0000 Subject: [PATCH 06/20] fix --- src/crawlee/crawlers/_basic/_basic_crawler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 03fb78bb54..5ad104f57d 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -380,7 +380,7 @@ def __init__( self._additional_context_managers = _additional_context_managers or [] # Internal, not explicitly configurable components - self.robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000) + self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000) self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name) self._snapshotter = Snapshotter.from_config(config) self._autoscaled_pool = AutoscaledPool( @@ -1317,9 +1317,9 @@ async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: if not self._respect_robots_txt_file: return None origin_url = str(URL(url).origin()) - robots_txt_file = self.robots_txt_file_cache[origin_url] + robots_txt_file = self._robots_txt_file_cache.get(origin_url) if robots_txt_file: return robots_txt_file robots_txt_file = await RobotsTxtFile.find(url, self._http_client) - self.robots_txt_file_cache[origin_url] = robots_txt_file + self._robots_txt_file_cache[origin_url] = robots_txt_file return robots_txt_file From a49ab66d4e33427df7887870b41453cf4584e75a Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 22:51:48 +0000 Subject: [PATCH 07/20] update tests --- .../test_beautifulsoup_crawler.py | 18 ++++++++++++ .../crawlers/_parsel/test_parsel_crawler.py | 18 ++++++++++++ .../_playwright/test_playwright_crawler.py | 18 ++++++++++++ tests/unit/server.py | 28 ++++++------------- tests/unit/server_endpoints.py | 17 +++++++++++ 5 files changed, 80 insertions(+), 19 deletions(-) diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py 
b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index 167391dc6f..b73ea4aeaa 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -142,3 +142,21 @@ async def test_handle_blocked_request(server_url: URL, http_client: HttpClient) def test_default_logger() -> None: assert BeautifulSoupCrawler().log.name == 'BeautifulSoupCrawler' + + +async def test_respect_robots_txt(server_url: URL, http_client: HttpClient) -> None: + crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True) + visit = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links() + + await crawler.run([str(server_url / 'start_enqueue')]) + visited = {call[0][0] for call in visit.call_args_list} + + assert visited == { + str(server_url / 'start_enqueue'), + str(server_url / 'sub_index'), + } diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index 477c091050..586962eac7 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -239,3 +239,21 @@ async def request_handler(context: ParselCrawlingContext) -> None: def test_default_logger() -> None: assert ParselCrawler().log.name == 'ParselCrawler' + + +async def test_respect_robots_txt(server_url: URL, http_client: HttpClient) -> None: + crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True) + visit = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links() + + await crawler.run([str(server_url / 'start_enqueue')]) + visited = {call[0][0] for call in visit.call_args_list} + + assert visited == { + str(server_url / 'start_enqueue'), + str(server_url / 'sub_index'), + } diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 9123c30904..ec727d3ad8 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -563,3 +563,21 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: assert crawler.statistics.error_tracker.total == 3 * max_retries assert crawler.statistics.error_tracker.unique_error_count == 2 assert len(kvs_content) == 4 + + +async def test_respect_robots_txt(server_url: URL) -> None: + crawler = PlaywrightCrawler(respect_robots_txt_file=True) + visit = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links() + + await crawler.run([str(server_url / 'start_enqueue')]) + visited = {call[0][0] for call in visit.call_args_list} + + assert visited == { + str(server_url / 'start_enqueue'), + str(server_url / 'sub_index'), + } diff --git a/tests/unit/server.py b/tests/unit/server.py index c180bc3fc0..21ba01cec8 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -11,7 +11,14 @@ from uvicorn.server import Server from yarl import URL -from tests.unit.server_endpoints import GENERIC_RESPONSE, HELLO_WORLD, INCAPSULA, SECONDARY_INDEX, START_ENQUEUE +from tests.unit.server_endpoints import ( + GENERIC_RESPONSE, + HELLO_WORLD, + 
INCAPSULA, + ROBOTS_TXT, + SECONDARY_INDEX, + START_ENQUEUE, +) if TYPE_CHECKING: from socket import socket @@ -370,24 +377,7 @@ async def dynamic_content(scope: dict[str, Any], send: Send) -> None: async def robots_txt(send: Send) -> None: """Handle requests for the robots.txt file.""" - body = b'\n'.join( - [ - b'User-agent: *', - b'Disallow: *deny_all/', - b'crawl-delay: 10', - b'', - b'User-agent: Googlebot', - b'Disallow: *deny_googlebot/', - b'crawl-delay: 1', - b'', - b'user-agent: Mozilla', - b'crawl-delay: 2', - b'', - b'sitemap: http://not-exists.com/sitemap_1.xml', - b'sitemap: http://not-exists.com/sitemap_2.xml', - ] - ) - await send_html_response(send, body) + await send_html_response(send, ROBOTS_TXT) class TestServer(Server): diff --git a/tests/unit/server_endpoints.py b/tests/unit/server_endpoints.py index 00456d3dcd..a9f48e6e47 100644 --- a/tests/unit/server_endpoints.py +++ b/tests/unit/server_endpoints.py @@ -41,3 +41,20 @@ Insightful content """ + + +ROBOTS_TXT = b"""\ +User-agent: * +Disallow: *deny_all/ +Disallow: /page_ +crawl-delay: 10 + +User-agent: Googlebot +Disallow: *deny_googlebot/ +crawl-delay: 1 + +user-agent: Mozilla +crawl-delay: 2 + +sitemap: http://not-exists.com/sitemap_1.xml +sitemap: http://not-exists.com/sitemap_2.xml""" From 46a2356b6f78fa54d27af57453e6c5b4bfd3680a Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 23:02:50 +0000 Subject: [PATCH 08/20] update TODO comments --- .../crawlers/_abstract_http/_abstract_http_crawler.py | 3 ++- src/crawlee/crawlers/_basic/_basic_crawler.py | 6 ++++-- src/crawlee/crawlers/_playwright/_playwright_crawler.py | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 1ca3600542..664fe81ba8 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -168,7 +168,8 @@ async def extract_links( url = convert_to_absolute_url(base_url, url) if robots_txt_file and not robots_txt_file.is_allowed(url): - # add processing with on_skiped_request callback or handler? + # TODO: https://github.com/apify/crawlee-python/issues/1160 + # add processing with on_skiped_request hook context.log.warning(f'Skipping URL "{url}" due to robots.txt rules.') continue diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 5ad104f57d..d1631da1ad 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -666,7 +666,8 @@ async def add_requests( skipped.append(request) if skipped: - # add processing with on_skiped_request callback or handler? + # TODO: https://github.com/apify/crawlee-python/issues/1160 + # add processing with on_skiped_request hook self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file') request_manager = await self.get_request_manager() @@ -1116,7 +1117,8 @@ async def __run_task_function(self) -> None: logger=self._logger, max_retries=3, ) - # add processing with on_skiped_request callback or handler? 
+ # TODO: https://github.com/apify/crawlee-python/issues/1160 + # add processing with on_skiped_request hook return if request.session_id: diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 1f2920eb57..7dff289142 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -303,7 +303,8 @@ async def extract_links( url = convert_to_absolute_url(base_url, url) if robots_txt_file and not robots_txt_file.is_allowed(url): - # add processing with on_skiped_request callback or handler? + # TODO: https://github.com/apify/crawlee-python/issues/1160 + # add processing with on_skiped_request hook context.log.warning(f'Skipping URL "{url}" due to robots.txt rules.') continue From 10077b636df40cea91250f37dcbf2a057e00f2ec Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 23:23:13 +0000 Subject: [PATCH 09/20] update docstrings --- src/crawlee/_utils/robots.py | 37 ++++++++++++++++--- src/crawlee/crawlers/_basic/_basic_crawler.py | 15 ++++++-- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index a3ccdd3810..70a3458b94 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -19,18 +19,36 @@ def __init__(self, url: str, robots: Protego) -> None: @staticmethod async def from_content(url: str, content: str) -> RobotsTxtFile: + """Create a RobotsTxtFile instance from the given content. + + Args: + url: the URL of the robots.txt file + content: the content of the robots.txt file + """ robots = Protego.parse(content) return RobotsTxtFile(url, robots) @staticmethod async def find(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> RobotsTxtFile: - """Find the robots.txt file for a given URL.""" + """Determine the location of a robots.txt file for a URL and fetch it. + + Args: + url: the URL to fetch robots.txt for + proxy_info: a `ProxyInfo` to be used for fetching the robots.txt file + http_client: the HTTP client to use for fetching the robots.txt file + """ robots_url = URL(url).with_path('/robots.txt') return await RobotsTxtFile.load(str(robots_url), http_client, proxy_info) @staticmethod async def load(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> RobotsTxtFile: - """Load the robots.txt file for a given URL.""" + """Load the robots.txt file for a given URL. + + Args: + url: the URL to fetch robots.txt for + proxy_info: a `ProxyInfo` to be used for fetching the robots.txt file + http_client: the HTTP client to use for fetching the robots.txt file + """ response = await http_client.send_request(url, proxy_info=proxy_info) body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read() @@ -39,17 +57,26 @@ async def load(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = return RobotsTxtFile(url, robots) def is_allowed(self, url: str, user_agent: str = '*') -> bool: - """Check if the given URL is allowed for the given user agent.""" + """Check if the given URL is allowed for the given user agent. 
+ + Args: + url: the URL to check + user_agent: the user agent to check for + """ check_url = URL(url) if check_url.origin() != self._original_url: return True return bool(self._robots.can_fetch(str(check_url), user_agent)) def get_sitemaps(self) -> list[str]: - """Get the list of sitemaps from the robots.txt file.""" + """Get the list of sitemaps urls from the robots.txt file.""" return list(self._robots.sitemaps) def get_crawl_delay(self, user_agent: str = '*') -> int | None: - """Get the crawl delay for the given user agent.""" + """Get the crawl delay for the given user agent. + + Args: + user_agent: the user-agent to check for + """ crawl_delay = self._robots.crawl_delay(user_agent) return int(crawl_delay) if crawl_delay is not None else None diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index d1631da1ad..3019425d3c 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -162,7 +162,8 @@ class _BasicCrawlerOptions(TypedDict): subclasses rather than direct instantiation of `BasicCrawler`.""" respect_robots_txt_file: NotRequired[bool] - """""" + """If set to `true`, the crawler will automatically try to fetch the robots.txt file for each domain, + and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`.""" class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict): @@ -1308,14 +1309,22 @@ def _check_request_collision(self, request: Request, session: Session | None) -> ) async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool: - """Check if the URL is allowed based on the robots.txt file.""" + """Check if the URL is allowed based on the robots.txt file. + + Args: + url: The URL to check. + """ if not self._respect_robots_txt_file: return True robots_txt_file = await self._get_robots_txt_file_for_url(url) return not robots_txt_file or robots_txt_file.is_allowed(url) async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: - """Get the RobotsTxtFile for a given URL.""" + """Get the RobotsTxtFile for a given URL. + + Args: + url: URL on the basis of which the RobotsTxtFile will be obtained. 
+ """ if not self._respect_robots_txt_file: return None origin_url = str(URL(url).origin()) From b3e97890a4515ffe58a2e6da4611fe9debceea79 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Fri, 18 Apr 2025 02:44:43 +0300 Subject: [PATCH 10/20] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py | 3 +-- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- src/crawlee/crawlers/_playwright/_playwright_crawler.py | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 664fe81ba8..9abcb4c6f5 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -169,8 +169,7 @@ async def extract_links( if robots_txt_file and not robots_txt_file.is_allowed(url): # TODO: https://github.com/apify/crawlee-python/issues/1160 - # add processing with on_skiped_request hook - context.log.warning(f'Skipping URL "{url}" due to robots.txt rules.') + # add processing with on_skipped_request hook continue request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 3019425d3c..9851b1771b 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -668,7 +668,7 @@ async def add_requests( if skipped: # TODO: https://github.com/apify/crawlee-python/issues/1160 - # add processing with on_skiped_request hook + # add processing with on_skipped_request hook self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file') request_manager = await self.get_request_manager() diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 7dff289142..8055ebc5e7 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -304,8 +304,7 @@ async def extract_links( if robots_txt_file and not robots_txt_file.is_allowed(url): # TODO: https://github.com/apify/crawlee-python/issues/1160 - # add processing with on_skiped_request hook - context.log.warning(f'Skipping URL "{url}" due to robots.txt rules.') + # add processing with on_skipped_request hook continue request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label}) From 4f4529e1bec25b89a8f0f8baf0c996cc8c2b7e34 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Wed, 23 Apr 2025 14:40:26 +0300 Subject: [PATCH 11/20] Update src/crawlee/crawlers/_basic/_basic_crawler.py Co-authored-by: Jan Buchar --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 9851b1771b..f4fae3cc84 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -162,7 +162,7 @@ class _BasicCrawlerOptions(TypedDict): subclasses rather than direct instantiation of `BasicCrawler`.""" respect_robots_txt_file: NotRequired[bool] - """If set to `true`, the crawler will 
automatically try to fetch the robots.txt file for each domain, + """If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`.""" From 8973618554328864e34a5a26860e0a8a71d44daf Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 23:27:18 +0000 Subject: [PATCH 12/20] fix docstrings --- src/crawlee/crawlers/_basic/_basic_crawler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index f4fae3cc84..60f2fdbe61 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -288,7 +288,9 @@ def __init__( configure_logging: If True, the crawler will set up logging infrastructure automatically. statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain text log messages. - respect_robots_txt_file: If True, the crawler will respect the robots.txt file of the target website. + respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file + for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added + via `EnqueueLinksFunction` _context_pipeline: Enables extending the request lifecycle and modifying the crawling context. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`. _additional_context_managers: Additional context managers used throughout the crawler lifecycle. From 73a7bc6f2a90b000a6c5774eb629b18b219c0c99 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 23 Apr 2025 11:48:45 +0000 Subject: [PATCH 13/20] change staticmethod to classmethod --- src/crawlee/_utils/robots.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index 70a3458b94..f83666d66c 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -8,6 +8,8 @@ from crawlee._utils.web import is_status_code_client_error if TYPE_CHECKING: + from typing_extensions import Self + from crawlee.http_clients import HttpClient from crawlee.proxy_configuration import ProxyInfo @@ -17,8 +19,8 @@ def __init__(self, url: str, robots: Protego) -> None: self._robots = robots self._original_url = URL(url).origin() - @staticmethod - async def from_content(url: str, content: str) -> RobotsTxtFile: + @classmethod + async def from_content(cls, url: str, content: str) -> Self: """Create a RobotsTxtFile instance from the given content. Args: @@ -26,10 +28,10 @@ async def from_content(url: str, content: str) -> RobotsTxtFile: content: the content of the robots.txt file """ robots = Protego.parse(content) - return RobotsTxtFile(url, robots) + return cls(url, robots) - @staticmethod - async def find(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> RobotsTxtFile: + @classmethod + async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self: """Determine the location of a robots.txt file for a URL and fetch it. 
Args: @@ -38,10 +40,10 @@ async def find(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = http_client: the HTTP client to use for fetching the robots.txt file """ robots_url = URL(url).with_path('/robots.txt') - return await RobotsTxtFile.load(str(robots_url), http_client, proxy_info) + return await cls.load(str(robots_url), http_client, proxy_info) - @staticmethod - async def load(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> RobotsTxtFile: + @classmethod + async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self: """Load the robots.txt file for a given URL. Args: @@ -54,7 +56,7 @@ async def load(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = robots = Protego.parse(body.decode('utf-8')) - return RobotsTxtFile(url, robots) + return cls(url, robots) def is_allowed(self, url: str, user_agent: str = '*') -> bool: """Check if the given URL is allowed for the given user agent. From 8039fb5aaf6a8940e20323fc669c373c60a1fed1 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Wed, 23 Apr 2025 22:40:52 +0300 Subject: [PATCH 14/20] Update src/crawlee/_utils/robots.py Co-authored-by: Vlada Dusek --- src/crawlee/_utils/robots.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index f83666d66c..41743a756b 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -21,7 +21,7 @@ def __init__(self, url: str, robots: Protego) -> None: @classmethod async def from_content(cls, url: str, content: str) -> Self: - """Create a RobotsTxtFile instance from the given content. + """Create a `RobotsTxtFile` instance from the given content. 
Args: url: the URL of the robots.txt file From 125804c513a2b4123371a9e85d79b81580ce444d Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 23 Apr 2025 20:59:58 +0000 Subject: [PATCH 15/20] add _robots_txt_locks_cache --- src/crawlee/crawlers/_basic/_basic_crawler.py | 20 ++++++++++++++++--- tests/unit/_utils/test_robots.py | 2 +- .../crawlers/_basic/test_basic_crawler.py | 15 +++++++++++++- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 60f2fdbe61..9833015d37 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -384,6 +384,7 @@ def __init__( # Internal, not explicitly configurable components self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000) + self._robots_txt_locks_cache: LRUCache[str, asyncio.Lock] = LRUCache(maxsize=100) self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name) self._snapshotter = Snapshotter.from_config(config) self._autoscaled_pool = AutoscaledPool( @@ -1333,6 +1334,19 @@ async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: robots_txt_file = self._robots_txt_file_cache.get(origin_url) if robots_txt_file: return robots_txt_file - robots_txt_file = await RobotsTxtFile.find(url, self._http_client) - self._robots_txt_file_cache[origin_url] = robots_txt_file - return robots_txt_file + + txt_file_lock = self._robots_txt_locks_cache.get(origin_url) + if txt_file_lock is None: + txt_file_lock = asyncio.Lock() + self._robots_txt_locks_cache[origin_url] = txt_file_lock + + async with txt_file_lock: + # Check again if the robots.txt file is already cached after acquiring the lock + robots_txt_file = self._robots_txt_file_cache.get(origin_url) + if robots_txt_file: + return robots_txt_file + + # If not cached, fetch the robots.txt file + robots_txt_file = await RobotsTxtFile.find(url, self._http_client) + self._robots_txt_file_cache[origin_url] = robots_txt_file + return robots_txt_file diff --git a/tests/unit/_utils/test_robots.py b/tests/unit/_utils/test_robots.py index 3e410b503c..61dc60daa5 100644 --- a/tests/unit/_utils/test_robots.py +++ b/tests/unit/_utils/test_robots.py @@ -29,7 +29,7 @@ async def test_extract_sitemaps_urls(server_url: URL, http_client: HttpClient) - assert set(robots.get_sitemaps()) == {'http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml'} -async def test_parse_from_context() -> None: +async def test_parse_from_content() -> None: content = """User-agent: * Disallow: *deny_all/ crawl-delay: 10 diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index e0e2c84f60..f7bf678fd2 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -12,13 +12,14 @@ from datetime import timedelta from pathlib import Path from typing import TYPE_CHECKING, Any, Literal, cast -from unittest.mock import AsyncMock, Mock, call +from unittest.mock import AsyncMock, Mock, call, patch import pytest from crawlee import ConcurrencySettings, Glob, service_locator from crawlee._request import Request from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpHeaders +from crawlee._utils.robots import RobotsTxtFile from crawlee.configuration import Configuration from crawlee.crawlers import BasicCrawler from crawlee.errors import RequestCollisionError, SessionError, 
UserDefinedErrorHandlerError @@ -1291,3 +1292,15 @@ async def error_req_hook(context: BasicCrawlingContext, error: Exception) -> Non await crawler.run(requests) assert error_handler_mock.call_count == 1 + + +async def test_lock_with_get_robots_txt_file_for_url(server_url: URL) -> None: + crawler = BasicCrawler(respect_robots_txt_file=True) + + with patch('crawlee.crawlers._basic._basic_crawler.RobotsTxtFile.find', wraps=RobotsTxtFile.find) as spy: + await asyncio.gather( + *[asyncio.create_task(crawler._get_robots_txt_file_for_url(str(server_url))) for _ in range(10)] + ) + + # Check that the lock was acquired only once + assert spy.call_count == 1 From 4b7346b715a9c1837e1f731b65e657192d9f9d6e Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 23 Apr 2025 21:21:46 +0000 Subject: [PATCH 16/20] update `pyproject.toml` --- pyproject.toml | 2 ++ src/crawlee/_utils/robots.py | 2 +- src/crawlee/storage_clients/_memory/_request_queue_client.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c87a9dc4b9..b014038131 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -237,7 +237,9 @@ module = [ "functions_framework", # Example code shows deploy on Google Cloud. "jaro", # Untyped and stubs not available "loguru", # Example code shows integration of loguru and crawlee for JSON logging. + "protego", # Untyped and stubs not available "sklearn.linear_model", # Untyped and stubs not available + "sortedcollections", # Untyped and stubs not available "cookiecutter.*", # Untyped and stubs not available "inquirer.*", # Untyped and stubs not available ] diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index 41743a756b..01349b49ff 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from protego import Protego # type: ignore[import-untyped] +from protego import Protego from yarl import URL from crawlee._utils.web import is_status_code_client_error diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index 0031e54abd..477d53df07 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -8,7 +8,7 @@ from logging import getLogger from typing import TYPE_CHECKING -from sortedcollections import ValueSortedDict # type: ignore[import-untyped] +from sortedcollections import ValueSortedDict from typing_extensions import override from crawlee._types import StorageTypes From e6099edf0f98eb6750aeae5f52431a495a2435bb Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 23 Apr 2025 22:00:21 +0000 Subject: [PATCH 17/20] update docstrings --- src/crawlee/_utils/robots.py | 23 ++++++++++--------- src/crawlee/crawlers/_basic/_basic_crawler.py | 4 ++-- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index 01349b49ff..930ae09431 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -24,8 +24,8 @@ async def from_content(cls, url: str, content: str) -> Self: """Create a `RobotsTxtFile` instance from the given content. Args: - url: the URL of the robots.txt file - content: the content of the robots.txt file + url: The URL associated with the robots.txt file. + content: The raw string content of the robots.txt file to be parsed. 
""" robots = Protego.parse(content) return cls(url, robots) @@ -35,9 +35,9 @@ async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N """Determine the location of a robots.txt file for a URL and fetch it. Args: - url: the URL to fetch robots.txt for - proxy_info: a `ProxyInfo` to be used for fetching the robots.txt file - http_client: the HTTP client to use for fetching the robots.txt file + url: The URL whose domain will be used to find the corresponding robots.txt file. + http_client: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used. + proxy_info: The `HttpClient` instance used to perform the network request for fetching the robots.txt file. """ robots_url = URL(url).with_path('/robots.txt') return await cls.load(str(robots_url), http_client, proxy_info) @@ -47,9 +47,9 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N """Load the robots.txt file for a given URL. Args: - url: the URL to fetch robots.txt for - proxy_info: a `ProxyInfo` to be used for fetching the robots.txt file - http_client: the HTTP client to use for fetching the robots.txt file + url: The direct URL of the robots.txt file to be loaded. + http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file. + proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used. """ response = await http_client.send_request(url, proxy_info=proxy_info) body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read() @@ -62,8 +62,8 @@ def is_allowed(self, url: str, user_agent: str = '*') -> bool: """Check if the given URL is allowed for the given user agent. Args: - url: the URL to check - user_agent: the user agent to check for + url: The URL to check against the robots.txt rules. + user_agent: The user-agent string to check permissions for. Defaults to '*' which matches any user-agent. """ check_url = URL(url) if check_url.origin() != self._original_url: @@ -78,7 +78,8 @@ def get_crawl_delay(self, user_agent: str = '*') -> int | None: """Get the crawl delay for the given user agent. Args: - user_agent: the user-agent to check for + user_agent: The user-agent string to check the crawl delay for. Defaults to '*' which matches any + user-agent. """ crawl_delay = self._robots.crawl_delay(user_agent) return int(crawl_delay) if crawl_delay is not None else None diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 9833015d37..e4d49c036e 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1315,7 +1315,7 @@ async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool: """Check if the URL is allowed based on the robots.txt file. Args: - url: The URL to check. + url: The URL to verify against robots.txt rules. Returns True if crawling this URL is permitted. """ if not self._respect_robots_txt_file: return True @@ -1326,7 +1326,7 @@ async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: """Get the RobotsTxtFile for a given URL. Args: - url: URL on the basis of which the RobotsTxtFile will be obtained. + url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file. 
""" if not self._respect_robots_txt_file: return None From 41b803dfca67e4de826853eecab6f7b4f1f897e3 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 24 Apr 2025 00:48:52 +0000 Subject: [PATCH 18/20] add docs example --- .../code_examples/respect_robots_txt_file.py | 27 +++++++++++++++++++ docs/examples/respect_robots_txt_file.mdx | 21 +++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 docs/examples/code_examples/respect_robots_txt_file.py create mode 100644 docs/examples/respect_robots_txt_file.mdx diff --git a/docs/examples/code_examples/respect_robots_txt_file.py b/docs/examples/code_examples/respect_robots_txt_file.py new file mode 100644 index 0000000000..ebd63b1c2e --- /dev/null +++ b/docs/examples/code_examples/respect_robots_txt_file.py @@ -0,0 +1,27 @@ +import asyncio + +from crawlee.crawlers import ( + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, +) + + +async def main() -> None: + # Initialize the crawler with robots.txt compliance enabled + crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Start the crawler with the specified URLs + # The crawler will check the robots.txt file before making requests + # In this example, 'https://news.ycombinator.com/login' will be skipped + # because it's disallowed in the site's robots.txt file + await crawler.run( + ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/examples/respect_robots_txt_file.mdx b/docs/examples/respect_robots_txt_file.mdx new file mode 100644 index 0000000000..5f6194c919 --- /dev/null +++ b/docs/examples/respect_robots_txt_file.mdx @@ -0,0 +1,21 @@ +--- +id: respect-robots-txt-file +title: Respect robots.txt file +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py'; + +This example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file. + +To configure `Crawlee` to follow the `robots.txt` file, set the parameter `respect_robots_txt_file=True` in `BasicCrawlerOptions`. In this case, `Crawlee` will skip any URLs forbidden in the website's robots.txt file. + +As an example, let's look at the website `https://news.ycombinator.com/` and its corresponding [robots.txt](https://news.ycombinator.com/robots.txt) file. Since the file has a rule `Disallow: /login`, the URL `https://news.ycombinator.com/login` will be automatically skipped. 
+ +The code below demonstrates this behavior using the `BeautifulSoupCrawler`: + + + {RespectRobotsTxt} + From 8f25d838ff095ae38ea6dcd64accdf143e06e982 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 24 Apr 2025 01:08:33 +0000 Subject: [PATCH 19/20] update comment --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index e4d49c036e..99e4045a9e 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1122,7 +1122,7 @@ async def __run_task_function(self) -> None: max_retries=3, ) # TODO: https://github.com/apify/crawlee-python/issues/1160 - # add processing with on_skiped_request hook + # add processing with on_skipped_request hook return if request.session_id: From 260ad923da62f663485d63770137758285b6b494 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 24 Apr 2025 11:45:54 +0000 Subject: [PATCH 20/20] one lock to rule them all --- src/crawlee/crawlers/_basic/_basic_crawler.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 90800c6c4e..49b28c043e 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -384,7 +384,7 @@ def __init__( # Internal, not explicitly configurable components self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000) - self._robots_txt_locks_cache: LRUCache[str, asyncio.Lock] = LRUCache(maxsize=100) + self._robots_txt_lock = asyncio.Lock() self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name) self._snapshotter = Snapshotter.from_config(config) self._autoscaled_pool = AutoscaledPool( @@ -1333,12 +1333,7 @@ async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: if robots_txt_file: return robots_txt_file - txt_file_lock = self._robots_txt_locks_cache.get(origin_url) - if txt_file_lock is None: - txt_file_lock = asyncio.Lock() - self._robots_txt_locks_cache[origin_url] = txt_file_lock - - async with txt_file_lock: + async with self._robots_txt_lock: # Check again if the robots.txt file is already cached after acquiring the lock robots_txt_file = self._robots_txt_file_cache.get(origin_url) if robots_txt_file:
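
For reference, a minimal standalone sketch of the `RobotsTxtFile` helper introduced by this series, mirroring the assertions in `tests/unit/_utils/test_robots.py`. The `http://example.com` URLs and the inline robots.txt content below are illustrative only, and the class lives in the internal `crawlee._utils.robots` module, so treat this as a sketch of the helper rather than a documented public API:

import asyncio

from crawlee._utils.robots import RobotsTxtFile

# Illustrative robots.txt content, modeled on the fixture used by the unit tests.
ROBOTS_TXT_CONTENT = """\
User-agent: *
Disallow: *deny_all/
crawl-delay: 10

User-agent: Googlebot
Disallow: *deny_googlebot/
"""


async def main() -> None:
    # Parse an already-fetched robots.txt body; no network access is needed here.
    robots = await RobotsTxtFile.from_content('http://example.com/robots.txt', ROBOTS_TXT_CONTENT)

    # URLs on a different origin than the robots.txt file are always reported as allowed.
    print(robots.is_allowed('https://crawlee.dev'))  # True

    # Rules are evaluated per user agent; '*' is the default.
    print(robots.is_allowed('http://example.com/deny_all/page.html'))  # False
    print(robots.is_allowed('http://example.com/deny_googlebot/page.html', 'Googlebot'))  # False

    # Crawl delay and sitemap URLs declared in the file are exposed as well.
    print(robots.get_crawl_delay())  # 10
    print(robots.get_sitemaps())  # []


if __name__ == '__main__':
    asyncio.run(main())

Crawler-level usage is shown by the docs example added in this series: passing `respect_robots_txt_file=True` to a crawler makes it fetch and cache one robots.txt file per origin and skip disallowed URLs both when requests are added via `add_requests` and when links are collected via `extract_links`/`enqueue_links`.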