From 427b00a4adc05b2170269128c12c9612b5553c2f Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 15:43:43 +0000 Subject: [PATCH 01/20] basic_robots_allow --- src/crawlee/_utils/robots.py | 59 +++++++++++++++++++ src/crawlee/crawlers/_basic/_basic_crawler.py | 29 +++++++++ 2 files changed, 88 insertions(+) create mode 100644 src/crawlee/_utils/robots.py diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py new file mode 100644 index 0000000000..d04ae83a0c --- /dev/null +++ b/src/crawlee/_utils/robots.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from asyncio import to_thread +from typing import TYPE_CHECKING +from urllib.robotparser import RobotFileParser + +from yarl import URL + +from crawlee._utils.web import is_status_code_client_error + +if TYPE_CHECKING: + from crawlee.http_clients import HttpClient + from crawlee.proxy_configuration import ProxyInfo + + +class RobotsTxtFile: + def __init__(self, robots: RobotFileParser) -> None: + self._robots = robots + + @staticmethod + async def from_content(url: str, content: str) -> RobotsTxtFile: + robots = RobotFileParser(url=url) + robots.parse(content.splitlines()) + return RobotsTxtFile(robots) + + @staticmethod + async def find( + url: str, proxy_info: ProxyInfo | None = None, http_client: HttpClient | None = None + ) -> RobotsTxtFile: + """Find the robots.txt file for a given URL.""" + robots_url = URL(url).with_path('/robots.txt') + return await RobotsTxtFile.load(str(robots_url), proxy_info, http_client) + + @staticmethod + async def load( + url: str, proxy_info: ProxyInfo | None = None, http_client: HttpClient | None = None + ) -> RobotsTxtFile: + """Load the robots.txt file for a given URL.""" + robots = RobotFileParser(url=url) + if http_client is None: + await to_thread(robots.read) + else: + response = await http_client.send_request(url, proxy_info=proxy_info) + if is_status_code_client_error(response.status_code): + robots.allow_all = True # type: ignore[attr-defined] # allow_all is a valid RobotFileParser + return RobotsTxtFile(robots) + + def is_allowed(self, url: str, user_agent: str = '*') -> bool: + """Check if the given URL is allowed for the given user agent.""" + return self._robots.can_fetch(user_agent, url) + + def get_sitemaps(self) -> list[str]: + """Get the list of sitemaps from the robots.txt file.""" + return self._robots.site_maps() or [] + + def get_crawl_delay(self, user_agent: str = '*') -> int | None: + """Get the crawl delay for the given user agent.""" + crawl_delay = self._robots.crawl_delay(user_agent) + return int(crawl_delay) if crawl_delay is not None else None diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index a196f5e251..295c959f8a 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -17,8 +17,10 @@ from urllib.parse import ParseResult, urlparse from weakref import WeakKeyDictionary +from cachetools import LRUCache from tldextract import TLDExtract from typing_extensions import NotRequired, TypedDict, TypeVar, Unpack, assert_never +from yarl import URL from crawlee import EnqueueStrategy, Glob, service_locator from crawlee._autoscaling import AutoscaledPool, Snapshotter, SystemStatus @@ -32,6 +34,7 @@ SendRequestFunction, ) from crawlee._utils.docs import docs_group +from crawlee._utils.robots import RobotsTxtFile from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute from crawlee._utils.wait import wait_for from 
crawlee._utils.web import is_status_code_client_error, is_status_code_server_error @@ -158,6 +161,9 @@ class _BasicCrawlerOptions(TypedDict): """A logger instance, typically provided by a subclass, for consistent logging labels. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.""" + respect_robots_txt_file: NotRequired[bool] + """""" + class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict): """Generic options the `BasicCrawler` constructor.""" @@ -238,6 +244,7 @@ def __init__( keep_alive: bool = False, configure_logging: bool = True, statistics_log_format: Literal['table', 'inline'] = 'table', + respect_robots_txt_file: bool = False, _context_pipeline: ContextPipeline[TCrawlingContext] | None = None, _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None, _logger: logging.Logger | None = None, @@ -280,6 +287,7 @@ def __init__( configure_logging: If True, the crawler will set up logging infrastructure automatically. statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain text log messages. + respect_robots_txt_file: If True, the crawler will respect the robots.txt file of the target website. _context_pipeline: Enables extending the request lifecycle and modifying the crawling context. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`. _additional_context_managers: Additional context managers used throughout the crawler lifecycle. @@ -335,6 +343,7 @@ def __init__( self._max_requests_per_crawl = max_requests_per_crawl self._max_session_rotations = max_session_rotations self._max_crawl_depth = max_crawl_depth + self._respect_robots_txt_file = respect_robots_txt_file # Timeouts self._request_handler_timeout = request_handler_timeout @@ -371,6 +380,7 @@ def __init__( self._additional_context_managers = _additional_context_managers or [] # Internal, not explicitly configurable components + self.robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000) self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name) self._snapshotter = Snapshotter.from_config(config) self._autoscaled_pool = AutoscaledPool( @@ -1265,3 +1275,22 @@ def _check_request_collision(self, request: Request, session: Session | None) -> raise RequestCollisionError( f'The Session (id: {request.session_id}) bound to the Request is no longer available in SessionPool' ) + + async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool: + """Check if the URL is allowed based on the robots.txt file.""" + if not self._respect_robots_txt_file: + return True + robots_txt_file = await self._get_robots_txt_file_for_url(url) + return not robots_txt_file or robots_txt_file.is_allowed(url) + + async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: + """Get the RobotsTxtFile for a given URL.""" + if not self._respect_robots_txt_file: + return None + origin_url = str(URL(url).origin()) + robots_txt_file = self.robots_txt_file_cache[origin_url] + if robots_txt_file: + return robots_txt_file + robots_txt_file = await RobotsTxtFile.find(url, None, self._http_client) + self.robots_txt_file_cache[origin_url] = robots_txt_file + return robots_txt_file From 638b5be245851b91fd578dacd06ebb6c9d92e826 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 18:59:27 +0000 Subject: [PATCH 02/20] add respect robots_txt_file --- .../_abstract_http/_abstract_http_crawler.py | 7 
+++++ src/crawlee/crawlers/_basic/_basic_crawler.py | 31 ++++++++++++++++++- .../_playwright/_playwright_crawler.py | 7 +++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 04e16683f6..1ca3600542 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -159,12 +159,19 @@ async def extract_links( requests = list[Request]() base_user_data = user_data or {} + robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url) + for link in self._parser.find_links(parsed_content, selector=selector): url = link if not is_url_absolute(url): base_url = context.request.loaded_url or context.request.url url = convert_to_absolute_url(base_url, url) + if robots_txt_file and not robots_txt_file.is_allowed(url): + # add processing with on_skiped_request callback or handler? + context.log.warning(f'Skipping URL "{url}" due to robots.txt rules.') + continue + request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label) if transform_request_function: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 295c959f8a..89c96d8ca5 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -655,10 +655,24 @@ async def add_requests( wait_for_all_requests_to_be_added: If True, wait for all requests to be added before returning. wait_for_all_requests_to_be_added_timeout: Timeout for waiting for all requests to be added. """ + allowed_requests = [] + skipped = [] + + for request in requests: + check_url = request.url if isinstance(request, Request) else request + if await self._is_allowed_based_on_robots_txt_file(check_url): + allowed_requests.append(request) + else: + skipped.append(request) + + if skipped: + # add processing with on_skiped_request callback or handler? + self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file') + request_manager = await self.get_request_manager() await request_manager.add_requests_batched( - requests=requests, + requests=allowed_requests, batch_size=batch_size, wait_time_between_batches=wait_time_between_batches, wait_for_all_requests_to_be_added=wait_for_all_requests_to_be_added, @@ -1090,6 +1104,21 @@ async def __run_task_function(self) -> None: if request is None: return + if not (await self._is_allowed_based_on_robots_txt_file(request.url)): + self._logger.warning( + f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt' + ) + await wait_for( + lambda: request_manager.mark_request_as_handled(request), + timeout=self._internal_timeout, + timeout_message='Marking request as handled timed out after ' + f'{self._internal_timeout.total_seconds()} seconds', + logger=self._logger, + max_retries=3, + ) + # add processing with on_skiped_request callback or handler? 
+ return + if request.session_id: session = await self._get_session_by_id(request.session_id) else: diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 468ce01e02..1f2920eb57 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -290,6 +290,8 @@ async def extract_links( elements = await context.page.query_selector_all(selector) + robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url) + for element in elements: url = await element.get_attribute('href') @@ -300,6 +302,11 @@ async def extract_links( base_url = context.request.loaded_url or context.request.url url = convert_to_absolute_url(base_url, url) + if robots_txt_file and not robots_txt_file.is_allowed(url): + # add processing with on_skiped_request callback or handler? + context.log.warning(f'Skipping URL "{url}" due to robots.txt rules.') + continue + request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label}) if transform_request_function: From 33be1c843bd0e5f9a65388e35f6bbf57dfd3ee28 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 19:31:15 +0000 Subject: [PATCH 03/20] update load --- src/crawlee/_utils/robots.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index d04ae83a0c..076bf31202 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -43,6 +43,8 @@ async def load( response = await http_client.send_request(url, proxy_info=proxy_info) if is_status_code_client_error(response.status_code): robots.allow_all = True # type: ignore[attr-defined] # allow_all is a valid RobotFileParser + body = response.read() + robots.parse(body.decode('utf-8').splitlines()) return RobotsTxtFile(robots) def is_allowed(self, url: str, user_agent: str = '*') -> bool: From a44dff13e941909002cc07675f1338f809bfb627 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 22:12:48 +0000 Subject: [PATCH 04/20] change `RobotFileParser` to `Protego` --- pyproject.toml | 1 + src/crawlee/_utils/robots.py | 44 ++++++++----------- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- uv.lock | 11 +++++ 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 96db0b8cd0..c87a9dc4b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ "eval-type-backport>=0.2.0", "httpx[brotli,http2,zstd]>=0.27.0", "more-itertools>=10.2.0", + "protego>=0.4.0", "psutil>=6.0.0", "pydantic-settings>=2.2.0,<2.7.0", "pydantic>=2.8.0,!=2.10.0,!=2.10.1,!=2.10.2", diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index 076bf31202..a3ccdd3810 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -1,9 +1,8 @@ from __future__ import annotations -from asyncio import to_thread from typing import TYPE_CHECKING -from urllib.robotparser import RobotFileParser +from protego import Protego # type: ignore[import-untyped] from yarl import URL from crawlee._utils.web import is_status_code_client_error @@ -14,46 +13,41 @@ class RobotsTxtFile: - def __init__(self, robots: RobotFileParser) -> None: + def __init__(self, url: str, robots: Protego) -> None: self._robots = robots + self._original_url = URL(url).origin() @staticmethod async def from_content(url: str, content: str) -> RobotsTxtFile: - robots = RobotFileParser(url=url) - 
robots.parse(content.splitlines()) - return RobotsTxtFile(robots) + robots = Protego.parse(content) + return RobotsTxtFile(url, robots) @staticmethod - async def find( - url: str, proxy_info: ProxyInfo | None = None, http_client: HttpClient | None = None - ) -> RobotsTxtFile: + async def find(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> RobotsTxtFile: """Find the robots.txt file for a given URL.""" robots_url = URL(url).with_path('/robots.txt') - return await RobotsTxtFile.load(str(robots_url), proxy_info, http_client) + return await RobotsTxtFile.load(str(robots_url), http_client, proxy_info) @staticmethod - async def load( - url: str, proxy_info: ProxyInfo | None = None, http_client: HttpClient | None = None - ) -> RobotsTxtFile: + async def load(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> RobotsTxtFile: """Load the robots.txt file for a given URL.""" - robots = RobotFileParser(url=url) - if http_client is None: - await to_thread(robots.read) - else: - response = await http_client.send_request(url, proxy_info=proxy_info) - if is_status_code_client_error(response.status_code): - robots.allow_all = True # type: ignore[attr-defined] # allow_all is a valid RobotFileParser - body = response.read() - robots.parse(body.decode('utf-8').splitlines()) - return RobotsTxtFile(robots) + response = await http_client.send_request(url, proxy_info=proxy_info) + body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read() + + robots = Protego.parse(body.decode('utf-8')) + + return RobotsTxtFile(url, robots) def is_allowed(self, url: str, user_agent: str = '*') -> bool: """Check if the given URL is allowed for the given user agent.""" - return self._robots.can_fetch(user_agent, url) + check_url = URL(url) + if check_url.origin() != self._original_url: + return True + return bool(self._robots.can_fetch(str(check_url), user_agent)) def get_sitemaps(self) -> list[str]: """Get the list of sitemaps from the robots.txt file.""" - return self._robots.site_maps() or [] + return list(self._robots.sitemaps) def get_crawl_delay(self, user_agent: str = '*') -> int | None: """Get the crawl delay for the given user agent.""" diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 89c96d8ca5..03fb78bb54 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1320,6 +1320,6 @@ async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: robots_txt_file = self.robots_txt_file_cache[origin_url] if robots_txt_file: return robots_txt_file - robots_txt_file = await RobotsTxtFile.find(url, None, self._http_client) + robots_txt_file = await RobotsTxtFile.find(url, self._http_client) self.robots_txt_file_cache[origin_url] = robots_txt_file return robots_txt_file diff --git a/uv.lock b/uv.lock index c28568f5ae..a29bda7509 100644 --- a/uv.lock +++ b/uv.lock @@ -610,6 +610,7 @@ dependencies = [ { name = "eval-type-backport" }, { name = "httpx", extra = ["brotli", "http2", "zstd"] }, { name = "more-itertools" }, + { name = "protego" }, { name = "psutil" }, { name = "pydantic" }, { name = "pydantic-settings" }, @@ -711,6 +712,7 @@ requires-dist = [ { name = "playwright", marker = "extra == 'adaptive-crawler'", specifier = ">=1.27.0" }, { name = "playwright", marker = "extra == 'all'", specifier = ">=1.27.0" }, { name = "playwright", marker = "extra == 'playwright'", specifier = ">=1.27.0" }, + { 
name = "protego", specifier = ">=0.4.0" }, { name = "psutil", specifier = ">=6.0.0" }, { name = "pydantic", specifier = ">=2.8.0,!=2.10.0,!=2.10.1,!=2.10.2" }, { name = "pydantic-settings", specifier = ">=2.2.0,<2.7.0" }, @@ -1938,6 +1940,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b8/d3/c3cb8f1d6ae3b37f83e1de806713a9b3642c5895f0215a62e1a4bd6e5e34/propcache-0.3.1-py3-none-any.whl", hash = "sha256:9a8ecf38de50a7f518c21568c80f985e776397b902f1ce0b01f799aba1608b40", size = 12376 }, ] +[[package]] +name = "protego" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4e/6b/84e878d0567dfc11538bad6ce2595cee7ae0c47cf6bf7293683c9ec78ef8/protego-0.4.0.tar.gz", hash = "sha256:93a5e662b61399a0e1f208a324f2c6ea95b23ee39e6cbf2c96246da4a656c2f6", size = 3246425 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/fd/8d84d75832b0983cecf3aff7ae48362fe96fc8ab6ebca9dcf3cefd87e79c/Protego-0.4.0-py2.py3-none-any.whl", hash = "sha256:37640bc0ebe37572d624453a21381d05e9d86e44f89ff1e81794d185a0491666", size = 8553 }, +] + [[package]] name = "proxy-py" version = "2.4.10" From 538672e386af059cd3492dfd92f733664a6da985 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 22:13:15 +0000 Subject: [PATCH 05/20] add tests --- tests/unit/_utils/test_robots.py | 49 ++++++++++++++++++++++++++++++++ tests/unit/server.py | 24 ++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 tests/unit/_utils/test_robots.py diff --git a/tests/unit/_utils/test_robots.py b/tests/unit/_utils/test_robots.py new file mode 100644 index 0000000000..3e410b503c --- /dev/null +++ b/tests/unit/_utils/test_robots.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from crawlee._utils.robots import RobotsTxtFile + +if TYPE_CHECKING: + from yarl import URL + + from crawlee.http_clients._base import HttpClient + + +async def test_generation_robots_txt_url(server_url: URL, http_client: HttpClient) -> None: + robots_file = await RobotsTxtFile.find(str(server_url), http_client) + assert len(robots_file.get_sitemaps()) > 0 + + +async def test_allow_disallow_robots_txt(server_url: URL, http_client: HttpClient) -> None: + robots = await RobotsTxtFile.find(str(server_url), http_client) + assert robots.is_allowed('https://crawlee.dev') + assert robots.is_allowed(str(server_url / 'something/page.html')) + assert robots.is_allowed(str(server_url / 'deny_googlebot/page.html')) + assert not robots.is_allowed(str(server_url / 'deny_all/page.html')) + + +async def test_extract_sitemaps_urls(server_url: URL, http_client: HttpClient) -> None: + robots = await RobotsTxtFile.find(str(server_url), http_client) + assert len(robots.get_sitemaps()) == 2 + assert set(robots.get_sitemaps()) == {'http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml'} + + +async def test_parse_from_context() -> None: + content = """User-agent: * + Disallow: *deny_all/ + crawl-delay: 10 + User-agent: Googlebot + Disallow: *deny_googlebot/""" + robots = await RobotsTxtFile.from_content('http://not-exists.com/robots.txt', content) + assert robots.is_allowed('http://not-exists.com/something/page.html') + assert robots.is_allowed('http://not-exists.com/deny_googlebot/page.html') + assert not robots.is_allowed('http://not-exists.com/deny_googlebot/page.html', 'Googlebot') + assert not robots.is_allowed('http://not-exists.com/deny_all/page.html') + + +async def test_bind_robots_txt_url() -> None: + 
content = 'User-agent: *\nDisallow: /' + robots = await RobotsTxtFile.from_content('http://check.com/robots.txt', content) + assert not robots.is_allowed('http://check.com/test.html') + assert robots.is_allowed('http://othercheck.com/robots.txt') diff --git a/tests/unit/server.py b/tests/unit/server.py index 29e789d013..c180bc3fc0 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -120,6 +120,8 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None: await hello_world_json(send) elif path.startswith('/xml'): await hello_world_xml(send) + elif path.startswith('/robots.txt'): + await robots_txt(send) else: await hello_world(send) @@ -366,6 +368,28 @@ async def dynamic_content(scope: dict[str, Any], send: Send) -> None: await send_html_response(send, html_content=content.encode()) +async def robots_txt(send: Send) -> None: + """Handle requests for the robots.txt file.""" + body = b'\n'.join( + [ + b'User-agent: *', + b'Disallow: *deny_all/', + b'crawl-delay: 10', + b'', + b'User-agent: Googlebot', + b'Disallow: *deny_googlebot/', + b'crawl-delay: 1', + b'', + b'user-agent: Mozilla', + b'crawl-delay: 2', + b'', + b'sitemap: http://not-exists.com/sitemap_1.xml', + b'sitemap: http://not-exists.com/sitemap_2.xml', + ] + ) + await send_html_response(send, body) + + class TestServer(Server): """A test HTTP server implementation based on Uvicorn Server.""" From b9b35be035a440824e38a099f87237c48de3d990 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 22:51:14 +0000 Subject: [PATCH 06/20] fix --- src/crawlee/crawlers/_basic/_basic_crawler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 03fb78bb54..5ad104f57d 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -380,7 +380,7 @@ def __init__( self._additional_context_managers = _additional_context_managers or [] # Internal, not explicitly configurable components - self.robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000) + self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000) self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name) self._snapshotter = Snapshotter.from_config(config) self._autoscaled_pool = AutoscaledPool( @@ -1317,9 +1317,9 @@ async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: if not self._respect_robots_txt_file: return None origin_url = str(URL(url).origin()) - robots_txt_file = self.robots_txt_file_cache[origin_url] + robots_txt_file = self._robots_txt_file_cache.get(origin_url) if robots_txt_file: return robots_txt_file robots_txt_file = await RobotsTxtFile.find(url, self._http_client) - self.robots_txt_file_cache[origin_url] = robots_txt_file + self._robots_txt_file_cache[origin_url] = robots_txt_file return robots_txt_file From a49ab66d4e33427df7887870b41453cf4584e75a Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 22:51:48 +0000 Subject: [PATCH 07/20] update tests --- .../test_beautifulsoup_crawler.py | 18 ++++++++++++ .../crawlers/_parsel/test_parsel_crawler.py | 18 ++++++++++++ .../_playwright/test_playwright_crawler.py | 18 ++++++++++++ tests/unit/server.py | 28 ++++++------------- tests/unit/server_endpoints.py | 17 +++++++++++ 5 files changed, 80 insertions(+), 19 deletions(-) diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py 
b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index 167391dc6f..b73ea4aeaa 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -142,3 +142,21 @@ async def test_handle_blocked_request(server_url: URL, http_client: HttpClient) def test_default_logger() -> None: assert BeautifulSoupCrawler().log.name == 'BeautifulSoupCrawler' + + +async def test_respect_robots_txt(server_url: URL, http_client: HttpClient) -> None: + crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True) + visit = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links() + + await crawler.run([str(server_url / 'start_enqueue')]) + visited = {call[0][0] for call in visit.call_args_list} + + assert visited == { + str(server_url / 'start_enqueue'), + str(server_url / 'sub_index'), + } diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index 477c091050..586962eac7 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -239,3 +239,21 @@ async def request_handler(context: ParselCrawlingContext) -> None: def test_default_logger() -> None: assert ParselCrawler().log.name == 'ParselCrawler' + + +async def test_respect_robots_txt(server_url: URL, http_client: HttpClient) -> None: + crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True) + visit = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links() + + await crawler.run([str(server_url / 'start_enqueue')]) + visited = {call[0][0] for call in visit.call_args_list} + + assert visited == { + str(server_url / 'start_enqueue'), + str(server_url / 'sub_index'), + } diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 9123c30904..ec727d3ad8 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -563,3 +563,21 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: assert crawler.statistics.error_tracker.total == 3 * max_retries assert crawler.statistics.error_tracker.unique_error_count == 2 assert len(kvs_content) == 4 + + +async def test_respect_robots_txt(server_url: URL) -> None: + crawler = PlaywrightCrawler(respect_robots_txt_file=True) + visit = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + visit(context.request.url) + await context.enqueue_links() + + await crawler.run([str(server_url / 'start_enqueue')]) + visited = {call[0][0] for call in visit.call_args_list} + + assert visited == { + str(server_url / 'start_enqueue'), + str(server_url / 'sub_index'), + } diff --git a/tests/unit/server.py b/tests/unit/server.py index c180bc3fc0..21ba01cec8 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -11,7 +11,14 @@ from uvicorn.server import Server from yarl import URL -from tests.unit.server_endpoints import GENERIC_RESPONSE, HELLO_WORLD, INCAPSULA, SECONDARY_INDEX, START_ENQUEUE +from tests.unit.server_endpoints import ( + GENERIC_RESPONSE, + HELLO_WORLD, + 
INCAPSULA, + ROBOTS_TXT, + SECONDARY_INDEX, + START_ENQUEUE, +) if TYPE_CHECKING: from socket import socket @@ -370,24 +377,7 @@ async def dynamic_content(scope: dict[str, Any], send: Send) -> None: async def robots_txt(send: Send) -> None: """Handle requests for the robots.txt file.""" - body = b'\n'.join( - [ - b'User-agent: *', - b'Disallow: *deny_all/', - b'crawl-delay: 10', - b'', - b'User-agent: Googlebot', - b'Disallow: *deny_googlebot/', - b'crawl-delay: 1', - b'', - b'user-agent: Mozilla', - b'crawl-delay: 2', - b'', - b'sitemap: http://not-exists.com/sitemap_1.xml', - b'sitemap: http://not-exists.com/sitemap_2.xml', - ] - ) - await send_html_response(send, body) + await send_html_response(send, ROBOTS_TXT) class TestServer(Server): diff --git a/tests/unit/server_endpoints.py b/tests/unit/server_endpoints.py index 00456d3dcd..a9f48e6e47 100644 --- a/tests/unit/server_endpoints.py +++ b/tests/unit/server_endpoints.py @@ -41,3 +41,20 @@ Insightful content """ + + +ROBOTS_TXT = b"""\ +User-agent: * +Disallow: *deny_all/ +Disallow: /page_ +crawl-delay: 10 + +User-agent: Googlebot +Disallow: *deny_googlebot/ +crawl-delay: 1 + +user-agent: Mozilla +crawl-delay: 2 + +sitemap: http://not-exists.com/sitemap_1.xml +sitemap: http://not-exists.com/sitemap_2.xml""" From 46a2356b6f78fa54d27af57453e6c5b4bfd3680a Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 23:02:50 +0000 Subject: [PATCH 08/20] update TODO comments --- .../crawlers/_abstract_http/_abstract_http_crawler.py | 3 ++- src/crawlee/crawlers/_basic/_basic_crawler.py | 6 ++++-- src/crawlee/crawlers/_playwright/_playwright_crawler.py | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 1ca3600542..664fe81ba8 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -168,7 +168,8 @@ async def extract_links( url = convert_to_absolute_url(base_url, url) if robots_txt_file and not robots_txt_file.is_allowed(url): - # add processing with on_skiped_request callback or handler? + # TODO: https://github.com/apify/crawlee-python/issues/1160 + # add processing with on_skiped_request hook context.log.warning(f'Skipping URL "{url}" due to robots.txt rules.') continue diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 5ad104f57d..d1631da1ad 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -666,7 +666,8 @@ async def add_requests( skipped.append(request) if skipped: - # add processing with on_skiped_request callback or handler? + # TODO: https://github.com/apify/crawlee-python/issues/1160 + # add processing with on_skiped_request hook self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file') request_manager = await self.get_request_manager() @@ -1116,7 +1117,8 @@ async def __run_task_function(self) -> None: logger=self._logger, max_retries=3, ) - # add processing with on_skiped_request callback or handler? 
+ # TODO: https://github.com/apify/crawlee-python/issues/1160 + # add processing with on_skiped_request hook return if request.session_id: diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 1f2920eb57..7dff289142 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -303,7 +303,8 @@ async def extract_links( url = convert_to_absolute_url(base_url, url) if robots_txt_file and not robots_txt_file.is_allowed(url): - # add processing with on_skiped_request callback or handler? + # TODO: https://github.com/apify/crawlee-python/issues/1160 + # add processing with on_skiped_request hook context.log.warning(f'Skipping URL "{url}" due to robots.txt rules.') continue From 10077b636df40cea91250f37dcbf2a057e00f2ec Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 23:23:13 +0000 Subject: [PATCH 09/20] update docstrings --- src/crawlee/_utils/robots.py | 37 ++++++++++++++++--- src/crawlee/crawlers/_basic/_basic_crawler.py | 15 ++++++-- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index a3ccdd3810..70a3458b94 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -19,18 +19,36 @@ def __init__(self, url: str, robots: Protego) -> None: @staticmethod async def from_content(url: str, content: str) -> RobotsTxtFile: + """Create a RobotsTxtFile instance from the given content. + + Args: + url: the URL of the robots.txt file + content: the content of the robots.txt file + """ robots = Protego.parse(content) return RobotsTxtFile(url, robots) @staticmethod async def find(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> RobotsTxtFile: - """Find the robots.txt file for a given URL.""" + """Determine the location of a robots.txt file for a URL and fetch it. + + Args: + url: the URL to fetch robots.txt for + proxy_info: a `ProxyInfo` to be used for fetching the robots.txt file + http_client: the HTTP client to use for fetching the robots.txt file + """ robots_url = URL(url).with_path('/robots.txt') return await RobotsTxtFile.load(str(robots_url), http_client, proxy_info) @staticmethod async def load(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> RobotsTxtFile: - """Load the robots.txt file for a given URL.""" + """Load the robots.txt file for a given URL. + + Args: + url: the URL to fetch robots.txt for + proxy_info: a `ProxyInfo` to be used for fetching the robots.txt file + http_client: the HTTP client to use for fetching the robots.txt file + """ response = await http_client.send_request(url, proxy_info=proxy_info) body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read() @@ -39,17 +57,26 @@ async def load(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = return RobotsTxtFile(url, robots) def is_allowed(self, url: str, user_agent: str = '*') -> bool: - """Check if the given URL is allowed for the given user agent.""" + """Check if the given URL is allowed for the given user agent. 
+ + Args: + url: the URL to check + user_agent: the user agent to check for + """ check_url = URL(url) if check_url.origin() != self._original_url: return True return bool(self._robots.can_fetch(str(check_url), user_agent)) def get_sitemaps(self) -> list[str]: - """Get the list of sitemaps from the robots.txt file.""" + """Get the list of sitemaps urls from the robots.txt file.""" return list(self._robots.sitemaps) def get_crawl_delay(self, user_agent: str = '*') -> int | None: - """Get the crawl delay for the given user agent.""" + """Get the crawl delay for the given user agent. + + Args: + user_agent: the user-agent to check for + """ crawl_delay = self._robots.crawl_delay(user_agent) return int(crawl_delay) if crawl_delay is not None else None diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index d1631da1ad..3019425d3c 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -162,7 +162,8 @@ class _BasicCrawlerOptions(TypedDict): subclasses rather than direct instantiation of `BasicCrawler`.""" respect_robots_txt_file: NotRequired[bool] - """""" + """If set to `true`, the crawler will automatically try to fetch the robots.txt file for each domain, + and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`.""" class _BasicCrawlerOptionsGeneric(Generic[TCrawlingContext, TStatisticsState], TypedDict): @@ -1308,14 +1309,22 @@ def _check_request_collision(self, request: Request, session: Session | None) -> ) async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool: - """Check if the URL is allowed based on the robots.txt file.""" + """Check if the URL is allowed based on the robots.txt file. + + Args: + url: The URL to check. + """ if not self._respect_robots_txt_file: return True robots_txt_file = await self._get_robots_txt_file_for_url(url) return not robots_txt_file or robots_txt_file.is_allowed(url) async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: - """Get the RobotsTxtFile for a given URL.""" + """Get the RobotsTxtFile for a given URL. + + Args: + url: URL on the basis of which the RobotsTxtFile will be obtained. 
+ """ if not self._respect_robots_txt_file: return None origin_url = str(URL(url).origin()) From b3e97890a4515ffe58a2e6da4611fe9debceea79 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Fri, 18 Apr 2025 02:44:43 +0300 Subject: [PATCH 10/20] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py | 3 +-- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- src/crawlee/crawlers/_playwright/_playwright_crawler.py | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 664fe81ba8..9abcb4c6f5 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -169,8 +169,7 @@ async def extract_links( if robots_txt_file and not robots_txt_file.is_allowed(url): # TODO: https://github.com/apify/crawlee-python/issues/1160 - # add processing with on_skiped_request hook - context.log.warning(f'Skipping URL "{url}" due to robots.txt rules.') + # add processing with on_skipped_request hook continue request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 3019425d3c..9851b1771b 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -668,7 +668,7 @@ async def add_requests( if skipped: # TODO: https://github.com/apify/crawlee-python/issues/1160 - # add processing with on_skiped_request hook + # add processing with on_skipped_request hook self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file') request_manager = await self.get_request_manager() diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 7dff289142..8055ebc5e7 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -304,8 +304,7 @@ async def extract_links( if robots_txt_file and not robots_txt_file.is_allowed(url): # TODO: https://github.com/apify/crawlee-python/issues/1160 - # add processing with on_skiped_request hook - context.log.warning(f'Skipping URL "{url}" due to robots.txt rules.') + # add processing with on_skipped_request hook continue request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label}) From 4f4529e1bec25b89a8f0f8baf0c996cc8c2b7e34 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Wed, 23 Apr 2025 14:40:26 +0300 Subject: [PATCH 11/20] Update src/crawlee/crawlers/_basic/_basic_crawler.py Co-authored-by: Jan Buchar --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 9851b1771b..f4fae3cc84 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -162,7 +162,7 @@ class _BasicCrawlerOptions(TypedDict): subclasses rather than direct instantiation of `BasicCrawler`.""" respect_robots_txt_file: NotRequired[bool] - """If set to `true`, the crawler will 
automatically try to fetch the robots.txt file for each domain, + """If set to `True`, the crawler will automatically try to fetch the robots.txt file for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added via `EnqueueLinksFunction`.""" From 8973618554328864e34a5a26860e0a8a71d44daf Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 17 Apr 2025 23:27:18 +0000 Subject: [PATCH 12/20] fix docstrings --- src/crawlee/crawlers/_basic/_basic_crawler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index f4fae3cc84..60f2fdbe61 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -288,7 +288,9 @@ def __init__( configure_logging: If True, the crawler will set up logging infrastructure automatically. statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain text log messages. - respect_robots_txt_file: If True, the crawler will respect the robots.txt file of the target website. + respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file + for each domain, and skip those that are not allowed. This also prevents disallowed URLs to be added + via `EnqueueLinksFunction` _context_pipeline: Enables extending the request lifecycle and modifying the crawling context. Intended for use by subclasses rather than direct instantiation of `BasicCrawler`. _additional_context_managers: Additional context managers used throughout the crawler lifecycle. From 73a7bc6f2a90b000a6c5774eb629b18b219c0c99 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 23 Apr 2025 11:48:45 +0000 Subject: [PATCH 13/20] change staticmethod to classmethod --- src/crawlee/_utils/robots.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index 70a3458b94..f83666d66c 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -8,6 +8,8 @@ from crawlee._utils.web import is_status_code_client_error if TYPE_CHECKING: + from typing_extensions import Self + from crawlee.http_clients import HttpClient from crawlee.proxy_configuration import ProxyInfo @@ -17,8 +19,8 @@ def __init__(self, url: str, robots: Protego) -> None: self._robots = robots self._original_url = URL(url).origin() - @staticmethod - async def from_content(url: str, content: str) -> RobotsTxtFile: + @classmethod + async def from_content(cls, url: str, content: str) -> Self: """Create a RobotsTxtFile instance from the given content. Args: @@ -26,10 +28,10 @@ async def from_content(url: str, content: str) -> RobotsTxtFile: content: the content of the robots.txt file """ robots = Protego.parse(content) - return RobotsTxtFile(url, robots) + return cls(url, robots) - @staticmethod - async def find(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> RobotsTxtFile: + @classmethod + async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self: """Determine the location of a robots.txt file for a URL and fetch it. 
Args: @@ -38,10 +40,10 @@ async def find(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = http_client: the HTTP client to use for fetching the robots.txt file """ robots_url = URL(url).with_path('/robots.txt') - return await RobotsTxtFile.load(str(robots_url), http_client, proxy_info) + return await cls.load(str(robots_url), http_client, proxy_info) - @staticmethod - async def load(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> RobotsTxtFile: + @classmethod + async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = None) -> Self: """Load the robots.txt file for a given URL. Args: @@ -54,7 +56,7 @@ async def load(url: str, http_client: HttpClient, proxy_info: ProxyInfo | None = robots = Protego.parse(body.decode('utf-8')) - return RobotsTxtFile(url, robots) + return cls(url, robots) def is_allowed(self, url: str, user_agent: str = '*') -> bool: """Check if the given URL is allowed for the given user agent. From 8039fb5aaf6a8940e20323fc669c373c60a1fed1 Mon Sep 17 00:00:00 2001 From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com> Date: Wed, 23 Apr 2025 22:40:52 +0300 Subject: [PATCH 14/20] Update src/crawlee/_utils/robots.py Co-authored-by: Vlada Dusek --- src/crawlee/_utils/robots.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index f83666d66c..41743a756b 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -21,7 +21,7 @@ def __init__(self, url: str, robots: Protego) -> None: @classmethod async def from_content(cls, url: str, content: str) -> Self: - """Create a RobotsTxtFile instance from the given content. + """Create a `RobotsTxtFile` instance from the given content. 
Args: url: the URL of the robots.txt file From 125804c513a2b4123371a9e85d79b81580ce444d Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 23 Apr 2025 20:59:58 +0000 Subject: [PATCH 15/20] add _robots_txt_locks_cache --- src/crawlee/crawlers/_basic/_basic_crawler.py | 20 ++++++++++++++++--- tests/unit/_utils/test_robots.py | 2 +- .../crawlers/_basic/test_basic_crawler.py | 15 +++++++++++++- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 60f2fdbe61..9833015d37 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -384,6 +384,7 @@ def __init__( # Internal, not explicitly configurable components self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000) + self._robots_txt_locks_cache: LRUCache[str, asyncio.Lock] = LRUCache(maxsize=100) self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name) self._snapshotter = Snapshotter.from_config(config) self._autoscaled_pool = AutoscaledPool( @@ -1333,6 +1334,19 @@ async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: robots_txt_file = self._robots_txt_file_cache.get(origin_url) if robots_txt_file: return robots_txt_file - robots_txt_file = await RobotsTxtFile.find(url, self._http_client) - self._robots_txt_file_cache[origin_url] = robots_txt_file - return robots_txt_file + + txt_file_lock = self._robots_txt_locks_cache.get(origin_url) + if txt_file_lock is None: + txt_file_lock = asyncio.Lock() + self._robots_txt_locks_cache[origin_url] = txt_file_lock + + async with txt_file_lock: + # Check again if the robots.txt file is already cached after acquiring the lock + robots_txt_file = self._robots_txt_file_cache.get(origin_url) + if robots_txt_file: + return robots_txt_file + + # If not cached, fetch the robots.txt file + robots_txt_file = await RobotsTxtFile.find(url, self._http_client) + self._robots_txt_file_cache[origin_url] = robots_txt_file + return robots_txt_file diff --git a/tests/unit/_utils/test_robots.py b/tests/unit/_utils/test_robots.py index 3e410b503c..61dc60daa5 100644 --- a/tests/unit/_utils/test_robots.py +++ b/tests/unit/_utils/test_robots.py @@ -29,7 +29,7 @@ async def test_extract_sitemaps_urls(server_url: URL, http_client: HttpClient) - assert set(robots.get_sitemaps()) == {'http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml'} -async def test_parse_from_context() -> None: +async def test_parse_from_content() -> None: content = """User-agent: * Disallow: *deny_all/ crawl-delay: 10 diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index e0e2c84f60..f7bf678fd2 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -12,13 +12,14 @@ from datetime import timedelta from pathlib import Path from typing import TYPE_CHECKING, Any, Literal, cast -from unittest.mock import AsyncMock, Mock, call +from unittest.mock import AsyncMock, Mock, call, patch import pytest from crawlee import ConcurrencySettings, Glob, service_locator from crawlee._request import Request from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpHeaders +from crawlee._utils.robots import RobotsTxtFile from crawlee.configuration import Configuration from crawlee.crawlers import BasicCrawler from crawlee.errors import RequestCollisionError, SessionError, 
UserDefinedErrorHandlerError @@ -1291,3 +1292,15 @@ async def error_req_hook(context: BasicCrawlingContext, error: Exception) -> Non await crawler.run(requests) assert error_handler_mock.call_count == 1 + + +async def test_lock_with_get_robots_txt_file_for_url(server_url: URL) -> None: + crawler = BasicCrawler(respect_robots_txt_file=True) + + with patch('crawlee.crawlers._basic._basic_crawler.RobotsTxtFile.find', wraps=RobotsTxtFile.find) as spy: + await asyncio.gather( + *[asyncio.create_task(crawler._get_robots_txt_file_for_url(str(server_url))) for _ in range(10)] + ) + + # Check that the lock was acquired only once + assert spy.call_count == 1 From 4b7346b715a9c1837e1f731b65e657192d9f9d6e Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 23 Apr 2025 21:21:46 +0000 Subject: [PATCH 16/20] update `pyproject.toml` --- pyproject.toml | 2 ++ src/crawlee/_utils/robots.py | 2 +- src/crawlee/storage_clients/_memory/_request_queue_client.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c87a9dc4b9..b014038131 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -237,7 +237,9 @@ module = [ "functions_framework", # Example code shows deploy on Google Cloud. "jaro", # Untyped and stubs not available "loguru", # Example code shows integration of loguru and crawlee for JSON logging. + "protego", # Untyped and stubs not available "sklearn.linear_model", # Untyped and stubs not available + "sortedcollections", # Untyped and stubs not available "cookiecutter.*", # Untyped and stubs not available "inquirer.*", # Untyped and stubs not available ] diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index 41743a756b..01349b49ff 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING -from protego import Protego # type: ignore[import-untyped] +from protego import Protego from yarl import URL from crawlee._utils.web import is_status_code_client_error diff --git a/src/crawlee/storage_clients/_memory/_request_queue_client.py b/src/crawlee/storage_clients/_memory/_request_queue_client.py index 0031e54abd..477d53df07 100644 --- a/src/crawlee/storage_clients/_memory/_request_queue_client.py +++ b/src/crawlee/storage_clients/_memory/_request_queue_client.py @@ -8,7 +8,7 @@ from logging import getLogger from typing import TYPE_CHECKING -from sortedcollections import ValueSortedDict # type: ignore[import-untyped] +from sortedcollections import ValueSortedDict from typing_extensions import override from crawlee._types import StorageTypes From e6099edf0f98eb6750aeae5f52431a495a2435bb Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Wed, 23 Apr 2025 22:00:21 +0000 Subject: [PATCH 17/20] update docstrings --- src/crawlee/_utils/robots.py | 23 ++++++++++--------- src/crawlee/crawlers/_basic/_basic_crawler.py | 4 ++-- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/crawlee/_utils/robots.py b/src/crawlee/_utils/robots.py index 01349b49ff..930ae09431 100644 --- a/src/crawlee/_utils/robots.py +++ b/src/crawlee/_utils/robots.py @@ -24,8 +24,8 @@ async def from_content(cls, url: str, content: str) -> Self: """Create a `RobotsTxtFile` instance from the given content. Args: - url: the URL of the robots.txt file - content: the content of the robots.txt file + url: The URL associated with the robots.txt file. + content: The raw string content of the robots.txt file to be parsed. 
""" robots = Protego.parse(content) return cls(url, robots) @@ -35,9 +35,9 @@ async def find(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N """Determine the location of a robots.txt file for a URL and fetch it. Args: - url: the URL to fetch robots.txt for - proxy_info: a `ProxyInfo` to be used for fetching the robots.txt file - http_client: the HTTP client to use for fetching the robots.txt file + url: The URL whose domain will be used to find the corresponding robots.txt file. + http_client: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used. + proxy_info: The `HttpClient` instance used to perform the network request for fetching the robots.txt file. """ robots_url = URL(url).with_path('/robots.txt') return await cls.load(str(robots_url), http_client, proxy_info) @@ -47,9 +47,9 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N """Load the robots.txt file for a given URL. Args: - url: the URL to fetch robots.txt for - proxy_info: a `ProxyInfo` to be used for fetching the robots.txt file - http_client: the HTTP client to use for fetching the robots.txt file + url: The direct URL of the robots.txt file to be loaded. + http_client: The `HttpClient` instance used to perform the network request for fetching the robots.txt file. + proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used. """ response = await http_client.send_request(url, proxy_info=proxy_info) body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read() @@ -62,8 +62,8 @@ def is_allowed(self, url: str, user_agent: str = '*') -> bool: """Check if the given URL is allowed for the given user agent. Args: - url: the URL to check - user_agent: the user agent to check for + url: The URL to check against the robots.txt rules. + user_agent: The user-agent string to check permissions for. Defaults to '*' which matches any user-agent. """ check_url = URL(url) if check_url.origin() != self._original_url: @@ -78,7 +78,8 @@ def get_crawl_delay(self, user_agent: str = '*') -> int | None: """Get the crawl delay for the given user agent. Args: - user_agent: the user-agent to check for + user_agent: The user-agent string to check the crawl delay for. Defaults to '*' which matches any + user-agent. """ crawl_delay = self._robots.crawl_delay(user_agent) return int(crawl_delay) if crawl_delay is not None else None diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 9833015d37..e4d49c036e 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1315,7 +1315,7 @@ async def _is_allowed_based_on_robots_txt_file(self, url: str) -> bool: """Check if the URL is allowed based on the robots.txt file. Args: - url: The URL to check. + url: The URL to verify against robots.txt rules. Returns True if crawling this URL is permitted. """ if not self._respect_robots_txt_file: return True @@ -1326,7 +1326,7 @@ async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: """Get the RobotsTxtFile for a given URL. Args: - url: URL on the basis of which the RobotsTxtFile will be obtained. + url: The URL whose domain will be used to locate and fetch the corresponding robots.txt file. 
""" if not self._respect_robots_txt_file: return None From 41b803dfca67e4de826853eecab6f7b4f1f897e3 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 24 Apr 2025 00:48:52 +0000 Subject: [PATCH 18/20] add docs example --- .../code_examples/respect_robots_txt_file.py | 27 +++++++++++++++++++ docs/examples/respect_robots_txt_file.mdx | 21 +++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 docs/examples/code_examples/respect_robots_txt_file.py create mode 100644 docs/examples/respect_robots_txt_file.mdx diff --git a/docs/examples/code_examples/respect_robots_txt_file.py b/docs/examples/code_examples/respect_robots_txt_file.py new file mode 100644 index 0000000000..ebd63b1c2e --- /dev/null +++ b/docs/examples/code_examples/respect_robots_txt_file.py @@ -0,0 +1,27 @@ +import asyncio + +from crawlee.crawlers import ( + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, +) + + +async def main() -> None: + # Initialize the crawler with robots.txt compliance enabled + crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # Start the crawler with the specified URLs + # The crawler will check the robots.txt file before making requests + # In this example, 'https://news.ycombinator.com/login' will be skipped + # because it's disallowed in the site's robots.txt file + await crawler.run( + ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/examples/respect_robots_txt_file.mdx b/docs/examples/respect_robots_txt_file.mdx new file mode 100644 index 0000000000..5f6194c919 --- /dev/null +++ b/docs/examples/respect_robots_txt_file.mdx @@ -0,0 +1,21 @@ +--- +id: respect-robots-txt-file +title: Respect robots.txt file +--- + +import ApiLink from '@site/src/components/ApiLink'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py'; + +This example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file. + +To configure `Crawlee` to follow the `robots.txt` file, set the parameter `respect_robots_txt_file=True` in `BasicCrawlerOptions`. In this case, `Crawlee` will skip any URLs forbidden in the website's robots.txt file. + +As an example, let's look at the website `https://news.ycombinator.com/` and its corresponding [robots.txt](https://news.ycombinator.com/robots.txt) file. Since the file has a rule `Disallow: /login`, the URL `https://news.ycombinator.com/login` will be automatically skipped. 
+ +The code below demonstrates this behavior using the `BeautifulSoupCrawler`: + + + {RespectRobotsTxt} + From 8f25d838ff095ae38ea6dcd64accdf143e06e982 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 24 Apr 2025 01:08:33 +0000 Subject: [PATCH 19/20] update comment --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index e4d49c036e..99e4045a9e 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1122,7 +1122,7 @@ async def __run_task_function(self) -> None: max_retries=3, ) # TODO: https://github.com/apify/crawlee-python/issues/1160 - # add processing with on_skiped_request hook + # add processing with on_skipped_request hook return if request.session_id: From 260ad923da62f663485d63770137758285b6b494 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Thu, 24 Apr 2025 11:45:54 +0000 Subject: [PATCH 20/20] one lock to rule them all --- src/crawlee/crawlers/_basic/_basic_crawler.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 90800c6c4e..49b28c043e 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -384,7 +384,7 @@ def __init__( # Internal, not explicitly configurable components self._robots_txt_file_cache: LRUCache[str, RobotsTxtFile] = LRUCache(maxsize=1000) - self._robots_txt_locks_cache: LRUCache[str, asyncio.Lock] = LRUCache(maxsize=100) + self._robots_txt_lock = asyncio.Lock() self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name) self._snapshotter = Snapshotter.from_config(config) self._autoscaled_pool = AutoscaledPool( @@ -1333,12 +1333,7 @@ async def _get_robots_txt_file_for_url(self, url: str) -> RobotsTxtFile | None: if robots_txt_file: return robots_txt_file - txt_file_lock = self._robots_txt_locks_cache.get(origin_url) - if txt_file_lock is None: - txt_file_lock = asyncio.Lock() - self._robots_txt_locks_cache[origin_url] = txt_file_lock - - async with txt_file_lock: + async with self._robots_txt_lock: # Check again if the robots.txt file is already cached after acquiring the lock robots_txt_file = self._robots_txt_file_cache.get(origin_url) if robots_txt_file:
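
For reference, a minimal standalone sketch of the `RobotsTxtFile` helper introduced by this series, mirroring the assertions in `tests/unit/_utils/test_robots.py`. The `http://example.com` URLs and the inline robots.txt content below are illustrative only, and the class lives in the internal `crawlee._utils.robots` module, so treat this as a sketch of the helper rather than a documented public API:

import asyncio

from crawlee._utils.robots import RobotsTxtFile

# Illustrative robots.txt content, modeled on the fixture used by the unit tests.
ROBOTS_TXT_CONTENT = """\
User-agent: *
Disallow: *deny_all/
crawl-delay: 10

User-agent: Googlebot
Disallow: *deny_googlebot/
"""


async def main() -> None:
    # Parse an already-fetched robots.txt body; no network access is needed here.
    robots = await RobotsTxtFile.from_content('http://example.com/robots.txt', ROBOTS_TXT_CONTENT)

    # URLs on a different origin than the robots.txt file are always reported as allowed.
    print(robots.is_allowed('https://crawlee.dev'))  # True

    # Rules are evaluated per user agent; '*' is the default.
    print(robots.is_allowed('http://example.com/deny_all/page.html'))  # False
    print(robots.is_allowed('http://example.com/deny_googlebot/page.html', 'Googlebot'))  # False

    # Crawl delay and sitemap URLs declared in the file are exposed as well.
    print(robots.get_crawl_delay())  # 10
    print(robots.get_sitemaps())  # []


if __name__ == '__main__':
    asyncio.run(main())

Crawler-level usage is shown by the docs example added in this series: passing `respect_robots_txt_file=True` to a crawler makes it fetch and cache one robots.txt file per origin and skip disallowed URLs both when requests are added via `add_requests` and when links are collected via `extract_links`/`enqueue_links`.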