From 5d485a132ef466345581e9d2b4770a9c8b4efca9 Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Thu, 10 Apr 2025 20:31:55 +0000
Subject: [PATCH 1/4] add docs about error handling

---
 .../change_handle_error_status.py             | 47 +++++++++++++++++++
 .../error_handling/disable_retry.py           | 30 ++++++++++++
 .../error_handling/handle_proxy_error.py      | 40 ++++++++++++++++
 docs/guides/error_handling.mdx                | 44 +++++++++++++++++
 4 files changed, 161 insertions(+)
 create mode 100644 docs/guides/code_examples/error_handling/change_handle_error_status.py
 create mode 100644 docs/guides/code_examples/error_handling/disable_retry.py
 create mode 100644 docs/guides/code_examples/error_handling/handle_proxy_error.py
 create mode 100644 docs/guides/error_handling.mdx

diff --git a/docs/guides/code_examples/error_handling/change_handle_error_status.py b/docs/guides/code_examples/error_handling/change_handle_error_status.py
new file mode 100644
index 0000000000..7314ca799f
--- /dev/null
+++ b/docs/guides/code_examples/error_handling/change_handle_error_status.py
@@ -0,0 +1,47 @@
+import asyncio
+import json
+
+from crawlee import HttpHeaders
+from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
+from crawlee.errors import HttpStatusCodeError
+from crawlee.sessions import SessionPool
+
+# Using a placeholder refresh token for this example
+REFRESH_TOKEN = 'PLACEHOLDER'
+UNAUTHORIZED_STATUS_CODE = 401
+
+
+async def main() -> None:
+    crawler = HttpCrawler(
+        max_request_retries=2,
+        # Only treat 403 as a blocking status code, not 401
+        session_pool=SessionPool(create_session_settings={'blocked_status_codes': [403]}),
+        # Don't treat 401 responses as errors
+        ignore_http_error_status_codes=[401],
+    )
+
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+        # Now we can handle 401 responses ourselves
+        if context.http_response.status_code == UNAUTHORIZED_STATUS_CODE:
+            # Get a fresh access token
+            headers = {'authorization': f'Bearer {REFRESH_TOKEN}'}
+            response = await context.send_request(
+                'https://placeholder.org/refresh', headers=headers
+            )
+            data = json.loads(response.read())
+            # Add the new token to our `Request` headers
+            new_headers = {
+                **context.request.headers,
+                'authorization': f'Bearer {data["access_token"]}',
+            }
+            context.request.headers = HttpHeaders(new_headers)
+            # Trigger a retry with our updated headers
+            raise HttpStatusCodeError('Unauthorized', status_code=401)
+
+    await crawler.run(['http://httpbingo.org/status/401'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/error_handling/disable_retry.py b/docs/guides/code_examples/error_handling/disable_retry.py
new file mode 100644
index 0000000000..8d98eff312
--- /dev/null
+++ b/docs/guides/code_examples/error_handling/disable_retry.py
@@ -0,0 +1,30 @@
+import asyncio
+
+from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
+from crawlee.errors import HttpStatusCodeError, SessionError
+
+
+async def main() -> None:
+    crawler = HttpCrawler(max_request_retries=5)
+
+    # Create a parsing error for demonstration
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+        raise ValueError('Simulated parsing error')
+
+    # This handler runs before any retry attempts
+    @crawler.error_handler
+    async def retry_handler(context: BasicCrawlingContext, error: Exception) -> None:
+        context.log.error(f'Failed request {context.request.url}')
+        # Only allow retries for network-related errors
+        if not isinstance(error, (SessionError, HttpStatusCodeError)):
+            context.log.error('Non-network error detected')
+            # Stop further retry attempts for this `Request`
+            context.request.no_retry = True
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/error_handling/handle_proxy_error.py b/docs/guides/code_examples/error_handling/handle_proxy_error.py
new file mode 100644
index 0000000000..c5c4d5dac9
--- /dev/null
+++ b/docs/guides/code_examples/error_handling/handle_proxy_error.py
@@ -0,0 +1,40 @@
+import asyncio
+
+from crawlee import Request
+from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
+from crawlee.errors import ProxyError
+
+
+async def main() -> None:
+    # Set how many session rotations will happen before calling the error handler
+    # when ProxyError occurs
+    crawler = HttpCrawler(max_session_rotations=5)
+
+    # For this example, we'll create a proxy error in our handler
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+        raise ProxyError('Simulated proxy error')
+
+    # This handler runs after all retry attempts are exhausted
+    @crawler.failed_request_handler
+    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:
+        context.log.error(f'Failed request {context.request.url}, after 5 rotations')
+        request = context.request
+        # For proxy errors, we can add a new `Request` to try again
+        if isinstance(error, ProxyError) and not request.unique_key.startswith('retry'):
+            context.log.info(f'Retrying {request.url} ...')
+            # Create a new `Request` with a modified key to avoid deduplication
+            new_request = Request.from_url(
+                request.url, unique_key=f'retry{request.unique_key}'
+            )
+
+            # Add the new `Request` to the `Queue`
+            rq = await crawler.get_request_manager()
+            await rq.add_request(new_request)
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/error_handling.mdx b/docs/guides/error_handling.mdx
new file mode 100644
index 0000000000..4fb2caab14
--- /dev/null
+++ b/docs/guides/error_handling.mdx
@@ -0,0 +1,44 @@
+---
+id: error-handling
+title: Error handling
+description: How to handle errors that occur during web crawling.
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
+
+import HandleProxyError from '!!raw-loader!roa-loader!./code_examples/error_handling/handle_proxy_error.py';
+import ChangeHandleErrorStatus from '!!raw-loader!roa-loader!./code_examples/error_handling/change_handle_error_status.py';
+import DisableRetry from '!!raw-loader!roa-loader!./code_examples/error_handling/disable_retry.py';
+
+This guide shows you how to handle common errors that happen when crawling websites.
+
+## Handling proxy errors
+
+Low-quality proxies can cause problems even with high settings for `max_request_retries` and `max_session_rotations` in BasicCrawlerOptions. If you can't get data because of proxy errors, you might want to try again. You can do this using failed_request_handler:
+
+<RunnableCodeBlock className="language-python" language="python">
+    {HandleProxyError}
+</RunnableCodeBlock>
+
+You can use this same approach when testing different proxy providers. To better manage this process, you can count proxy errors and [stop the crawler](../examples/crawler-stop) if you get too many.
+
+## Changing how error status codes are handled
+
+By default, when Sessions get status codes like [401](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/401), [403](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/403), or [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/429), Crawlee marks the Session as `retire` and switches to a new one. This might not be what you want, especially when working with [authentication](./logging-in-with-a-crawler). You can learn more in the [session management guide](./session-management).
+
+Here's an example of how to change this behavior:
+
+<RunnableCodeBlock className="language-python" language="python">
+    {ChangeHandleErrorStatus}
+</RunnableCodeBlock>
+
+## Turning off retries for non-network errors
+
+Sometimes you might get unexpected errors when parsing data, like when a website has an unusual structure. Crawlee normally tries again based on your `max_request_retries` setting, but sometimes you don't want that.
+
+Here's how to turn off retries for non-network errors using error_handler, which runs before `Crawlee` tries again:
+
+<RunnableCodeBlock className="language-python" language="python">
+    {DisableRetry}
+</RunnableCodeBlock>

From 3741ac030d6985397ce8f8c24d058b567a47a7b4 Mon Sep 17 00:00:00 2001
From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com>
Date: Wed, 23 Apr 2025 15:53:15 +0300
Subject: [PATCH 2/4] Apply suggestions from code review

Co-authored-by: Vlada Dusek
---
 docs/guides/error_handling.mdx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/guides/error_handling.mdx b/docs/guides/error_handling.mdx
index 4fb2caab14..abd1b33058 100644
--- a/docs/guides/error_handling.mdx
+++ b/docs/guides/error_handling.mdx
@@ -11,11 +11,11 @@ import HandleProxyError from '!!raw-loader!roa-loader!./code_examples/error_hand
 import ChangeHandleErrorStatus from '!!raw-loader!roa-loader!./code_examples/error_handling/change_handle_error_status.py';
 import DisableRetry from '!!raw-loader!roa-loader!./code_examples/error_handling/disable_retry.py';
 
-This guide shows you how to handle common errors that happen when crawling websites.
+This guide demonstrates techniques for handling common errors encountered during web crawling operations.
 
 ## Handling proxy errors
 
-Low-quality proxies can cause problems even with high settings for `max_request_retries` and `max_session_rotations` in BasicCrawlerOptions. If you can't get data because of proxy errors, you might want to try again. You can do this using failed_request_handler:
+Low-quality proxies can cause problems even with high settings for `max_request_retries` and `max_session_rotations` in `BasicCrawlerOptions`. If you can't get data because of proxy errors, you might want to try again. You can do this using `failed_request_handler`:
 
 <RunnableCodeBlock className="language-python" language="python">
     {HandleProxyError}
@@ -25,7 +25,7 @@ You can use this same approach when testing different proxy providers. To better
 
 ## Changing how error status codes are handled
 
-By default, when Sessions get status codes like [401](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/401), [403](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/403), or [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/429), Crawlee marks the Session as `retire` and switches to a new one. This might not be what you want, especially when working with [authentication](./logging-in-with-a-crawler). You can learn more in the [session management guide](./session-management).
+By default, when `Sessions` get status codes like [401](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/401), [403](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/403), or [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/429), Crawlee marks the `Session` as `retire` and switches to a new one. This might not be what you want, especially when working with [authentication](./logging-in-with-a-crawler). You can learn more in the [Session management guide](./session-management).
 
 Here's an example of how to change this behavior:
 
@@ -37,7 +37,7 @@ Here's an example of how to change this behavior:
 
 Sometimes you might get unexpected errors when parsing data, like when a website has an unusual structure. Crawlee normally tries again based on your `max_request_retries` setting, but sometimes you don't want that.
 
-Here's how to turn off retries for non-network errors using error_handler, which runs before `Crawlee` tries again:
+Here's how to turn off retries for non-network errors using `error_handler`, which runs before Crawlee tries again:
 
 <RunnableCodeBlock className="language-python" language="python">
     {DisableRetry}

From 3e9e47e7c4669e2b2b08d94989519a78b023b1ce Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Wed, 23 Apr 2025 13:12:18 +0000
Subject: [PATCH 3/4] update

---
 .../error_handling/change_handle_error_status.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/guides/code_examples/error_handling/change_handle_error_status.py b/docs/guides/code_examples/error_handling/change_handle_error_status.py
index 7314ca799f..3b721545b2 100644
--- a/docs/guides/code_examples/error_handling/change_handle_error_status.py
+++ b/docs/guides/code_examples/error_handling/change_handle_error_status.py
@@ -8,7 +8,7 @@
 
 # Using a placeholder refresh token for this example
 REFRESH_TOKEN = 'PLACEHOLDER'
-UNAUTHORIZED_STATUS_CODE = 401
+UNAUTHORIZED_CODE = 401
 
 
 async def main() -> None:
@@ -17,14 +17,14 @@ async def main() -> None:
         # Only treat 403 as a blocking status code, not 401
         session_pool=SessionPool(create_session_settings={'blocked_status_codes': [403]}),
         # Don't treat 401 responses as errors
-        ignore_http_error_status_codes=[401],
+        ignore_http_error_status_codes=[UNAUTHORIZED_CODE],
     )
 
     @crawler.router.default_handler
     async def default_handler(context: HttpCrawlingContext) -> None:
         context.log.info(f'Processing {context.request.url} ...')
         # Now we can handle 401 responses ourselves
-        if context.http_response.status_code == UNAUTHORIZED_STATUS_CODE:
+        if context.http_response.status_code == UNAUTHORIZED_CODE:
             # Get a fresh access token
             headers = {'authorization': f'Bearer {REFRESH_TOKEN}'}
             response = await context.send_request(
@@ -38,7 +38,7 @@ async def default_handler(context: HttpCrawlingContext) -> None:
             }
             context.request.headers = HttpHeaders(new_headers)
             # Trigger a retry with our updated headers
-            raise HttpStatusCodeError('Unauthorized', status_code=401)
+            raise HttpStatusCodeError('Unauthorized', status_code=UNAUTHORIZED_CODE)
 
     await crawler.run(['http://httpbingo.org/status/401'])
 

From f3e6616f0c1ad900e78f221ecaa5966c95665f8a Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Wed, 23 Apr 2025 15:45:30 +0000
Subject: [PATCH 4/4] up example

---
 docs/guides/code_examples/error_handling/handle_proxy_error.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/guides/code_examples/error_handling/handle_proxy_error.py b/docs/guides/code_examples/error_handling/handle_proxy_error.py
index c5c4d5dac9..eddb843fdd 100644
--- a/docs/guides/code_examples/error_handling/handle_proxy_error.py
+++ b/docs/guides/code_examples/error_handling/handle_proxy_error.py
@@ -8,7 +8,7 @@
 async def main() -> None:
     # Set how many session rotations will happen before calling the error handler
     # when ProxyError occurs
-    crawler = HttpCrawler(max_session_rotations=5)
+    crawler = HttpCrawler(max_session_rotations=5, max_request_retries=6)
 
     # For this example, we'll create a proxy error in our handler
     @crawler.router.default_handler
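The guide added in this series mentions counting proxy errors and [stopping the crawler](../examples/crawler-stop) once too many of them pile up, but the patch itself ships no example of that step. Below is a minimal sketch of the idea, not part of the PR: it assumes the `crawler.stop()` method that the linked crawler-stop example is built around, and the counter name `proxy_error_count` and the `MAX_PROXY_ERRORS` threshold are purely illustrative.

```python
import asyncio

from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
from crawlee.errors import ProxyError

# Illustrative threshold, not taken from the PR.
MAX_PROXY_ERRORS = 10


async def main() -> None:
    crawler = HttpCrawler(max_session_rotations=5)

    # Counter shared with the handler below via closure; the name is our own.
    proxy_error_count = 0

    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # Same hook as in handle_proxy_error.py: runs once retries are exhausted.
    @crawler.failed_request_handler
    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:
        nonlocal proxy_error_count
        if isinstance(error, ProxyError):
            proxy_error_count += 1
            if proxy_error_count >= MAX_PROXY_ERRORS:
                # Assumes crawler.stop(), as used by the linked crawler-stop example.
                context.log.error('Too many proxy errors, stopping the crawler.')
                crawler.stop()

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```

Counting inside `failed_request_handler` only registers requests that have already exhausted their session rotations, so a handful of flaky responses will not shut the crawl down prematurely.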