docs: add guide about error handling #1149

Merged · 4 commits · Apr 24, 2025
47 changes: 47 additions & 0 deletions docs/guides/code_examples/error_handling/change_handle_error_status.py
@@ -0,0 +1,47 @@
import asyncio
import json

from crawlee import HttpHeaders
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.errors import HttpStatusCodeError
from crawlee.sessions import SessionPool

# Using a placeholder refresh token for this example
REFRESH_TOKEN = 'PLACEHOLDER'
UNAUTHORIZED_CODE = 401


async def main() -> None:
    crawler = HttpCrawler(
        max_request_retries=2,
        # Only treat 403 as a blocking status code, not 401
        session_pool=SessionPool(create_session_settings={'blocked_status_codes': [403]}),
        # Don't treat 401 responses as errors
        ignore_http_error_status_codes=[UNAUTHORIZED_CODE],
    )

    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        # Now we can handle 401 responses ourselves
        if context.http_response.status_code == UNAUTHORIZED_CODE:
            # Get a fresh access token
            headers = {'authorization': f'Bearer {REFRESH_TOKEN}'}
            response = await context.send_request(
                'https://placeholder.org/refresh', headers=headers
            )
            data = json.loads(response.read())
            # Add the new token to our `Request` headers
            new_headers = {
                **context.request.headers,
                'authorization': f'Bearer {data["access_token"]}',
            }
            context.request.headers = HttpHeaders(new_headers)
            # Trigger a retry with our updated headers
            raise HttpStatusCodeError('Unauthorized', status_code=UNAUTHORIZED_CODE)

    await crawler.run(['http://httpbingo.org/status/401'])


if __name__ == '__main__':
    asyncio.run(main())
30 changes: 30 additions & 0 deletions docs/guides/code_examples/error_handling/disable_retry.py
@@ -0,0 +1,30 @@
import asyncio

from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
from crawlee.errors import HttpStatusCodeError, SessionError


async def main() -> None:
    crawler = HttpCrawler(max_request_retries=5)

    # Create a parsing error for demonstration
    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        raise ValueError('Simulated parsing error')

    # This handler runs before any retry attempts
    @crawler.error_handler
    async def retry_handler(context: BasicCrawlingContext, error: Exception) -> None:
        context.log.error(f'Failed request {context.request.url}')
        # Only allow retries for network-related errors
        if not isinstance(error, (SessionError, HttpStatusCodeError)):
            context.log.error('Non-network error detected')
            # Stop further retry attempts for this `Request`
            context.request.no_retry = True

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
40 changes: 40 additions & 0 deletions docs/guides/code_examples/error_handling/handle_proxy_error.py
@@ -0,0 +1,40 @@
import asyncio

from crawlee import Request
from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
from crawlee.errors import ProxyError


async def main() -> None:
    # Set how many session rotations will happen before calling the error handler
    # when ProxyError occurs
    crawler = HttpCrawler(max_session_rotations=5, max_request_retries=6)

    # For this example, we'll create a proxy error in our handler
    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        raise ProxyError('Simulated proxy error')

    # This handler runs after all retry attempts are exhausted
    @crawler.failed_request_handler
    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:
        context.log.error(f'Failed request {context.request.url}, after 5 rotations')
        request = context.request
        # For proxy errors, we can add a new `Request` to try again
        if isinstance(error, ProxyError) and not request.unique_key.startswith('retry'):
            context.log.info(f'Retrying {request.url} ...')
            # Create a new `Request` with a modified key to avoid deduplication
            new_request = Request.from_url(
                request.url, unique_key=f'retry{request.unique_key}'
            )

            # Add the new `Request` to the `Queue`
            rq = await crawler.get_request_manager()
            await rq.add_request(new_request)

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
44 changes: 44 additions & 0 deletions docs/guides/error_handling.mdx
@@ -0,0 +1,44 @@
---
id: error-handling
title: Error handling
description: How to handle errors that occur during web crawling.
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import HandleProxyError from '!!raw-loader!roa-loader!./code_examples/error_handling/handle_proxy_error.py';
import ChangeHandleErrorStatus from '!!raw-loader!roa-loader!./code_examples/error_handling/change_handle_error_status.py';
import DisableRetry from '!!raw-loader!roa-loader!./code_examples/error_handling/disable_retry.py';

This guide demonstrates techniques for handling common errors encountered during web crawling operations.

## Handling proxy errors

Low-quality proxies can cause requests to fail even with generous values for `max_request_retries` and `max_session_rotations` in <ApiLink to="class/BasicCrawlerOptions">`BasicCrawlerOptions`</ApiLink>. If a request ultimately fails because of proxy errors, you can schedule one more attempt using <ApiLink to="class/BasicCrawler#failed_request_handler">`failed_request_handler`</ApiLink>:

<RunnableCodeBlock className="language-python" language="python">
{HandleProxyError}
</RunnableCodeBlock>

The same approach is useful when evaluating different proxy providers. To keep the process under control, you can count proxy errors and [stop the crawler](../examples/crawler-stop) once you see too many of them, as sketched below.
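
For illustration, here is a minimal sketch of that idea, building on the proxy-error example above. The `MAX_PROXY_ERRORS` threshold and the counter are assumptions made for this sketch, and stopping the crawl relies on the crawler's `stop()` method shown in the linked example:

```python
import asyncio

from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
from crawlee.errors import ProxyError

# Arbitrary threshold chosen for this sketch.
MAX_PROXY_ERRORS = 3


async def main() -> None:
    crawler = HttpCrawler(max_session_rotations=5, max_request_retries=6)
    proxy_error_count = 0

    # Simulate a proxy error, as in the example above.
    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        raise ProxyError('Simulated proxy error')

    # Count proxy errors and stop the crawler once there are too many.
    @crawler.failed_request_handler
    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:
        nonlocal proxy_error_count
        if isinstance(error, ProxyError):
            proxy_error_count += 1
            if proxy_error_count >= MAX_PROXY_ERRORS:
                context.log.error('Too many proxy errors, stopping the crawler.')
                crawler.stop()

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```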

## Changing how error status codes are handled

By default, when a <ApiLink to="class/Session">`Session`</ApiLink> receives a status code like [401](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/401), [403](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/403), or [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/429), Crawlee retires the <ApiLink to="class/Session">`Session`</ApiLink> and switches to a new one. This might not be what you want, especially when working with [authentication](./logging-in-with-a-crawler). You can learn more in the [Session management guide](./session-management).

Here's an example of how to change this behavior:

<RunnableCodeBlock className="language-python" language="python">
{ChangeHandleErrorStatus}
</RunnableCodeBlock>

## Turning off retries for non-network errors

Sometimes you hit unexpected errors when parsing data, for example when a website has an unusual structure. By default, Crawlee retries the request according to your `max_request_retries` setting, but retrying rarely helps when the problem lies in your own parsing logic rather than in the network.

Here's how to turn off retries for non-network errors using <ApiLink to="class/BasicCrawler#error_handler">`error_handler`</ApiLink>, which runs before each retry attempt:

<RunnableCodeBlock className="language-python" language="python">
{DisableRetry}
</RunnableCodeBlock>