From 5d485a132ef466345581e9d2b4770a9c8b4efca9 Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Thu, 10 Apr 2025 20:31:55 +0000
Subject: [PATCH 1/4] add docs about error handling

---
 .../change_handle_error_status.py             | 47 +++++++++++++++++++
 .../error_handling/disable_retry.py           | 30 ++++++++++++
 .../error_handling/handle_proxy_error.py      | 40 ++++++++++++++++
 docs/guides/error_handling.mdx                | 44 +++++++++++++++++
 4 files changed, 161 insertions(+)
 create mode 100644 docs/guides/code_examples/error_handling/change_handle_error_status.py
 create mode 100644 docs/guides/code_examples/error_handling/disable_retry.py
 create mode 100644 docs/guides/code_examples/error_handling/handle_proxy_error.py
 create mode 100644 docs/guides/error_handling.mdx

diff --git a/docs/guides/code_examples/error_handling/change_handle_error_status.py b/docs/guides/code_examples/error_handling/change_handle_error_status.py
new file mode 100644
index 0000000000..7314ca799f
--- /dev/null
+++ b/docs/guides/code_examples/error_handling/change_handle_error_status.py
@@ -0,0 +1,47 @@
+import asyncio
+import json
+
+from crawlee import HttpHeaders
+from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
+from crawlee.errors import HttpStatusCodeError
+from crawlee.sessions import SessionPool
+
+# Using a placeholder refresh token for this example
+REFRESH_TOKEN = 'PLACEHOLDER'
+UNAUTHORIZED_STATUS_CODE = 401
+
+
+async def main() -> None:
+    crawler = HttpCrawler(
+        max_request_retries=2,
+        # Only treat 403 as a blocking status code, not 401
+        session_pool=SessionPool(create_session_settings={'blocked_status_codes': [403]}),
+        # Don't treat 401 responses as errors
+        ignore_http_error_status_codes=[401],
+    )
+
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+        # Now we can handle 401 responses ourselves
+        if context.http_response.status_code == UNAUTHORIZED_STATUS_CODE:
+            # Get a fresh access token
+            headers = {'authorization': f'Bearer {REFRESH_TOKEN}'}
+            response = await context.send_request(
+                'https://placeholder.org/refresh', headers=headers
+            )
+            data = json.loads(response.read())
+            # Add the new token to our `Request` headers
+            new_headers = {
+                **context.request.headers,
+                'authorization': f'Bearer {data["access_token"]}',
+            }
+            context.request.headers = HttpHeaders(new_headers)
+            # Trigger a retry with our updated headers
+            raise HttpStatusCodeError('Unauthorized', status_code=401)
+
+    await crawler.run(['http://httpbingo.org/status/401'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/error_handling/disable_retry.py b/docs/guides/code_examples/error_handling/disable_retry.py
new file mode 100644
index 0000000000..8d98eff312
--- /dev/null
+++ b/docs/guides/code_examples/error_handling/disable_retry.py
@@ -0,0 +1,30 @@
+import asyncio
+
+from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
+from crawlee.errors import HttpStatusCodeError, SessionError
+
+
+async def main() -> None:
+    crawler = HttpCrawler(max_request_retries=5)
+
+    # Create a parsing error for demonstration
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+        raise ValueError('Simulated parsing error')
+
+    # This handler runs before any retry attempts
+    @crawler.error_handler
+    async def retry_handler(context: BasicCrawlingContext, error: Exception) -> None:
+        context.log.error(f'Failed request {context.request.url}')
+        # Only allow retries for network-related errors
+        if not isinstance(error, (SessionError, HttpStatusCodeError)):
+            context.log.error('Non-network error detected')
+            # Stop further retry attempts for this `Request`
+            context.request.no_retry = True
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/code_examples/error_handling/handle_proxy_error.py b/docs/guides/code_examples/error_handling/handle_proxy_error.py
new file mode 100644
index 0000000000..c5c4d5dac9
--- /dev/null
+++ b/docs/guides/code_examples/error_handling/handle_proxy_error.py
@@ -0,0 +1,40 @@
+import asyncio
+
+from crawlee import Request
+from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
+from crawlee.errors import ProxyError
+
+
+async def main() -> None:
+    # Set how many session rotations will happen before calling the error handler
+    # when ProxyError occurs
+    crawler = HttpCrawler(max_session_rotations=5)
+
+    # For this example, we'll create a proxy error in our handler
+    @crawler.router.default_handler
+    async def default_handler(context: HttpCrawlingContext) -> None:
+        context.log.info(f'Processing {context.request.url} ...')
+        raise ProxyError('Simulated proxy error')
+
+    # This handler runs after all retry attempts are exhausted
+    @crawler.failed_request_handler
+    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:
+        context.log.error(f'Failed request {context.request.url}, after 5 rotations')
+        request = context.request
+        # For proxy errors, we can add a new `Request` to try again
+        if isinstance(error, ProxyError) and not request.unique_key.startswith('retry'):
+            context.log.info(f'Retrying {request.url} ...')
+            # Create a new `Request` with a modified key to avoid deduplication
+            new_request = Request.from_url(
+                request.url, unique_key=f'retry{request.unique_key}'
+            )
+
+            # Add the new `Request` to the `Queue`
+            rq = await crawler.get_request_manager()
+            await rq.add_request(new_request)
+
+    await crawler.run(['https://crawlee.dev/'])
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/docs/guides/error_handling.mdx b/docs/guides/error_handling.mdx
new file mode 100644
index 0000000000..4fb2caab14
--- /dev/null
+++ b/docs/guides/error_handling.mdx
@@ -0,0 +1,44 @@
+---
+id: error-handling
+title: Error handling
+description: How to handle errors that occur during web crawling.
+---
+
+import ApiLink from '@site/src/components/ApiLink';
+import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
+
+import HandleProxyError from '!!raw-loader!roa-loader!./code_examples/error_handling/handle_proxy_error.py';
+import ChangeHandleErrorStatus from '!!raw-loader!roa-loader!./code_examples/error_handling/change_handle_error_status.py';
+import DisableRetry from '!!raw-loader!roa-loader!./code_examples/error_handling/disable_retry.py';
+
+This guide shows you how to handle common errors that happen when crawling websites.
+
+## Handling proxy errors
+
+Low-quality proxies can cause problems even with high settings for `max_request_retries` and `max_session_rotations` in BasicCrawlerOptions. If you can't get data because of proxy errors, you might want to try again. You can do this using failed_request_handler:
+
+<RunnableCodeBlock className="language-python" language="python">
+    {HandleProxyError}
+</RunnableCodeBlock>
+
+You can use this same approach when testing different proxy providers. To better manage this process, you can count proxy errors and [stop the crawler](../examples/crawler-stop) if you get too many.
+
+## Changing how error status codes are handled
+
+By default, when Sessions get status codes like [401](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/401), [403](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/403), or [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/429), Crawlee marks the Session as `retire` and switches to a new one. This might not be what you want, especially when working with [authentication](./logging-in-with-a-crawler). You can learn more in the [session management guide](./session-management).
+
+Here's an example of how to change this behavior:
+
+<RunnableCodeBlock className="language-python" language="python">
+    {ChangeHandleErrorStatus}
+</RunnableCodeBlock>
+
+## Turning off retries for non-network errors
+
+Sometimes you might get unexpected errors when parsing data, like when a website has an unusual structure. Crawlee normally tries again based on your `max_request_retries` setting, but sometimes you don't want that.
+
+Here's how to turn off retries for non-network errors using error_handler, which runs before `Crawlee` tries again:
+
+<RunnableCodeBlock className="language-python" language="python">
+    {DisableRetry}
+</RunnableCodeBlock>

From 3741ac030d6985397ce8f8c24d058b567a47a7b4 Mon Sep 17 00:00:00 2001
From: Max Bohomolov <34358312+Mantisus@users.noreply.github.com>
Date: Wed, 23 Apr 2025 15:53:15 +0300
Subject: [PATCH 2/4] Apply suggestions from code review

Co-authored-by: Vlada Dusek
---
 docs/guides/error_handling.mdx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/guides/error_handling.mdx b/docs/guides/error_handling.mdx
index 4fb2caab14..abd1b33058 100644
--- a/docs/guides/error_handling.mdx
+++ b/docs/guides/error_handling.mdx
@@ -11,11 +11,11 @@ import HandleProxyError from '!!raw-loader!roa-loader!./code_examples/error_hand
 import ChangeHandleErrorStatus from '!!raw-loader!roa-loader!./code_examples/error_handling/change_handle_error_status.py';
 import DisableRetry from '!!raw-loader!roa-loader!./code_examples/error_handling/disable_retry.py';
 
-This guide shows you how to handle common errors that happen when crawling websites.
+This guide demonstrates techniques for handling common errors encountered during web crawling operations.
 
 ## Handling proxy errors
 
-Low-quality proxies can cause problems even with high settings for `max_request_retries` and `max_session_rotations` in BasicCrawlerOptions. If you can't get data because of proxy errors, you might want to try again. You can do this using failed_request_handler:
+Low-quality proxies can cause problems even with high settings for `max_request_retries` and `max_session_rotations` in `BasicCrawlerOptions`. If you can't get data because of proxy errors, you might want to try again. You can do this using `failed_request_handler`:
 
 <RunnableCodeBlock className="language-python" language="python">
     {HandleProxyError}
@@ -25,7 +25,7 @@ You can use this same approach when testing different proxy providers. To better
 
 ## Changing how error status codes are handled
 
-By default, when Sessions get status codes like [401](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/401), [403](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/403), or [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/429), Crawlee marks the Session as `retire` and switches to a new one. This might not be what you want, especially when working with [authentication](./logging-in-with-a-crawler). You can learn more in the [session management guide](./session-management).
+By default, when `Sessions` get status codes like [401](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/401), [403](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/403), or [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status/429), Crawlee marks the `Session` as `retire` and switches to a new one. This might not be what you want, especially when working with [authentication](./logging-in-with-a-crawler). You can learn more in the [Session management guide](./session-management).
 
 Here's an example of how to change this behavior:
 
@@ -37,7 +37,7 @@ Here's an example of how to change this behavior:
 
 Sometimes you might get unexpected errors when parsing data, like when a website has an unusual structure. Crawlee normally tries again based on your `max_request_retries` setting, but sometimes you don't want that.
 
-Here's how to turn off retries for non-network errors using error_handler, which runs before `Crawlee` tries again:
+Here's how to turn off retries for non-network errors using `error_handler`, which runs before Crawlee tries again:
 
 <RunnableCodeBlock className="language-python" language="python">
     {DisableRetry}

From 3e9e47e7c4669e2b2b08d94989519a78b023b1ce Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Wed, 23 Apr 2025 13:12:18 +0000
Subject: [PATCH 3/4] update

---
 .../error_handling/change_handle_error_status.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/guides/code_examples/error_handling/change_handle_error_status.py b/docs/guides/code_examples/error_handling/change_handle_error_status.py
index 7314ca799f..3b721545b2 100644
--- a/docs/guides/code_examples/error_handling/change_handle_error_status.py
+++ b/docs/guides/code_examples/error_handling/change_handle_error_status.py
@@ -8,7 +8,7 @@
 
 # Using a placeholder refresh token for this example
 REFRESH_TOKEN = 'PLACEHOLDER'
-UNAUTHORIZED_STATUS_CODE = 401
+UNAUTHORIZED_CODE = 401
 
 
 async def main() -> None:
@@ -17,14 +17,14 @@ async def main() -> None:
         # Only treat 403 as a blocking status code, not 401
         session_pool=SessionPool(create_session_settings={'blocked_status_codes': [403]}),
         # Don't treat 401 responses as errors
-        ignore_http_error_status_codes=[401],
+        ignore_http_error_status_codes=[UNAUTHORIZED_CODE],
     )
 
     @crawler.router.default_handler
     async def default_handler(context: HttpCrawlingContext) -> None:
         context.log.info(f'Processing {context.request.url} ...')
         # Now we can handle 401 responses ourselves
-        if context.http_response.status_code == UNAUTHORIZED_STATUS_CODE:
+        if context.http_response.status_code == UNAUTHORIZED_CODE:
             # Get a fresh access token
             headers = {'authorization': f'Bearer {REFRESH_TOKEN}'}
             response = await context.send_request(
@@ -38,7 +38,7 @@ async def default_handler(context: HttpCrawlingContext) -> None:
             }
             context.request.headers = HttpHeaders(new_headers)
             # Trigger a retry with our updated headers
-            raise HttpStatusCodeError('Unauthorized', status_code=401)
+            raise HttpStatusCodeError('Unauthorized', status_code=UNAUTHORIZED_CODE)
 
     await crawler.run(['http://httpbingo.org/status/401'])
 

From f3e6616f0c1ad900e78f221ecaa5966c95665f8a Mon Sep 17 00:00:00 2001
From: Max Bohomolov
Date: Wed, 23 Apr 2025 15:45:30 +0000
Subject: [PATCH 4/4] up example

---
 docs/guides/code_examples/error_handling/handle_proxy_error.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/guides/code_examples/error_handling/handle_proxy_error.py b/docs/guides/code_examples/error_handling/handle_proxy_error.py
index c5c4d5dac9..eddb843fdd 100644
--- a/docs/guides/code_examples/error_handling/handle_proxy_error.py
+++ b/docs/guides/code_examples/error_handling/handle_proxy_error.py
@@ -8,7 +8,7 @@
 async def main() -> None:
     # Set how many session rotations will happen before calling the error handler
     # when ProxyError occurs
-    crawler = HttpCrawler(max_session_rotations=5)
+    crawler = HttpCrawler(max_session_rotations=5, max_request_retries=6)
 
     # For this example, we'll create a proxy error in our handler
     @crawler.router.default_handler
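The guide added in this series mentions counting proxy errors and [stopping the crawler](../examples/crawler-stop) once too many of them pile up, but the patch itself ships no example of that step. Below is a minimal sketch of the idea, not part of the PR: it assumes the `crawler.stop()` method that the linked crawler-stop example is built around, and the counter name `proxy_error_count` and the `MAX_PROXY_ERRORS` threshold are purely illustrative.

```python
import asyncio

from crawlee.crawlers import BasicCrawlingContext, HttpCrawler, HttpCrawlingContext
from crawlee.errors import ProxyError

# Illustrative threshold, not taken from the PR.
MAX_PROXY_ERRORS = 10


async def main() -> None:
    crawler = HttpCrawler(max_session_rotations=5)

    # Counter shared with the handler below via closure; the name is our own.
    proxy_error_count = 0

    @crawler.router.default_handler
    async def default_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # Same hook as in handle_proxy_error.py: runs once retries are exhausted.
    @crawler.failed_request_handler
    async def failed_handler(context: BasicCrawlingContext, error: Exception) -> None:
        nonlocal proxy_error_count
        if isinstance(error, ProxyError):
            proxy_error_count += 1
            if proxy_error_count >= MAX_PROXY_ERRORS:
                # Assumes crawler.stop(), as used by the linked crawler-stop example.
                context.log.error('Too many proxy errors, stopping the crawler.')
                crawler.stop()

    await crawler.run(['https://crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```

Counting inside `failed_request_handler` only registers requests that have already exhausted their session rotations, so a handful of flaky responses will not shut the crawl down prematurely.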