Skip to content

Commit 24f2d8c

Browse files
authored
Move Scrapy-related code from Actor template to SDK (#134)
1 parent e238be4 commit 24f2d8c

22 files changed

+465
-36
lines changed

Diff for: .flake8

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ filename =
66
per-file-ignores =
77
scripts/*: D
88
tests/*: D
9+
**/__init__.py: F401
910

1011
# Google docstring convention + D204 & D401
1112
docstring-convention = all

Diff for: CHANGELOG.md

+4-2
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
Changelog
22
=========
33

4-
[1.2.1](../../releases/tag/v1.2.1) - Unreleased
4+
[1.3.0](../../releases/tag/v1.3.0) - Unreleased
55
-----------------------------------------------
66

7-
...
7+
### Added
8+
9+
- Added `scrapy` extra
810

911
[1.2.0](../../releases/tag/v1.2.0) - 2023-10-23
1012
-----------------------------------------------

Diff for: Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ clean:
88

99
install-dev:
1010
python -m pip install --upgrade pip
11-
pip install --no-cache-dir -e ".[dev]"
11+
pip install --no-cache-dir -e ".[dev,scrapy]"
1212
pre-commit install
1313

1414
build:

Diff for: README.md

+16
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,22 @@ event handling.
77
If you just need to access the [Apify API](https://docs.apify.com/api/v2) from your Python applications,
88
check out the [Apify Client for Python](https://docs.apify.com/api/client/python) instead.
99

10+
## Installation
11+
12+
The Apify SDK for Python is available on PyPI as the `apify` package.
13+
For default installation, using Pip, run the following:
14+
15+
```bash
16+
pip install apify
17+
```
18+
19+
For users interested in integrating Apify with Scrapy, we provide a package extra called `scrapy`.
20+
To install Apify with the `scrapy` extra, use the following command:
21+
22+
```bash
23+
pip install "apify[scrapy]"
24+
```
25+
1026
## Documentation
1127

1228
For usage instructions, check the documentation on [Apify Docs](https://docs.apify.com/sdk/python/).

Diff for: mypy.ini

+6
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,9 @@ warn_redundant_casts = True
1414
warn_return_any = True
1515
warn_unreachable = True
1616
warn_unused_ignores = True
17+
18+
[mypy-scrapy.*]
19+
ignore_missing_imports = True
20+
21+
[mypy-sortedcollections.*]
22+
ignore_missing_imports = True

Diff for: pyproject.toml

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "apify"
3-
version = "1.2.1"
3+
version = "1.3.0"
44
description = "Apify SDK for Python"
55
readme = "README.md"
66
license = {text = "Apache Software License"}
@@ -72,6 +72,9 @@ dev = [
7272
"types-colorama ~= 0.4.15.11",
7373
"types-psutil ~= 5.9.5.12",
7474
]
75+
scrapy = [
76+
"scrapy ~= 2.11.0",
77+
]
7578

7679
[project.urls]
7780
"Homepage" = "https://docs.apify.com/sdk/python/"

Diff for: src/apify/_crypto.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ def public_encrypt(value: str, *, public_key: rsa.RSAPublicKey) -> dict:
3030
Returns:
3131
dict: Encrypted password and value.
3232
"""
33-
key_bytes = _crypto_random_object_id(ENCRYPTION_KEY_LENGTH).encode('utf-8')
34-
initialized_vector_bytes = _crypto_random_object_id(ENCRYPTION_IV_LENGTH).encode('utf-8')
33+
key_bytes = crypto_random_object_id(ENCRYPTION_KEY_LENGTH).encode('utf-8')
34+
initialized_vector_bytes = crypto_random_object_id(ENCRYPTION_IV_LENGTH).encode('utf-8')
3535
value_bytes = value.encode('utf-8')
3636

3737
password_bytes = key_bytes + initialized_vector_bytes
@@ -122,7 +122,7 @@ def _load_public_key(public_key_file_base64: str) -> rsa.RSAPublicKey:
122122
return public_key
123123

124124

125-
def _crypto_random_object_id(length: int = 17) -> str:
125+
def crypto_random_object_id(length: int = 17) -> str:
    """Python reimplementation of cryptoRandomObjectId from `@apify/utilities`.

    Args:
        length: Number of characters in the generated ID (defaults to 17).

    Returns:
        A cryptographically random alphanumeric string of the given length.
    """
    # Alphabet kept byte-for-byte identical to the JS original.
    chars = 'abcdefghijklmnopqrstuvwxyzABCEDFGHIJKLMNOPQRSTUVWXYZ0123456789'
    picked = [secrets.choice(chars) for _ in range(length)]
    return ''.join(picked)

Diff for: src/apify/_memory_storage/resource_clients/dataset.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from apify_shared.types import JSONSerializable
1111
from apify_shared.utils import ignore_docs
1212

13-
from ..._crypto import _crypto_random_object_id
13+
from ..._crypto import crypto_random_object_id
1414
from ..._utils import _force_rename, _raise_on_duplicate_storage, _raise_on_non_existing_storage
1515
from ...consts import _StorageTypes
1616
from ..file_storage_utils import _update_dataset_items, _update_metadata
@@ -52,7 +52,7 @@ def __init__(
5252
name: Optional[str] = None,
5353
) -> None:
5454
"""Initialize the DatasetClient."""
55-
self._id = id or _crypto_random_object_id()
55+
self._id = id or crypto_random_object_id()
5656
self._resource_directory = os.path.join(base_storage_directory, name or self._id)
5757
self._memory_storage_client = memory_storage_client
5858
self._name = name

Diff for: src/apify/_memory_storage/resource_clients/key_value_store.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
from apify_shared.utils import ignore_docs, is_file_or_bytes, json_dumps
1717

18-
from ..._crypto import _crypto_random_object_id
18+
from ..._crypto import crypto_random_object_id
1919
from ..._utils import (
2020
_force_remove,
2121
_force_rename,
@@ -73,7 +73,7 @@ def __init__(
7373
name: Optional[str] = None,
7474
) -> None:
7575
"""Initialize the KeyValueStoreClient."""
76-
self._id = id or _crypto_random_object_id()
76+
self._id = id or crypto_random_object_id()
7777
self._resource_directory = os.path.join(base_storage_directory, name or self._id)
7878
self._memory_storage_client = memory_storage_client
7979
self._name = name

Diff for: src/apify/_memory_storage/resource_clients/request_queue.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@
66
from typing import TYPE_CHECKING, Dict, List, Optional
77

88
import aioshutil
9-
from sortedcollections import ValueSortedDict # type: ignore
9+
from sortedcollections import ValueSortedDict
1010

1111
from apify_shared.utils import filter_out_none_values_recursively, ignore_docs, json_dumps
1212

13-
from ..._crypto import _crypto_random_object_id
13+
from ..._crypto import crypto_random_object_id
1414
from ..._utils import _force_rename, _raise_on_duplicate_storage, _raise_on_non_existing_storage, _unique_key_to_request_id
1515
from ...consts import _StorageTypes
1616
from ..file_storage_utils import _delete_request, _update_metadata, _update_request_queue_item
@@ -46,7 +46,7 @@ def __init__(
4646
name: Optional[str] = None,
4747
) -> None:
4848
"""Initialize the RequestQueueClient."""
49-
self._id = id or _crypto_random_object_id()
49+
self._id = id or crypto_random_object_id()
5050
self._resource_directory = os.path.join(base_storage_directory, name or self._id)
5151
self._memory_storage_client = memory_storage_client
5252
self._name = name

Diff for: src/apify/scrapy/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from .middlewares import ApifyRetryMiddleware
2+
from .pipelines import ActorDatasetPushPipeline
3+
from .scheduler import ApifyScheduler
4+
from .utils import get_running_event_loop_id, open_queue_with_custom_client, to_apify_request, to_scrapy_request

Diff for: src/apify/scrapy/middlewares.py

+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
import traceback
2+
from typing import Union
3+
4+
try:
5+
from scrapy import Spider
6+
from scrapy.downloadermiddlewares.retry import RetryMiddleware
7+
from scrapy.exceptions import IgnoreRequest
8+
from scrapy.http import Request, Response
9+
from scrapy.utils.response import response_status_message
10+
except ImportError as exc:
11+
raise ImportError(
12+
'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
13+
) from exc
14+
15+
from ..actor import Actor
16+
from ..storages import RequestQueue
17+
from .utils import nested_event_loop, open_queue_with_custom_client, to_apify_request
18+
19+
20+
class ApifyRetryMiddleware(RetryMiddleware):
    """The default Scrapy retry middleware enriched with Apify's Request Queue interaction.

    On every response/exception the request's state is synchronized with the Apify
    Request Queue (mark as handled / reclaim) before delegating the retry decision
    to Scrapy's built-in RetryMiddleware.
    """

    def __init__(self, *args: list, **kwargs: dict) -> None:
        """Create a new instance.

        Args:
            args: Positional arguments passed through to RetryMiddleware.
            kwargs: Keyword arguments passed through to RetryMiddleware.
        """
        super().__init__(*args, **kwargs)
        try:
            self._rq: RequestQueue = nested_event_loop.run_until_complete(open_queue_with_custom_client())
        except BaseException:
            traceback.print_exc()
            # Re-raise instead of swallowing: without a Request Queue this middleware is
            # unusable, and continuing would only postpone the failure to an
            # AttributeError on the missing `self._rq`.
            raise

    def __del__(self) -> None:
        """Before deleting the instance, close the nested event loop."""
        nested_event_loop.stop()
        nested_event_loop.close()

    def process_response(self, request: Request, response: Response, spider: Spider) -> Union[Request, Response]:
        """Process the response and decide whether the request should be retried.

        Args:
            request: The request that was sent.
            response: The response that was received.
            spider: The Spider that sent the request.

        Returns:
            The response, or a new request if the request should be retried.
        """
        # Robots requests are bypassed directly - they don't go through a Scrapy Scheduler,
        # nor through our Request Queue. See scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware.
        assert isinstance(request.url, str)
        if request.url.endswith('robots.txt'):
            return response

        try:
            return nested_event_loop.run_until_complete(self._handle_retry_logic(request, response, spider))
        except BaseException:
            traceback.print_exc()
            # Re-raise: the original code fell through to `return returned`, which raised
            # an UnboundLocalError here because `returned` was never assigned on failure.
            raise

    def process_exception(
        self,
        request: Request,
        exception: BaseException,
        spider: Spider,
    ) -> Union[Request, Response, None]:
        """Handle the exception and decide whether the request should be retried.

        Args:
            request: The request that raised the exception.
            exception: The exception that was raised.
            spider: The Spider that sent the request.

        Returns:
            Whatever RetryMiddleware.process_exception decides (a retried request or None).
        """
        Actor.log.debug(f'ApifyRetryMiddleware.process_exception was called (scrapy_request={request})...')
        apify_request = to_apify_request(request, spider=spider)

        try:
            if isinstance(exception, IgnoreRequest):
                # The request was deliberately dropped - mark it as handled in the queue.
                nested_event_loop.run_until_complete(self._rq.mark_request_as_handled(apify_request))
            else:
                # The request failed for another reason - return it to the queue for retry.
                nested_event_loop.run_until_complete(self._rq.reclaim_request(apify_request))
        except BaseException:
            # Best-effort queue bookkeeping, guarded consistently for both branches
            # (the original only guarded the mark_request_as_handled branch).
            traceback.print_exc()

        return super().process_exception(request, exception, spider)

    async def _handle_retry_logic(
        self,
        request: Request,
        response: Response,
        spider: Spider,
    ) -> Union[Request, Response]:
        """Handle the retry logic of the request.

        Args:
            request: The request that was sent.
            response: The response that was received.
            spider: The Spider that sent the request.

        Returns:
            The response, or a new request if the request should be retried.
        """
        Actor.log.debug(f'ApifyRetryMiddleware.handle_retry_logic was called (scrapy_request={request})...')
        apify_request = to_apify_request(request, spider=spider)

        if request.meta.get('dont_retry', False):
            await self._rq.mark_request_as_handled(apify_request)
            return response

        if response.status in self.retry_http_codes:
            await self._rq.reclaim_request(apify_request)
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response

        await self._rq.mark_request_as_handled(apify_request)
        return response

Diff for: src/apify/scrapy/pipelines.py

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from itemadapter import ItemAdapter
2+
3+
try:
4+
from scrapy import Item, Spider
5+
except ImportError as exc:
6+
raise ImportError(
7+
'To use this module, you need to install the "scrapy" extra. Run "pip install apify[scrapy]".',
8+
) from exc
9+
10+
from ..actor import Actor
11+
12+
13+
class ActorDatasetPushPipeline:
    """A Scrapy pipeline for pushing items to an Actor's default dataset.

    This pipeline is designed to be enabled only when the Scrapy project is run as an Actor.
    """

    async def process_item(self, item: Item, spider: Spider) -> Item:
        """Push the provided Scrapy item to the Actor's default dataset.

        Args:
            item: The scraped item produced by the spider.
            spider: The spider that produced the item.

        Returns:
            The same item, unchanged, so later pipeline stages can process it.
        """
        payload = ItemAdapter(item).asdict()
        Actor.log.debug(f'Pushing item={payload} produced by spider={spider} to the dataset.')
        await Actor.push_data(payload)
        return item

0 commit comments

Comments
 (0)