Commit 84ba393

Keyword arguments for PLAYWRIGHT_PROCESS_REQUEST_HEADERS, pass additional Request fields (#303)
* Keyword arguments for PLAYWRIGHT_PROCESS_REQUEST_HEADERS
* Update docs for PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting
* Update tests for PLAYWRIGHT_PROCESS_REQUEST_HEADERS
* Add comma
* Update version in readme
1 parent 5b8cfd7 commit 84ba393

File tree

4 files changed: 134 additions, 14 deletions

README.md

40 additions, 3 deletions

````diff
@@ -288,12 +288,17 @@ default headers could be sent as well). Coroutine functions (`async def`) are su
 This will be called at least once for each Scrapy request, but it could be called additional times
 if Playwright generates more requests (e.g. to retrieve assets like images or scripts).

-The function must return a `dict` object, and receives the following positional arguments:
+The function must return a `Dict[str, str]` object, and receives the following three **keyword** arguments:

 ```python
-- browser_type: str
+- browser_type_name: str
 - playwright_request: playwright.async_api.Request
-- scrapy_headers: scrapy.http.headers.Headers
+- scrapy_request_data: dict
+    * method: str
+    * url: str
+    * headers: scrapy.http.headers.Headers
+    * body: Optional[bytes]
+    * encoding: str
 ```

 The default function (`scrapy_playwright.headers.use_scrapy_headers`) tries to
@@ -308,6 +313,38 @@ set by Playwright will be sent. Keep in mind that in this case, headers passed
 via the `Request.headers` attribute or set by Scrapy components are ignored
 (including cookies set via the `Request.cookies` attribute).

+Example:
+```python
+async def custom_headers(
+    *,
+    browser_type_name: str,
+    playwright_request: playwright.async_api.Request,
+    scrapy_request_data: dict,
+) -> Dict[str, str]:
+    headers = await playwright_request.all_headers()
+    scrapy_headers = scrapy_request_data["headers"].to_unicode_dict()
+    headers["Cookie"] = scrapy_headers.get("Cookie")
+    return headers
+
+PLAYWRIGHT_PROCESS_REQUEST_HEADERS = custom_headers
+```
+
+#### Deprecated argument handling
+
+In version 0.0.40 and earlier, arguments were passed to the function positionally,
+and only the Scrapy headers were passed instead of a dictionary with data about the
+Scrapy request.
+This is deprecated since version 0.0.41, and support for this way of handling arguments
+will eventually be removed in accordance with the [Deprecation policy](#deprecation-policy).
+
+Passed arguments:
+```python
+- browser_type: str
+- playwright_request: playwright.async_api.Request
+- scrapy_headers: scrapy.http.headers.Headers
+```
+
+Example:
 ```python
 def custom_headers(
     browser_type: str,
````
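
As an aside (not part of this commit), the new keyword-only contract could be wired up in a project roughly as in the minimal sketch below. The function name, the navigation-request check, and the decision to drop `Referer` for asset requests are illustrative assumptions, not behavior prescribed by scrapy-playwright.

```python
# settings.py -- illustrative sketch, assuming the post-0.0.41 keyword-argument API.
from typing import Dict

import playwright.async_api


async def drop_referer_for_assets(
    *,
    browser_type_name: str,
    playwright_request: playwright.async_api.Request,
    scrapy_request_data: dict,
) -> Dict[str, str]:
    # Start from the headers Playwright would send on its own.
    headers = await playwright_request.all_headers()
    if playwright_request.is_navigation_request():
        # Reuse the Scrapy request headers only for the main (navigation) request.
        scrapy_headers = scrapy_request_data["headers"].to_unicode_dict()
        headers.update({key.lower(): value for key, value in scrapy_headers.items()})
    else:
        # Hypothetical policy: strip Referer from asset requests (images, scripts, ...).
        headers.pop("referer", None)
    return headers


PLAYWRIGHT_PROCESS_REQUEST_HEADERS = drop_referer_for_assets
```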

scrapy_playwright/handler.py

34 additions, 2 deletions

```diff
@@ -1,6 +1,8 @@
 import asyncio
+import inspect
 import logging
 import platform
+import warnings
 from contextlib import suppress
 from dataclasses import dataclass, field as dataclass_field
 from ipaddress import ip_address
@@ -22,7 +24,7 @@
 from scrapy import Spider, signals
 from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
 from scrapy.crawler import Crawler
-from scrapy.exceptions import NotSupported
+from scrapy.exceptions import NotSupported, ScrapyDeprecationWarning
 from scrapy.http import Request, Response
 from scrapy.http.headers import Headers
 from scrapy.responsetypes import responsetypes
@@ -698,10 +700,40 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)

             if self.process_request_headers is None:
                 final_headers = await playwright_request.all_headers()
+            elif (sig := inspect.signature(self.process_request_headers)) and (
+                "browser_type_name" in sig.parameters
+                and "playwright_request" in sig.parameters
+                and "scrapy_request_data" in sig.parameters
+            ):
+                overrides["headers"] = final_headers = await _maybe_await(
+                    self.process_request_headers(
+                        browser_type_name=self.config.browser_type_name,
+                        playwright_request=playwright_request,
+                        scrapy_request_data={
+                            "method": method,
+                            "url": url,
+                            "headers": headers,
+                            "body": body,
+                            "encoding": encoding,
+                        },
+                    )
+                )
             else:
+                warnings.warn(
+                    "Accepting positional arguments in the function passed to the"
+                    " PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting is deprecated. The function"
+                    " should accept three (3) keyword arguments instead:"
+                    " browser_type_name: str,"
+                    " playwright_request: playwright.async_api.Request,"
+                    " scrapy_request_data: dict",
+                    category=ScrapyDeprecationWarning,
+                    stacklevel=1,
+                )
                 overrides["headers"] = final_headers = await _maybe_await(
                     self.process_request_headers(
-                        self.config.browser_type_name, playwright_request, headers
+                        self.config.browser_type_name,
+                        playwright_request,
+                        headers,
                     )
                 )

```
scrapy_playwright/headers.py

7 additions, 6 deletions

```diff
@@ -3,21 +3,22 @@
 Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information.
 """

+from typing import Dict
 from urllib.parse import urlparse

 from playwright.async_api import Request as PlaywrightRequest
-from scrapy.http.headers import Headers


 async def use_scrapy_headers(
-    browser_type: str,
+    *,
+    browser_type_name: str,
     playwright_request: PlaywrightRequest,
-    scrapy_headers: Headers,
-) -> dict:
+    scrapy_request_data: dict,
+) -> Dict[str, str]:
     """Scrapy headers take precedence over Playwright headers for navigation requests.
     For non-navigation requests, only User-Agent is taken from the Scrapy headers."""

-    scrapy_headers_str = scrapy_headers.to_unicode_dict()
+    scrapy_headers_str = scrapy_request_data["headers"].to_unicode_dict()
     playwright_headers = await playwright_request.all_headers()

     # Scrapy's user agent has priority over Playwright's
@@ -29,7 +30,7 @@ async def use_scrapy_headers(
         scrapy_headers_str.setdefault("referer", referer)

     # otherwise it fails with playwright.helper.Error: NS_ERROR_NET_RESET
-    if browser_type == "firefox":
+    if browser_type_name == "firefox":
         scrapy_headers_str["host"] = urlparse(playwright_request.url).netloc

     return scrapy_headers_str
```
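
Given the updated default above, a custom processor could also delegate to it and then adjust the result. The sketch below is illustrative only: the wrapper name and the extra header are made up, while the keyword arguments follow the signature introduced in this commit.

```python
# Illustrative sketch (not part of the commit): wrapping the updated default function.
from typing import Dict

from scrapy_playwright.headers import use_scrapy_headers


async def scrapy_headers_plus_marker(
    *,
    browser_type_name: str,
    playwright_request,
    scrapy_request_data: dict,
) -> Dict[str, str]:
    # Delegate to the default keyword-only implementation.
    headers = await use_scrapy_headers(
        browser_type_name=browser_type_name,
        playwright_request=playwright_request,
        scrapy_request_data=scrapy_request_data,
    )
    headers["x-processed-by"] = "custom-wrapper"  # hypothetical extra header
    return headers


# settings.py
PLAYWRIGHT_PROCESS_REQUEST_HEADERS = scrapy_headers_plus_marker
```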

tests/tests_asyncio/test_headers.py

53 additions, 3 deletions

```diff
@@ -1,5 +1,7 @@
 import json
+import logging
 import platform
+import warnings
 from unittest import IsolatedAsyncioTestCase

 import pytest
@@ -10,6 +12,11 @@


 class MixinProcessHeadersTestCase:
+    @pytest.fixture(autouse=True)
+    def inject_fixtures(self, caplog):
+        caplog.set_level(logging.DEBUG)
+        self._caplog = caplog
+
     @allow_windows
     async def test_user_agent(self):
         settings_dict = {
@@ -66,10 +73,14 @@ async def test_playwright_headers(self):
         assert b"asdf" not in req.headers

     @allow_windows
-    async def test_use_custom_headers(self):
+    async def test_use_custom_headers_ok(self):
         """Custom header processing function"""

-        async def important_headers(*_args, **_kwargs) -> dict:
+        async def important_headers(
+            browser_type_name,  # pylint: disable=unused-argument
+            playwright_request,  # pylint: disable=unused-argument
+            scrapy_request_data,  # pylint: disable=unused-argument
+        ) -> dict:
             return {"foo": "bar"}

         settings_dict = {
@@ -84,12 +95,51 @@ async def important_headers(*_args, **_kwargs) -> dict:
                     meta={"playwright": True},
                     headers={"User-Agent": "foobar", "Asdf": "qwerty"},
                 )
-                resp = await handler._download_request(req, Spider("foo"))
+                with warnings.catch_warnings(record=True) as warning_list:
+                    resp = await handler._download_request(req, Spider("foo"))
+                assert not warning_list
+                headers = json.loads(resp.css("pre::text").get())
+                headers = {key.lower(): value for key, value in headers.items()}
+                assert headers["foo"] == "bar"
+                assert headers.get("user-agent") not in (self.browser_type, "foobar")
+                assert "asdf" not in headers
+
+    @allow_windows
+    async def test_use_custom_headers_deprecated_arg_handling(self):
+        """Custom header processing function that receives deprecated args"""
+
+        async def deprecated_args(
+            browser_name, pw_req, headers  # pylint: disable=unused-argument
+        ) -> dict:
+            return {"foo": "bar"}
+
+        settings_dict = {
+            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
+            "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
+            "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": deprecated_args,
+        }
+        async with make_handler(settings_dict) as handler:
+            with MockServer() as server:
+                req = Request(
+                    url=server.urljoin("/headers"),
+                    meta={"playwright": True},
+                    headers={"User-Agent": "foobar", "Asdf": "qwerty"},
+                )
+                with warnings.catch_warnings(record=True) as warning_list:
+                    resp = await handler._download_request(req, Spider("foo"))
                 headers = json.loads(resp.css("pre::text").get())
                 headers = {key.lower(): value for key, value in headers.items()}
                 assert headers["foo"] == "bar"
                 assert headers.get("user-agent") not in (self.browser_type, "foobar")
                 assert "asdf" not in headers
+                assert str(warning_list[0].message) == (
+                    "Accepting positional arguments in the function passed to the"
+                    " PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting is deprecated. The function"
+                    " should accept three (3) keyword arguments instead:"
+                    " browser_type_name: str,"
+                    " playwright_request: playwright.async_api.Request,"
+                    " scrapy_request_data: dict"
+                )


 class TestProcessHeadersChromium(IsolatedAsyncioTestCase, MixinProcessHeadersTestCase):
```
