Skip to content

Commit 05f1d66

Browse files
authored
Custom memusage extension (#257)
* Custom extension to get memory usage * Ignore bandit warning * Simplify * Add handler attributes * Basic tests for memusage extension * Simplify extension and tests * More tests * Make pylint happy * Test get_virtual_size * More tests * Update readme * Mention package requirement * Rename module
1 parent 6156f70 commit 05f1d66

File tree

5 files changed

+180
-2
lines changed

5 files changed

+180
-2
lines changed

README.md

+26
Original file line numberDiff line numberDiff line change
@@ -826,6 +826,32 @@ for a list of the accepted events and the arguments passed to their handlers.
826826
images, scripts, stylesheets, etc are not seen by Scrapy.
827827

828828

829+
## Memory usage extension
830+
831+
The default Scrapy memory usage extension
832+
(`scrapy.extensions.memusage.MemoryUsage`) does not include the memory used by
833+
Playwright because the browser is launched as a separate process. The
834+
scrapy-playwright package provides a replacement extension which also considers
835+
the memory used by Playwright. This extension needs the
836+
[`psutil`](https://pypi.org/project/psutil/) package to work.
837+
838+
Update the [EXTENSIONS](https://docs.scrapy.org/en/latest/topics/settings.html#std-setting-EXTENSIONS)
839+
setting to disable the built-in Scrapy extension and replace it with the one
840+
from the scrapy-playwright package:
841+
842+
```python
843+
# settings.py
844+
EXTENSIONS = {
845+
"scrapy.extensions.memusage.MemoryUsage": None,
846+
"scrapy_playwright.memusage.ScrapyPlaywrightMemoryUsageExtension": 0,
847+
}
848+
```
849+
850+
Refer to the
851+
[upstream docs](https://docs.scrapy.org/en/latest/topics/extensions.html#module-scrapy.extensions.memusage)
852+
for more information about supported settings.
853+
854+
829855
## Examples
830856

831857
**Click on a link, save the resulting page as PDF**

scrapy_playwright/handler.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
Download,
1313
Error as PlaywrightError,
1414
Page,
15+
Playwright as AsyncPlaywright,
1516
PlaywrightContextManager,
1617
Request as PlaywrightRequest,
1718
Response as PlaywrightResponse,
@@ -95,6 +96,9 @@ def from_settings(cls, settings: Settings) -> "Config":
9596

9697

9798
class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
99+
playwright_context_manager: Optional[PlaywrightContextManager] = None
100+
playwright: Optional[AsyncPlaywright] = None
101+
98102
def __init__(self, crawler: Crawler) -> None:
99103
super().__init__(settings=crawler.settings, crawler=crawler)
100104
verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
@@ -294,8 +298,10 @@ async def _close(self) -> None:
294298
if hasattr(self, "browser"):
295299
logger.info("Closing browser")
296300
await self.browser.close()
297-
await self.playwright_context_manager.__aexit__()
298-
await self.playwright.stop()
301+
if self.playwright_context_manager:
302+
await self.playwright_context_manager.__aexit__()
303+
if self.playwright:
304+
await self.playwright.stop()
299305

300306
def download_request(self, request: Request, spider: Spider) -> Deferred:
301307
if request.meta.get("playwright"):

scrapy_playwright/memusage.py

+56
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
from contextlib import suppress
2+
from importlib import import_module
3+
from typing import List
4+
5+
from scrapy.exceptions import NotConfigured
6+
from scrapy.extensions.memusage import MemoryUsage
7+
8+
from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler, logger
9+
10+
11+
_MIB_FACTOR = 1024**2
12+
13+
14+
class ScrapyPlaywrightMemoryUsageExtension(MemoryUsage):
    """Drop-in replacement for Scrapy's built-in MemoryUsage extension that
    also accounts for the memory used by the Playwright browser, which runs
    as a separate process and is therefore invisible to the stock extension.

    Requires the ``psutil`` package; raises NotConfigured if it is missing.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        try:
            # Imported by name so a missing psutil disables the extension
            # cleanly instead of breaking the crawler at import time.
            self.psutil = import_module("psutil")
        except ImportError as exc:
            raise NotConfigured("The psutil module is not available") from exc

    def _get_main_process_ids(self) -> List[int]:
        """Return the PID of the main Playwright node process for each active
        ScrapyPlaywrightDownloadHandler.

        Best-effort: reaches through private Scrapy/Playwright attributes,
        so any failure yields an empty list rather than crashing the check.
        """
        try:
            return [
                handler.playwright_context_manager._connection._transport._proc.pid
                for handler in self.crawler.engine.downloader.handlers._handlers.values()
                if isinstance(handler, ScrapyPlaywrightDownloadHandler)
                and handler.playwright_context_manager
            ]
        except Exception:
            return []

    def _get_descendant_processes(self, process) -> list:
        """Recursively flatten the process tree below *process*."""
        children = process.children()
        result = children.copy()
        for child in children:
            result.extend(self._get_descendant_processes(child))
        return result

    def _get_total_playwright_process_memory(self) -> int:
        """Return the combined RSS (in bytes) of all Playwright main
        processes and their descendants.
        """
        process_list: list = []
        for pid in self._get_main_process_ids():
            # might fail if the process exited in the meantime
            with suppress(Exception):
                process_list.append(self.psutil.Process(pid))
        for proc in process_list.copy():
            # might fail if the process exited in the meantime
            with suppress(Exception):
                process_list.extend(self._get_descendant_processes(proc))
        total_process_size = 0
        for proc in process_list:
            with suppress(Exception):  # might fail if the process exited in the meantime
                total_process_size += proc.memory_info().rss
        logger.debug(
            "Total Playwright process memory: %i Bytes (%i MiB)",
            total_process_size,
            total_process_size / _MIB_FACTOR,
        )
        return total_process_size

    def get_virtual_size(self) -> int:
        """Scrapy's own memory usage plus the Playwright processes'."""
        return super().get_virtual_size() + self._get_total_playwright_process_memory()
+89
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
from asyncio.subprocess import Process as AsyncioProcess
2+
from unittest import IsolatedAsyncioTestCase
3+
from unittest.mock import MagicMock, patch
4+
5+
import pytest
6+
from playwright.async_api import PlaywrightContextManager
7+
from scrapy.exceptions import NotConfigured
8+
from scrapy.extensions.memusage import MemoryUsage
9+
10+
from scrapy_playwright.memusage import ScrapyPlaywrightMemoryUsageExtension
11+
from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler
12+
13+
14+
SCHEMA_PID_MAP = {"http": 123, "https": 456}
15+
16+
17+
def mock_crawler_with_handlers() -> dict:
    """Build a mock crawler whose downloader holds one fake
    ScrapyPlaywrightDownloadHandler per schema, each exposing a fake
    Playwright node process with a known PID from SCHEMA_PID_MAP.
    """
    handler_map = {}
    for schema, pid in SCHEMA_PID_MAP.items():
        fake_proc = MagicMock()
        fake_proc.pid = pid
        fake_handler = MagicMock(spec=ScrapyPlaywrightDownloadHandler)
        fake_handler.playwright_context_manager._connection._transport._proc = fake_proc
        handler_map[schema] = fake_handler
    crawler = MagicMock()
    crawler.engine.downloader.handlers._handlers = handler_map
    return crawler
27+
28+
29+
def raise_import_error(*args, **kwargs):
    """Unconditionally raise ImportError (used as a mock side_effect);
    accepts and ignores any arguments.
    """
    raise ImportError
31+
32+
33+
class MockMemoryInfo:
    """Stand-in for the object returned by psutil's ``memory_info()``;
    only the ``rss`` attribute is ever read by the code under test.
    """

    rss = 999
35+
36+
37+
@patch("scrapy.extensions.memusage.MailSender")
class TestMemoryUsageExtension(IsolatedAsyncioTestCase):
    """Unit tests for ScrapyPlaywrightMemoryUsageExtension.

    MailSender is patched class-wide so MemoryUsage.__init__ never tries
    to build a real mailer; each test receives the patch as _MailSender.
    """

    async def test_process_availability(self, _MailSender):
        """The main node process should be accessible from the context manager"""
        context_manager = PlaywrightContextManager()
        await context_manager.start()
        node_proc = context_manager._connection._transport._proc
        assert isinstance(node_proc, AsyncioProcess)
        await context_manager.__aexit__()

    @patch("scrapy_playwright.memusage.import_module", side_effect=raise_import_error)
    async def test_psutil_not_available_extension_disabled(self, _import_module, _MailSender):
        """Without psutil the extension must refuse to initialize."""
        crawler = MagicMock()
        with pytest.raises(NotConfigured):
            ScrapyPlaywrightMemoryUsageExtension(crawler)

    async def test_get_process_ids_ok(self, _MailSender):
        """PIDs are collected from every Playwright download handler."""
        extension = ScrapyPlaywrightMemoryUsageExtension(mock_crawler_with_handlers())
        assert extension._get_main_process_ids() == list(SCHEMA_PID_MAP.values())

    async def test_get_process_ids_error(self, _MailSender):
        """Any failure while inspecting handlers yields an empty PID list."""
        crawler = mock_crawler_with_handlers()
        broken_handlers = MagicMock()
        broken_handlers.values.side_effect = raise_import_error
        crawler.engine.downloader.handlers._handlers = broken_handlers
        extension = ScrapyPlaywrightMemoryUsageExtension(crawler)
        assert extension._get_main_process_ids() == []

    async def test_get_descendant_processes(self, _MailSender):
        """The whole process tree below a process is flattened, depth-first."""
        grandchild_a = MagicMock()
        grandchild_b = MagicMock()
        child = MagicMock()
        child.children.return_value = [grandchild_a, grandchild_b]
        root = MagicMock()
        root.children.return_value = [child]
        extension = ScrapyPlaywrightMemoryUsageExtension(MagicMock())
        expected = [child, grandchild_a, grandchild_b]
        assert extension._get_descendant_processes(root) == expected

    async def test_get_total_process_size(self, _MailSender):
        """Memory is summed over every PID reported by the handlers."""
        extension = ScrapyPlaywrightMemoryUsageExtension(MagicMock())
        extension.psutil = MagicMock()
        extension.psutil.Process.return_value.memory_info.return_value = MockMemoryInfo()
        extension._get_main_process_ids = MagicMock(return_value=[1, 2, 3])
        expected_size = MockMemoryInfo().rss * len(extension._get_main_process_ids())
        assert extension._get_total_playwright_process_memory() == expected_size

    async def test_get_virtual_size_sum(self, _MailSender):
        """get_virtual_size adds the Playwright memory to Scrapy's own figure."""
        crawler = MagicMock()
        extension = ScrapyPlaywrightMemoryUsageExtension(crawler)
        extension._get_total_playwright_process_memory = MagicMock(return_value=123)
        baseline = MemoryUsage(crawler).get_virtual_size()
        assert extension.get_virtual_size() == baseline + 123

tox.ini

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ deps =
66
pytest==7.4.0
77
pytest_cov==4.1.0
88
pytest_twisted==1.14
9+
psutil==5.9.7
910
commands =
1011
playwright install
1112
py.test -vv --reactor=asyncio \

0 commit comments

Comments
 (0)