Skip to content

Commit 686662f

Browse files
committed
regex-based POC
Uses ua-parser/uap-rust#3 Fixes ua-parser#166
1 parent 022ab80 commit 686662f

File tree

9 files changed

+160
-53
lines changed

9 files changed

+160
-53
lines changed

.github/workflows/ci.yml

+9-21
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,8 @@ name: CI
22

33
on:
44
push:
5-
branches: [ '*' ]
65
pull_request:
7-
branches: [ '*' ]
86
workflow_dispatch:
9-
schedule:
10-
# cron is kinda random, assumes 22:00 UTC is a low ebb, eastern
11-
# countries are very early morning, and US are mid-day to
12-
# mid-afternoon
13-
- cron: '0 22 * * 2'
147

158
jobs:
169
checks:
@@ -88,19 +81,14 @@ jobs:
8881
- sdist
8982
- source
9083
python-version:
91-
- "3.8"
9284
- "3.9"
9385
- "3.10"
9486
- "3.11"
9587
- "3.12"
9688
- "3.13"
97-
- "pypy-3.8"
98-
- "pypy-3.9"
9989
- "pypy-3.10"
10090
# - "pypy-3.11"
101-
# don't enable graal because it's slower than even pypy and
102-
# fails because oracle/graalpython#385
103-
# - "graalpy-23"
91+
- "graalpy-24"
10492
include:
10593
- source: sdist
10694
artifact: dist/*.tar.gz
@@ -119,17 +107,17 @@ jobs:
119107
- name: Install test dependencies
120108
run: |
121109
python -mpip install --upgrade pip
122-
# cyaml is outright broken on pypy
123-
if ! ${{ startsWith(matrix.python-version, 'pypy-') }}; then
124-
# if binary wheels are not available for the current
125-
# package install libyaml-dev so we can install pyyaml
126-
# from source
127-
if ! pip download --only-binary pyyaml -rrequirements_dev.txt > /dev/null 2>&1; then
128-
sudo apt install libyaml-dev
129-
fi
110+
# if binary wheels are not available for the current
111+
# package install libyaml-dev so we can install pyyaml
112+
# from source
113+
if ! pip download --only-binary :all: pyyaml > /dev/null 2>&1; then
114+
sudo apt install libyaml-dev
130115
fi
131116
python -mpip install pytest pyyaml
132117
118+
# install rs accelerator if available, ignore if not
119+
python -mpip install ua-parser-rs || true
120+
133121
# re2 is basically impossible to install from source so don't
134122
# bother, and suppress installation failure so the test does
135123
# not fail (re2 tests will just be skipped for versions /

doc/conf.py

+2
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@
1919
rst_epilog = """
2020
.. |pyyaml| replace:: ``PyYaml``
2121
.. |re2| replace:: ``google-re2``
22+
.. |regex| replace:: ``regex``
2223
2324
.. _pyyaml: https://pyyaml.org
2425
.. _re2: https://pypi.org/project/google-re2
26+
.. _regex: https://pypi.org/project/ua-parser-rs
2527
"""
2628

2729
# -- General configuration ---------------------------------------------------

doc/installation.rst

+11-7
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,14 @@ Installation
55
Python Version
66
==============
77

8-
ua-parser currently supports Python 3.8 and newer, as well as recent
9-
versions of PyPy supporting the same standards.
8+
ua-parser currently supports CPython 3.9 and newer, recent PyPy
9+
(supporting 3.10), and Graal 24.
1010

11-
.. note:: While PyPy is supported, it is not *fast*, and google-re2 is
12-
not supported on it.
11+
.. note::
12+
13+
While PyPy and GraalPy are supported, they are rather slow when using
14+
pure python mode and ``[re2]`` is not supported, so using the
15+
``[regex]`` feature is very strongly recommended.
1316

1417
Installation
1518
============
@@ -21,13 +24,14 @@ Installation
2124
Optional Dependencies
2225
=====================
2326

24-
ua-parser currently has two optional dependencies, |re2|_ and
25-
|pyyaml|_. These dependencies will be detected and used automatically
27+
ua-parser currently has three optional dependencies, |regex|_, |re2|_ and
28+
|pyyaml|_. These dependencies will be detected and used automatically
2629
if installed, but can also be installed via and alongside ua-parser:
2730

2831
.. code-block:: sh
2932
33+
$ pip install 'ua-parser[regex]'
3034
$ pip install 'ua-parser[re2]'
3135
$ pip install 'ua-parser[yaml]'
32-
$ pip install 'ua-parser[re2,yaml]'
36+
$ pip install 'ua-parser[regex,yaml]'
3337

pyproject.toml

+10-5
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,8 @@ name = "ua-parser"
77
description = "Python port of Browserscope's user agent parser"
88
version = "1.0.0a1"
99
readme = "README.rst"
10-
requires-python = ">=3.8"
10+
requires-python = ">=3.9"
1111
dependencies = []
12-
optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] }
1312

1413
license = {text = "Apache 2.0"}
1514
urls = {repository = "https://github.com/ua-parser/uap-python"}
@@ -35,14 +34,20 @@ classifiers = [
3534
"Topic :: Internet :: WWW/HTTP",
3635
"Topic :: Software Development :: Libraries :: Python Modules",
3736
"Programming Language :: Python",
38-
"Programming Language :: Python :: 3.8",
3937
"Programming Language :: Python :: 3.9",
4038
"Programming Language :: Python :: 3.10",
4139
"Programming Language :: Python :: 3.11",
40+
"Programming Language :: Python :: 3.12",
4241
"Programming Language :: Python :: Implementation :: CPython",
43-
"Programming Language :: Python :: Implementation :: PyPy"
42+
"Programming Language :: Python :: Implementation :: PyPy",
43+
"Programming Language :: Python :: Implementation :: GraalPy",
4444
]
4545

46+
[project.optional-dependencies]
47+
yaml = ["PyYaml"]
48+
re2 = ["google-re2"]
49+
regex = ["ua-parser-rs"]
50+
4651
[tool.setuptools.packages.find]
4752
where = ["src"]
4853

@@ -63,7 +68,7 @@ known-first-party = ["ua_parser"]
6368
combine-as-imports = true
6469

6570
[tool.mypy]
66-
python_version = "3.8"
71+
python_version = "3.9"
6772
files = "src,tests"
6873

6974
# can't use strict because it's only global

setup.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -67,16 +67,20 @@ def run(self) -> None:
6767
dest_lazy = outdir / "_lazy.py"
6868
dest_legacy = outdir / "_regexes.py"
6969

70-
with dest.open("wb") as eager, dest_lazy.open("wb") as lazy, dest_legacy.open(
71-
"wb"
72-
) as legacy:
70+
with (
71+
dest.open("wb") as eager,
72+
dest_lazy.open("wb") as lazy,
73+
dest_legacy.open("wb") as legacy,
74+
):
7375
eager = EagerWriter(eager)
7476
lazy = LazyWriter(lazy)
7577
legacy = LegacyWriter(legacy)
7678

7779
for section in ["user_agent_parsers", "os_parsers", "device_parsers"]:
78-
with eager.section(section), lazy.section(section), legacy.section(
79-
section
80+
with (
81+
eager.section(section),
82+
lazy.section(section),
83+
legacy.section(section),
8084
):
8185
extract = EXTRACTORS[section]
8286
for p in regexes[section]:

src/ua_parser/__main__.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,13 @@
3939
from .caching import Cache, Local
4040
from .loaders import load_builtins, load_yaml
4141
from .re2 import Resolver as Re2Resolver
42+
from .regex import Resolver as RegexResolver
4243
from .user_agent_parser import Parse
4344

4445
CACHEABLE = {
4546
"basic": True,
4647
"re2": True,
48+
"regex": True,
4749
"legacy": False,
4850
}
4951

@@ -178,6 +180,8 @@ def get_parser(
178180
r = BasicResolver(rules)
179181
elif parser == "re2":
180182
r = Re2Resolver(rules)
183+
elif parser == "regex":
184+
r = RegexResolver(rules)
181185
else:
182186
sys.exit(f"unknown parser {parser!r}")
183187

@@ -327,6 +331,7 @@ def run_threaded(args: argparse.Namespace) -> None:
327331
("locking-lru", CachingResolver(basic, caching.Lru(CACHESIZE))),
328332
("local-lru", CachingResolver(basic, Local(lambda: caching.Lru(CACHESIZE)))),
329333
("re2", Re2Resolver(load_builtins())),
334+
("regex", RegexResolver(load_builtins())),
330335
]
331336
for name, resolver in resolvers:
332337
print(f"{name:11}: ", end="", flush=True)
@@ -436,14 +441,14 @@ def __call__(
436441
bench.add_argument(
437442
"--bases",
438443
nargs="+",
439-
choices=["basic", "re2", "legacy"],
440-
default=["basic", "re2", "legacy"],
444+
choices=["basic", "re2", "regex", "legacy"],
445+
default=["basic", "re2", "regex", "legacy"],
441446
help="""Base resolvers to benchmark. `basic` is a linear search
442447
through the regexes file, `re2` is a prefiltered regex set
443-
implemented in C++, `legacy` is the legacy API (essentially a
444-
basic resolver with a clearing cache of fixed 200 entries, but
445-
less layered so usually slightly faster than an equivalent
446-
basic-based resolver).""",
448+
implemented in C++, `regex` is a prefiltered regex set implemented
449+
in Rust, `legacy` is the legacy API (essentially a basic resolver
450+
with a clearing cache of fixed 200 entries, but less layered so
451+
usually slightly faster than an equivalent basic-based resolver).""",
447452
)
448453
bench.add_argument(
449454
"--caches",

src/ua_parser/regex.py

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
__all__ = ["Resolver"]
2+
3+
from operator import attrgetter
4+
5+
import ua_parser_rs # type: ignore
6+
7+
from .core import (
8+
Device,
9+
Domain,
10+
Matchers,
11+
OS,
12+
PartialResult,
13+
UserAgent,
14+
)
15+
16+
17+
class Resolver:
18+
ua: ua_parser_rs.UserAgentExtractor
19+
os: ua_parser_rs.OSExtractor
20+
de: ua_parser_rs.DeviceExtractor
21+
22+
def __init__(self, matchers: Matchers) -> None:
23+
ua, os, de = matchers
24+
self.ua = ua_parser_rs.UserAgentExtractor(
25+
map(
26+
attrgetter("regex", "family", "major", "minor", "patch", "patch_minor"),
27+
ua,
28+
)
29+
)
30+
self.os = ua_parser_rs.OSExtractor(
31+
map(
32+
attrgetter("regex", "family", "major", "minor", "patch", "patch_minor"),
33+
os,
34+
)
35+
)
36+
self.de = ua_parser_rs.DeviceExtractor(
37+
map(
38+
attrgetter("regex", "regex_flag", "family", "brand", "model"),
39+
de,
40+
)
41+
)
42+
43+
def __call__(self, ua: str, domains: Domain, /) -> PartialResult:
44+
user_agent = os = device = None
45+
if Domain.USER_AGENT in domains:
46+
if m := self.ua.extract(ua):
47+
user_agent = UserAgent(
48+
m.family,
49+
m.major,
50+
m.minor,
51+
m.patch,
52+
m.patch_minor,
53+
)
54+
if Domain.OS in domains:
55+
if m := self.os.extract(ua):
56+
os = OS(
57+
m.family,
58+
m.major,
59+
m.minor,
60+
m.patch,
61+
m.patch_minor,
62+
)
63+
if Domain.DEVICE in domains:
64+
if m := self.de.extract(ua):
65+
device = Device(
66+
m.family,
67+
m.brand,
68+
m.model,
69+
)
70+
return PartialResult(
71+
domains=domains,
72+
string=ua,
73+
user_agent=user_agent,
74+
os=os,
75+
device=device,
76+
)

tests/test_core.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,19 @@
5353
else:
5454
PARSERS.append(pytest.param(Parser(re2.Resolver(load_builtins())), id="re2"))
5555

56+
try:
57+
from ua_parser import regex
58+
except ImportError:
59+
PARSERS.append(
60+
pytest.param(
61+
None,
62+
id="regex",
63+
marks=pytest.mark.skip(reason="regex parser not available"),
64+
)
65+
)
66+
else:
67+
PARSERS.append(pytest.param(Parser(regex.Resolver(load_builtins())), id="regex"))
68+
5669
UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)}
5770

5871

@@ -64,7 +77,7 @@
6477
CORE_DIR / "test_resources" / "firefox_user_agent_strings.yaml",
6578
CORE_DIR / "test_resources" / "pgts_browser_list.yaml",
6679
],
67-
ids=attrgetter("name"),
80+
ids=attrgetter("stem"),
6881
)
6982
def test_ua(parser, test_file):
7083
with test_file.open("rb") as f:
@@ -90,7 +103,7 @@ def test_ua(parser, test_file):
90103
CORE_DIR / "tests" / "test_os.yaml",
91104
CORE_DIR / "test_resources" / "additional_os_tests.yaml",
92105
],
93-
ids=attrgetter("name"),
106+
ids=attrgetter("stem"),
94107
)
95108
def test_os(parser, test_file):
96109
with test_file.open("rb") as f:
@@ -111,7 +124,7 @@ def test_os(parser, test_file):
111124
[
112125
CORE_DIR / "tests" / "test_device.yaml",
113126
],
114-
ids=attrgetter("name"),
127+
ids=attrgetter("stem"),
115128
)
116129
def test_devices(parser, test_file):
117130
with test_file.open("rb") as f:

tox.ini

+16-6
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
[tox]
22
min_version = 4.0
3-
env_list = py3{8,9,10,11,12}
4-
pypy3.{8,9,10}
3+
env_list = py3{9,10,11,12}
4+
pypy3.10
5+
#graalpy-24
56
flake8, black, typecheck
67
labels =
7-
test = py3{8,9,10,11,12},pypy3.{8,9,10}
8-
cpy = py3{8,9,10,11,12}
9-
pypy = pypy3.{8,9,10}
8+
test = py3{9,10,11,12},pypy3.10,graalpy-24
9+
cpy = py3{9,10,11,12}
10+
pypy = pypy3.10
11+
#graal = graalpy-24
1012
check = flake8, black, typecheck
1113

1214
[testenv]
@@ -20,13 +22,21 @@ deps =
2022
pytest
2123
pyyaml
2224
google-re2
25+
ua-parser-rs
2326
commands =
2427
pytest -Werror --doctest-glob="*.rst" {posargs}
2528

26-
[testenv:pypy3.{8,9,10}]
29+
[testenv:pypy3.10]
2730
deps =
2831
pytest
2932
pyyaml
33+
ua-parser-rs
34+
35+
[testenv:graalpy-24]
36+
deps =
37+
pytest
38+
pyyaml
39+
ua-parser-rs
3040

3141
[testenv:flake8]
3242
package = skip

0 commit comments

Comments
 (0)