Skip to content

Commit 7c11f79

Browse files
committed
regex-based POC
Uses ua-parser/uap-rust#3 Fixes ua-parser#166
1 parent 4d988a0 commit 7c11f79

File tree

5 files changed

+107
-7
lines changed

5 files changed

+107
-7
lines changed

pyproject.toml

+6-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ version = "1.0.0a1"
99
readme = "README.rst"
1010
requires-python = ">=3.8"
1111
dependencies = []
12-
optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] }
1312

1413
license = {text = "Apache 2.0"}
1514
urls = {repository = "https://github.com/ua-parser/uap-python"}
@@ -39,10 +38,16 @@ classifiers = [
3938
"Programming Language :: Python :: 3.9",
4039
"Programming Language :: Python :: 3.10",
4140
"Programming Language :: Python :: 3.11",
41+
"Programming Language :: Python :: 3.12",
4242
"Programming Language :: Python :: Implementation :: CPython",
4343
"Programming Language :: Python :: Implementation :: PyPy"
4444
]
4545

46+
[project.optional-dependencies]
47+
yaml = ["PyYaml"]
48+
re2 = ["google-re2"]
49+
regex = ["ua-parser-rs"]
50+
4651
[tool.ruff.lint]
4752
select = ["F", "E", "W", "I", "RET", "RUF", "PT"]
4853
ignore = [

src/ua_parser/__main__.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,13 @@
3939
from .caching import Cache, Local
4040
from .loaders import load_builtins, load_yaml
4141
from .re2 import Resolver as Re2Resolver
42+
from .regex import Resolver as RegexResolver
4243
from .user_agent_parser import Parse
4344

4445
CACHEABLE = {
4546
"basic": True,
4647
"re2": True,
48+
"regex": True,
4749
"legacy": False,
4850
}
4951

@@ -178,6 +180,8 @@ def get_parser(
178180
r = BasicResolver(rules)
179181
elif parser == "re2":
180182
r = Re2Resolver(rules)
183+
elif parser == "regex":
184+
r = RegexResolver(rules)
181185
else:
182186
sys.exit(f"unknown parser {parser!r}")
183187

@@ -327,6 +331,7 @@ def run_threaded(args: argparse.Namespace) -> None:
327331
("locking-lru", CachingResolver(basic, caching.Lru(CACHESIZE))),
328332
("local-lru", CachingResolver(basic, Local(lambda: caching.Lru(CACHESIZE)))),
329333
("re2", Re2Resolver(load_builtins())),
334+
("regex", RegexResolver(load_builtins())),
330335
]
331336
for name, resolver in resolvers:
332337
print(f"{name:11}: ", end="", flush=True)
@@ -436,14 +441,14 @@ def __call__(
436441
bench.add_argument(
437442
"--bases",
438443
nargs="+",
439-
choices=["basic", "re2", "legacy"],
440-
default=["basic", "re2", "legacy"],
444+
choices=["basic", "re2", "regex", "legacy"],
445+
default=["basic", "re2", "regex", "legacy"],
441446
help="""Base resolvers to benchmark. `basic` is a linear search
442447
through the regexes file, `re2` is a prefiltered regex set
443-
implemented in C++, `legacy` is the legacy API (essentially a
444-
basic resolver with a clearing cache of fixed 200 entries, but
445-
less layered so usually slightly faster than an equivalent
446-
basic-based resolver).""",
448+
implemented in C++, `regex` is a prefiltered regex set implemented
449+
in Rust, `legacy` is the legacy API (essentially a basic resolver
450+
with a clearing cache of fixed 200 entries, but less layered so
451+
usually slightly faster than an equivalent basic-based resolver).""",
447452
)
448453
bench.add_argument(
449454
"--caches",

src/ua_parser/regex.py

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
__all__ = ["Resolver"]
2+
3+
from operator import attrgetter
4+
5+
import ua_parser_rs # type: ignore
6+
7+
from .core import (
8+
Device,
9+
Domain,
10+
Matchers,
11+
OS,
12+
PartialResult,
13+
UserAgent,
14+
)
15+
16+
17+
class Resolver:
    """Resolver that delegates matching to the ``ua_parser_rs`` extractors.

    One extractor is built per domain (user agent, OS, device) from the
    matchers given to the constructor. Calling the resolver runs only the
    extractors for the requested domains and wraps what they return in the
    package's own result types.
    """

    ua: ua_parser_rs.UserAgentExtractor
    os: ua_parser_rs.OSExtractor
    de: ua_parser_rs.DeviceExtractor

    def __init__(self, matchers: Matchers) -> None:
        """Build the three extractors from a ``(ua, os, device)`` triple.

        Each matcher is reduced to a tuple of its attributes, which is the
        form handed to the ``ua_parser_rs`` constructors.
        """
        ua_matchers, os_matchers, device_matchers = matchers

        # User-agent and OS matchers are flattened with the same field list.
        versioned_fields = attrgetter(
            "regex", "family", "major", "minor", "patch", "patch_minor"
        )
        device_fields = attrgetter("regex", "regex_flag", "family", "brand", "model")

        self.ua = ua_parser_rs.UserAgentExtractor(map(versioned_fields, ua_matchers))
        self.os = ua_parser_rs.OSExtractor(map(versioned_fields, os_matchers))
        self.de = ua_parser_rs.DeviceExtractor(map(device_fields, device_matchers))

    def __call__(self, ua: str, domains: Domain, /) -> PartialResult:
        """Resolve ``ua`` for every domain present in ``domains``.

        A domain that was not requested, or for which the extractor
        returned a falsy result, is left as ``None`` in the returned
        ``PartialResult``.
        """
        user_agent = os = device = None

        if Domain.USER_AGENT in domains and (found := self.ua.extract(ua)):
            user_agent = UserAgent(
                found.family,
                found.major,
                found.minor,
                found.patch,
                found.patch_minor,
            )

        if Domain.OS in domains and (found := self.os.extract(ua)):
            # The OS extractor's result is read through its ``os`` attribute
            # rather than ``family``.
            os = OS(
                found.os,
                found.major,
                found.minor,
                found.patch,
                found.patch_minor,
            )

        if Domain.DEVICE in domains and (found := self.de.extract(ua)):
            device = Device(found.family, found.brand, found.model)

        return PartialResult(
            domains=domains,
            string=ua,
            user_agent=user_agent,
            os=os,
            device=device,
        )

tests/test_core.py

+13
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,19 @@
5353
else:
5454
PARSERS.append(pytest.param(Parser(re2.Resolver(load_builtins())), id="re2"))
5555

56+
# Register the ``regex`` resolver as a test parser. When the module cannot be
# imported, a skipped placeholder is registered under the same id instead.
try:
    from ua_parser import regex
except ImportError:
    PARSERS.append(
        pytest.param(
            None,
            marks=pytest.mark.skip(reason="regex parser not available"),
            id="regex",
        )
    )
else:
    PARSERS.append(
        pytest.param(
            Parser(regex.Resolver(load_builtins())),
            id="regex",
        )
    )
68+
5669
UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)}
5770

5871

tox.ini

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ deps =
2020
pytest
2121
pyyaml
2222
google-re2
23+
ua-parser-rs
2324
commands =
2425
pytest -Werror --doctest-glob="*.rst" {posargs}
2526

0 commit comments

Comments
 (0)