Skip to content

Commit 9425507

Browse files
committed
Add "duration" pattern
Useful for normalization
1 parent 16dea18 commit 9425507

File tree

2 files changed

+131
-0
lines changed

2 files changed

+131
-0
lines changed

klp.py

+10
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,16 @@
314314
# Regex fragment for one decimal number in the range 0-255 (250-255, 200-249,
# 100-199, then 0-99); presumably an IPv4 octet, judging by the name.
_octet = r"(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)"
315315

316316
BUILTIN_REGEXES = {
317+
"duration": [
318+
# Basic and decimal with units (common short forms)
319+
r"(?<![A-Za-z0-9.])(?:\d+(?:\.\d+)?)(?:us|ms|[smh])(?![A-Za-z0-9.])",
320+
# Written out units, including longer time spans
321+
r"(?<![A-Za-z0-9.])(?:\d+(?:\.\d+)?)(?:\s*(?:microsecond|millisecond|second|minute|hour|day|week|month|year)s?)(?![A-Za-z0-9.])",
322+
# Microseconds with μ
323+
r"(?<![A-Za-z0-9.])(?:\d+(?:\.\d+)?)(?:μs|µs)(?![A-Za-z0-9.])",
324+
# Combined units (2 or 3 parts) like 1h30m or 1h30m15s
325+
r"(?<![A-Za-z0-9.])(?:\d+h\d+m\d+s|\d+h\d+m|\d+h\d+s|\d+m\d+s)(?![A-Za-z0-9.])",
326+
],
317327
# https://www.regular-expressions.info/email.html
318328
"email": [r"\b(([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b))"],
319329
"err": [

tests/test_duration_regex.py

+121
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
import re
2+
3+
# Duration patterns with stricter boundaries.
#
# These mirror the "duration" entry of BUILTIN_REGEXES in klp.py; keep both
# copies in sync when either one changes.
#
# Every pattern is wrapped in the same two guards:
#   (?<![A-Za-z0-9.])  the number must not continue a word, a number or a
#                      dotted token such as "1.5.0s"
#   (?![A-Za-z0-9.])   the unit must not be followed by a letter, digit or "."
# The trailing guard also means that a duration directly followed by a period
# (for example at the end of a sentence) is not matched.
DURATION_PATTERNS = [
    # Basic and decimal with units (common short forms)
    r"(?<![A-Za-z0-9.])(?:\d+(?:\.\d+)?)(?:us|ms|[smh])(?![A-Za-z0-9.])",
    # Written out units, including longer time spans
    r"(?<![A-Za-z0-9.])(?:\d+(?:\.\d+)?)(?:\s*(?:microsecond|millisecond|second|minute|hour|day|week|month|year)s?)(?![A-Za-z0-9.])",
    # Microseconds with μ
    r"(?<![A-Za-z0-9.])(?:\d+(?:\.\d+)?)(?:μs|µs)(?![A-Za-z0-9.])",
    # Combined units (2 or 3 parts) like 1h30m or 1h30m15s
    r"(?<![A-Za-z0-9.])(?:\d+h\d+m\d+s|\d+h\d+m|\d+h\d+s|\d+m\d+s)(?![A-Za-z0-9.])",
]

# Compiled once at import time so is_duration() does not have to resolve
# every pattern string through the re module on every call.
_COMPILED_DURATION_PATTERNS = tuple(re.compile(p) for p in DURATION_PATTERNS)


def is_duration(text: str) -> bool:
    """Return True if *text* contains at least one duration token.

    The text is searched rather than full-matched, so a duration embedded in
    a longer string counts as long as the boundary guards of the pattern are
    satisfied.
    """
    return any(rx.search(text) for rx in _COMPILED_DURATION_PATTERNS)
19+
20+
21+
# Tests (unchanged):
22+
def test_basic_units():
    """Whole numbers with the short units ms, s, m and h count as durations."""
    accepted = ("500ms", "30s", "5m", "2h")
    rejected = (
        "3d",  # Should not match
        "4w",  # Should not match
    )
    for sample in accepted:
        assert is_duration(sample)
    for sample in rejected:
        assert not is_duration(sample)
30+
31+
32+
def test_decimal_values():
    """Decimal numbers need a unit to count as a duration."""
    accepted = ("1.5s", "0.5h", "2.75m", "0.001ms")
    rejected = (
        ".5",  # No unit
        "1.5",  # No unit
    )
    for sample in accepted:
        assert is_duration(sample)
    for sample in rejected:
        assert not is_duration(sample)
40+
41+
42+
def test_microseconds():
    """Microseconds are accepted as "us" and with either mu character."""
    accepted = ("500us", "500μs", "500µs", "0.5us")
    rejected = ("us", "500u")
    for sample in accepted:
        assert is_duration(sample)
    for sample in rejected:
        assert not is_duration(sample)
50+
51+
52+
def test_written_units():
    """Spelled-out units, singular and plural, after a number and a space."""
    accepted = (
        "1 minute",
        "2 minutes",
        "1 hour",
        "2 hours",
        "1 day",
        "2 days",
        "1 week",
        "2 weeks",
        "1 month",
        "2 months",
        "1 year",
        "2 years",
        "1.5 hours",
    )
    # A unit word without a leading number is not a duration.
    rejected = ("minute", "hours")
    for sample in accepted:
        assert is_duration(sample)
    for sample in rejected:
        assert not is_duration(sample)
69+
70+
71+
def test_written_units_no_space():
    """Spelled-out units may follow the number without a separating space."""
    accepted = ("1minute", "2minutes", "1hour", "2hours", "1.5hours")
    for sample in accepted:
        assert is_duration(sample)
78+
79+
80+
def test_combined_units():
    """Compact h/m/s combinations must be complete and in descending order."""
    accepted = (
        "1h30m",
        "1m30s",
        "1h30m15s",
        "1h15s",  # Hour + seconds
    )
    rejected = (
        "1h30",  # Incomplete
        "1d30m",  # No 'd' in combinations
        "30m1h",  # Wrong order
        "1h30m5",  # Missing final unit
    )
    for sample in accepted:
        assert is_duration(sample)
    for sample in rejected:
        assert not is_duration(sample)
90+
91+
92+
def test_context():
    """Durations are found when embedded in a longer piece of text."""
    sentences = (
        "Completed in 500ms",
        "Took 1.5 hours to complete",
        "Duration: 2h30m",
    )
    for sentence in sentences:
        assert is_duration(sentence)
97+
98+
99+
def test_edge_cases():
    """Inputs that look similar to durations but must not be reported."""
    rejected = (
        "",
        "ms",
        "hour",
        "1",
        "1.5",
        "500",
        "1.5.0s",  # Invalid number format
        "1hrs",  # Invalid unit combination
        "millis",  # Not a valid unit
        "micros",  # Not a valid unit
        "1.h",  # Invalid number format
        "1..5h",  # Invalid number format
    )
    for sample in rejected:
        assert not is_duration(sample)
113+
114+
115+
def test_invalid_combinations():
    """Malformed mixes of units and numbers are rejected."""
    rejected = (
        "1h2d",  # Invalid combination
        "1m2months",  # Mixed short/long
        "1hour30",  # Incomplete combination
        "hour1",  # Wrong order
        "1 h 30 m",  # Spaces in combination
    )
    for sample in rejected:
        assert not is_duration(sample)

0 commit comments

Comments
 (0)