Skip to content

Commit 16dea18

Browse files
committed
Use <num> and <hexnum>. Fix recursive replacements
* Numbers occur often, so use a short placeholder for them.
* Some placeholders were corrupted by later replacements; substitute in two passes to avoid this.
1 parent c745c76 commit 16dea18

File tree

1 file changed

+19
-8
lines changed

1 file changed

+19
-8
lines changed

klp.py

+19-8
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@
355355
"function": [r"\b(([\w\.]+\([^)]*\)))"],
356356
"gitcommit": [r"\b(([0-9a-fA-F]{7,40}))\b"],
357357
"hexcolor": [r"((#[0-9A-Fa-f]{6}))\b"],
358-
"hexnumber": [r"0x[0-9a-fA-F]+"],
358+
"hexnum": [r"0x[0-9a-fA-F]+"],
359359
"ipv4": [
360360
rf"(?:(?<=^)|(?<=[^0-9.]))" # Left boundary.
361361
rf"(?:" # Non-capturing group for the IPv4 address.
@@ -402,7 +402,7 @@
402402
r"\b(([0-9A-Fa-f]{4}\.){2}([0-9A-Fa-f]{4}))\b",
403403
],
404404
"md5": [r"\b(([a-fA-F0-9]{32}))\b"],
405-
"number": [r"[+-]?(?:\d*\.?\d+|\d+\.?\d*)(?:[eE][+-]?\d+)?"],
405+
"num": [r"[+-]?(?:\d*\.?\d+|\d+\.?\d*)(?:[eE][+-]?\d+)?"],
406406
"oauth": [r"\b((ya29\.[0-9A-Za-z_-]+))\b"],
407407
"path": [r"((^|(?<=[^./\w-]))(/[.\w-]+)+/?)"],
408408
"sha1": [r"\b(([a-fA-F0-9]{40}))\b"],
@@ -843,7 +843,7 @@ def extraction_function(s):
843843
"fqdn": "fully qualified domain name (FQDN)",
844844
"function": "function calls",
845845
"gitcommit": "git commit hash",
846-
"hexnumber": "hex number with 0x prefix",
846+
"hexnum": "hex number with 0x prefix",
847847
"hexcolor": "hex color code",
848848
"ipv4": "IPv4 address",
849849
"ipv4_port": "IPv4 address:port",
@@ -853,7 +853,7 @@ def extraction_function(s):
853853
"jwt": "JSON Web Token (JWT)",
854854
"mac": "MAC address",
855855
"md5": "MD5 hash",
856-
"number": "number (integer or float)",
856+
"num": "number (integer or float)",
857857
"path": "Unix file path",
858858
"oauth": "OAuth token",
859859
"sha1": "SHA-1 hash",
@@ -5255,8 +5255,8 @@ def get_patterns_for_level(level: str) -> Dict[str, str]:
52555255
"version", # Version strings starting with v/V
52565256
]
52575257
maximum = default + [
5258-
"hexnumber", # place this before "number", because it's more specific
5259-
"number",
5258+
"hexnum", # place this before "num", because it's more specific
5259+
"num", # keep this after the default patterns so it does not mess with <ipv4>
52605260
]
52615261
patterns = {
52625262
"min": minimum,
@@ -5276,11 +5276,22 @@ def normalize_patterns(text: str, level: str, patterns: List[str]) -> str:
52765276

52775277
patterns += get_patterns_for_level(level)
52785278

5279-
for pattern_name in patterns:
5279+
temp_replacements = {}
5280+
5281+
# First pass: replace with temporary placeholders using unique markers
5282+
# so that parts of placeholders are not replaced by other placeholders
5283+
for i, pattern_name in enumerate(patterns):
52805284
placeholder = f"<{pattern_name}>"
52815285
if pattern_name in BUILTIN_REGEXES:
5286+
# Create a unique temporary marker
5287+
temp = chr(0xE000 + i) # Start at U+E000
5288+
temp_replacements[temp] = placeholder
52825289
for regex in BUILTIN_REGEXES[pattern_name]:
5283-
normalized = re.sub(regex, placeholder, normalized)
5290+
normalized = re.sub(regex, temp, normalized)
5291+
5292+
# Second pass: replace temporary placeholders with final ones
5293+
for temp, final in temp_replacements.items():
5294+
normalized = normalized.replace(temp, final)
52845295

52855296
return normalized
52865297

0 commit comments

Comments
 (0)