355
355
"function" : [r"\b(([\w\.]+\([^)]*\)))" ],
356
356
"gitcommit" : [r"\b(([0-9a-fA-F]{7,40}))\b" ],
357
357
"hexcolor" : [r"((#[0-9A-Fa-f]{6}))\b" ],
358
- "hexnumber " : [r"0x[0-9a-fA-F]+" ],
358
+ "hexnum " : [r"0x[0-9a-fA-F]+" ],
359
359
"ipv4" : [
360
360
rf"(?:(?<=^)|(?<=[^0-9.]))" # Left boundary.
361
361
rf"(?:" # Non-capturing group for the IPv4 address.
402
402
r"\b(([0-9A-Fa-f]{4}\.){2}([0-9A-Fa-f]{4}))\b" ,
403
403
],
404
404
"md5" : [r"\b(([a-fA-F0-9]{32}))\b" ],
405
- "number " : [r"[+-]?(?:\d*\.?\d+|\d+\.?\d*)(?:[eE][+-]?\d+)?" ],
405
+ "num " : [r"[+-]?(?:\d*\.?\d+|\d+\.?\d*)(?:[eE][+-]?\d+)?" ],
406
406
"oauth" : [r"\b((ya29\.[0-9A-Za-z_-]+))\b" ],
407
407
"path" : [r"((^|(?<=[^./\w-]))(/[.\w-]+)+/?)" ],
408
408
"sha1" : [r"\b(([a-fA-F0-9]{40}))\b" ],
@@ -843,7 +843,7 @@ def extraction_function(s):
843
843
"fqdn" : "fully qualified domain name (FQDN)" ,
844
844
"function" : "function calls" ,
845
845
"gitcommit" : "git commit hash" ,
846
- "hexnumber " : "hex number with 0x prefix" ,
846
+ "hexnum " : "hex number with 0x prefix" ,
847
847
"hexcolor" : "hex color code" ,
848
848
"ipv4" : "IPv4 address" ,
849
849
"ipv4_port" : "IPv4 address:port" ,
@@ -853,7 +853,7 @@ def extraction_function(s):
853
853
"jwt" : "JSON Web Token (JWT)" ,
854
854
"mac" : "MAC address" ,
855
855
"md5" : "MD5 hash" ,
856
- "number " : "number (integer or float)" ,
856
+ "num " : "number (integer or float)" ,
857
857
"path" : "Unix file path" ,
858
858
"oauth" : "OAuth token" ,
859
859
"sha1" : "SHA-1 hash" ,
@@ -5255,8 +5255,8 @@ def get_patterns_for_level(level: str) -> Dict[str, str]:
5255
5255
"version" , # Version strings starting with v/V
5256
5256
]
5257
5257
maximum = default + [
5258
- "hexnumber " , # place this before "number", because it's more specific
5259
- "number" ,
5258
+ "hexnum " , # place this before "num", because it's more specific
5259
+ "num" , # place this after the default patterns so it does not mess with <ipv4>
5260
5260
]
5261
5261
patterns = {
5262
5262
"min" : minimum ,
@@ -5276,11 +5276,22 @@ def normalize_patterns(text: str, level: str, patterns: List[str]) -> str:
5276
5276
5277
5277
patterns += get_patterns_for_level (level )
5278
5278
5279
- for pattern_name in patterns :
5279
+ temp_replacements = {}
5280
+
5281
+ # First pass: replace with temporary placeholders using unique markers
5282
+ # so that parts of placeholders are not replaced by other placeholders
5283
+ for i , pattern_name in enumerate (patterns ):
5280
5284
placeholder = f"<{ pattern_name } >"
5281
5285
if pattern_name in BUILTIN_REGEXES :
5286
+ # Create a unique temporary marker
5287
+ temp = chr (0xE000 + i ) # Start at U+E000
5288
+ temp_replacements [temp ] = placeholder
5282
5289
for regex in BUILTIN_REGEXES [pattern_name ]:
5283
- normalized = re .sub (regex , placeholder , normalized )
5290
+ normalized = re .sub (regex , temp , normalized )
5291
+
5292
+ # Second pass: replace temporary placeholders with final ones
5293
+ for temp , final in temp_replacements .items ():
5294
+ normalized = normalized .replace (temp , final )
5284
5295
5285
5296
return normalized
5286
5297
0 commit comments