355
355
"function" : [r"\b(([\w\.]+\([^)]*\)))" ],
356
356
"gitcommit" : [r"\b(([0-9a-fA-F]{7,40}))\b" ],
357
357
"hexcolor" : [r"((#[0-9A-Fa-f]{6}))\b" ],
358
+ "hexnumber" : [r"0x[0-9a-fA-F]+" ],
358
359
"ipv4" : [
359
360
rf"(?:(?<=^)|(?<=[^0-9.]))" # Left boundary.
360
361
rf"(?:" # Non-capturing group for the IPv4 address.
401
402
r"\b(([0-9A-Fa-f]{4}\.){2}([0-9A-Fa-f]{4}))\b" ,
402
403
],
403
404
"md5" : [r"\b(([a-fA-F0-9]{32}))\b" ],
405
+ "number" : [r"[+-]?(?:\d*\.?\d+|\d+\.?\d*)(?:[eE][+-]?\d+)?" ],
404
406
"oauth" : [r"\b((ya29\.[0-9A-Za-z_-]+))\b" ],
405
407
"path" : [r"((^|(?<=[^./\w-]))(/[.\w-]+)+/?)" ],
406
408
"sha1" : [r"\b(([a-fA-F0-9]{40}))\b" ],
@@ -841,6 +843,7 @@ def extraction_function(s):
841
843
"fqdn" : "fully qualified domain name (FQDN)" ,
842
844
"function" : "function calls" ,
843
845
"gitcommit" : "git commit hash" ,
846
+ "hexnumber" : "hex number with 0x prefix" ,
844
847
"hexcolor" : "hex color code" ,
845
848
"ipv4" : "IPv4 address" ,
846
849
"ipv4_port" : "IPv4 address:port" ,
@@ -850,6 +853,7 @@ def extraction_function(s):
850
853
"jwt" : "JSON Web Token (JWT)" ,
851
854
"mac" : "MAC address" ,
852
855
"md5" : "MD5 hash" ,
856
+ "number" : "number (integer or float)" ,
853
857
"path" : "Unix file path" ,
854
858
"oauth" : "OAuth token" ,
855
859
"sha1" : "SHA-1 hash" ,
@@ -3809,6 +3813,19 @@ def quoting_type(text):
3809
3813
}.get (text , None )
3810
3814
3811
3815
3816
def csv_of_choices(choices):
    """Build an argparse ``type=`` callable for comma-separated choice lists.

    Args:
        choices: Container of allowed item values.

    Returns:
        A function that splits its string argument on commas, validates
        every item against *choices* and returns the items as a list.
        The returned function raises ``argparse.ArgumentTypeError`` on the
        first item that is not an allowed choice.
    """

    def validate(value):
        # Strip surrounding whitespace so that "a, b" is accepted like "a,b".
        items = [item.strip() for item in value.split(",")]
        for item in items:
            if item not in choices:
                raise argparse.ArgumentTypeError(
                    f"Invalid choice: {item}. Allowed: {choices}"
                )
        return items

    return validate
3827
+
3828
+
3812
3829
def csv_type(text):
    """Split a comma-separated string into a list; ``None`` yields ``[]``."""
    if text is None:
        return []
    return text.split(",")
3814
3831
@@ -4458,6 +4475,19 @@ def parse_args():
4458
4475
help = "replace common patterns with placeholders and deduplicate events" ,
4459
4476
)
4460
4477
4478
+ processing .add_argument (
4479
+ "--normalize-level" ,
4480
+ choices = ["min" , "default" , "max" ],
4481
+ help = "level of normalization (min, default, max)" ,
4482
+ )
4483
+ processing .add_argument (
4484
+ "--normalize-patterns" ,
4485
+ metavar = "PATTERNS" ,
4486
+ type = csv_of_choices (sorted (list (BUILTIN_REGEXES ) + ["timestamp" ])),
4487
+ default = [],
4488
+ help = "comma-separated list of builtin regex patterns (or 'timestamp') to use for normalization" ,
4489
+ )
4490
+
4461
4491
output = parser .add_argument_group ("output format options" )
4462
4492
output .add_argument (
4463
4493
"--output-format" ,
@@ -4835,6 +4865,10 @@ def parse_args():
4835
4865
query = f"CREATE TABLE IF NOT EXISTS { args .output_tablename } ({ columns } )"
4836
4866
args .cursor .execute (query )
4837
4867
4868
+ # Allow --normalize to be used alone
4869
+ if args .normalize and not (args .normalize_level or args .normalize_patterns ):
4870
+ args .normalize_level = "default"
4871
+
4838
4872
args .add_ts = "_klp_ts" in args .keys
4839
4873
args .add_ts_delta = "_klp_timedelta" in args .keys
4840
4874
@@ -5192,40 +5226,68 @@ def matches_any_pattern(line, patterns):
5192
5226
yield current_block , start_line_number , i
5193
5227
5194
5228
5195
- def normalize_patterns (text : str ) -> str :
5229
# Pattern names applied at each normalization level. normalize_patterns()
# looks these names up in BUILTIN_REGEXES and substitutes them in this order,
# so more specific patterns must come before more general ones.
_NORMALIZE_MIN_PATTERNS = (
    "uuid",
    "md5",
    "sha1",
    "sha256",
    "oauth",
    "hexcolor",
)
_NORMALIZE_DEFAULT_PATTERNS = (
    "ipv4_port",  # With colon and port number, test before ipv4 alone
    "ipv4",  # Very reliable, clear format with dot-separated octets
    "ipv6",  # Clear format with colons/hex digits
    "email",  # Standard format with @ and domain
    "uuid",  # Fixed format with specific dash positions
    "mac",  # Clear format with : or . separators
    "url",  # Reliable due to protocol prefix (http://, etc)
    "fqdn",  # Multi-part domain names
    "md5",  # Exactly 32 hex chars
    "sha1",  # Exactly 40 hex chars
    "sha256",  # Exactly 64 hex chars
    "path",  # Unix paths starting with /
    "oauth",  # Google OAuth token format
    "function",  # Function calls with parentheses
    "hexcolor",  # CSS/HTML hex color codes
    "version",  # Version strings starting with v/V
)
_NORMALIZE_MAX_PATTERNS = _NORMALIZE_DEFAULT_PATTERNS + (
    "hexnumber",  # place this before "number", because it's more specific
    "number",
)
_NORMALIZE_LEVEL_PATTERNS = {
    "min": _NORMALIZE_MIN_PATTERNS,
    "default": _NORMALIZE_DEFAULT_PATTERNS,
    "max": _NORMALIZE_MAX_PATTERNS,
}


def get_patterns_for_level(level: Optional[str]) -> List[str]:
    """Return the ordered pattern names for a normalization level.

    Args:
        level: One of "min", "default" or "max". Any other value,
            including ``None``, selects no patterns.

    Returns:
        A new list of pattern names; the caller may modify it freely.
    """
    return list(_NORMALIZE_LEVEL_PATTERNS.get(level, ()))
5268
+
5269
+
5270
def normalize_patterns(text: str, level: str, patterns: List[str]) -> str:
    """Replace common patterns with placeholders and return normalized text.

    Args:
        text: The string to normalize.
        level: Normalization level passed to get_patterns_for_level().
        patterns: Additional pattern names, applied before the patterns
            selected by *level*. This list is not modified.

    Returns:
        *text* with every match of each selected BUILTIN_REGEXES pattern
        replaced by ``<pattern_name>``.
    """
    # Build a fresh list instead of using "+=": the caller passes the same
    # list object for every event, so extending it in place would make it
    # grow without bound. dict.fromkeys() drops repeated names while keeping
    # first-occurrence order, so no regex is applied twice.
    selected = list(
        dict.fromkeys([*(patterns or []), *get_patterns_for_level(level)])
    )

    # NOTE(review): placeholders such as "<sha1>" or "<ipv4>" contain digits,
    # so a later unanchored pattern like "number" can match inside them.
    normalized = text
    for pattern_name in selected:
        # Names without an entry in BUILTIN_REGEXES are skipped.
        if pattern_name not in BUILTIN_REGEXES:
            continue
        placeholder = f"<{pattern_name}>"
        for regex in BUILTIN_REGEXES[pattern_name]:
            normalized = re.sub(regex, placeholder, normalized)

    return normalized
5226
5286
5227
5287
5228
- def normalize_and_deduplicate_event (event : Dict [str , Any ]) -> Optional [Dict [str , Any ]]:
5288
+ def normalize_and_deduplicate_event (
5289
+ event : Dict [str , Any ], level : str , patterns : List [str ]
5290
+ ) -> Optional [Dict [str , Any ]]:
5229
5291
"""Process a single event, normalizing patterns and checking for duplicates."""
5230
5292
global _seen_normalized_events
5231
5293
@@ -5239,15 +5301,22 @@ def normalize_and_deduplicate_event(event: Dict[str, Any]) -> Optional[Dict[str,
5239
5301
for key , value in filtered_event .items ():
5240
5302
if isinstance (value , str ):
5241
5303
# Special handling for timestamp keys
5242
- if key .lower () in TS_KEYS or (args .ts_key and key == args .ts_key ):
5304
+ normalize_timestamp = "timestamp" in patterns or level in (
5305
+ "default" ,
5306
+ "max" ,
5307
+ )
5308
+ is_timestamp_key = key .lower () in TS_KEYS or (
5309
+ args .ts_key and key == args .ts_key
5310
+ )
5311
+ if normalize_timestamp and is_timestamp_key :
5243
5312
try :
5244
5313
if guess_datetime (value ) is not None :
5245
5314
normalized_event [key ] = "<timestamp>"
5246
5315
continue
5247
5316
except (ValueError , TypeError ):
5248
5317
pass
5249
5318
# Other patterns for non-timestamp fields
5250
- normalized_event [key ] = normalize_patterns (value )
5319
+ normalized_event [key ] = normalize_patterns (value , level , patterns )
5251
5320
else :
5252
5321
normalized_event [key ] = value
5253
5322
@@ -5264,15 +5333,17 @@ def normalize_and_deduplicate_event(event: Dict[str, Any]) -> Optional[Dict[str,
5264
5333
return None
5265
5334
5266
5335
5267
- def visible_with_normalize (event : Dict [str , Any ]) -> bool :
5336
def visible_with_normalize(
    event: Dict[str, Any], level: str, patterns: List[str]
) -> bool:
    """Visibility check that also applies normalization and deduplication.

    Without a level and without patterns this is just visible(event).
    Otherwise a visible event is kept only if
    normalize_and_deduplicate_event() does not return None for it.
    """
    normalization_requested = bool(level or patterns)
    event_is_visible = visible(event)

    if not normalization_requested:
        return event_is_visible
    if not event_is_visible:
        return False

    deduplicated = normalize_and_deduplicate_event(event, level, patterns)
    return deduplicated is not None
5276
5347
5277
5348
5278
5349
def events_from_linebased (
@@ -5892,7 +5963,9 @@ def main():
5892
5963
5893
5964
if args .add_ts :
5894
5965
event ["_klp_ts" ] = now_rfc3339 ()
5895
- if visible_with_normalize (event ):
5966
+ if visible_with_normalize (
5967
+ event , args .normalize_level , args .normalize_patterns
5968
+ ):
5896
5969
if args .fuse is not None or args .mark_gaps is not None :
5897
5970
ts_datetime = get_timestamp_datetime (event )
5898
5971
if ts_datetime is None :
0 commit comments