Skip to content

Commit c745c76

Browse files
committed
Add --normalize-level, --normalize-patterns
More options to choose the regex patterns used for normalization. There are three levels right now: min, default and max. min: uuid, md5, sha1, sha256, oauth, hexcolor. default: the same patterns as before. max: everything in default, plus hexnumber (with 0x prefix) and number (int or float). Using only --normalize activates the default level. Patterns can also be given explicitly with --normalize-patterns; these are added to the patterns selected by the level, if any. Timestamps are normalized at level default and above, or if "timestamp" is included in --normalize-patterns.
1 parent 38e4350 commit c745c76

File tree

1 file changed

+100
-27
lines changed

1 file changed

+100
-27
lines changed

klp.py

+100-27
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,7 @@
355355
"function": [r"\b(([\w\.]+\([^)]*\)))"],
356356
"gitcommit": [r"\b(([0-9a-fA-F]{7,40}))\b"],
357357
"hexcolor": [r"((#[0-9A-Fa-f]{6}))\b"],
358+
"hexnumber": [r"0x[0-9a-fA-F]+"],
358359
"ipv4": [
359360
rf"(?:(?<=^)|(?<=[^0-9.]))" # Left boundary.
360361
rf"(?:" # Non-capturing group for the IPv4 address.
@@ -401,6 +402,7 @@
401402
r"\b(([0-9A-Fa-f]{4}\.){2}([0-9A-Fa-f]{4}))\b",
402403
],
403404
"md5": [r"\b(([a-fA-F0-9]{32}))\b"],
405+
"number": [r"[+-]?(?:\d*\.?\d+|\d+\.?\d*)(?:[eE][+-]?\d+)?"],
404406
"oauth": [r"\b((ya29\.[0-9A-Za-z_-]+))\b"],
405407
"path": [r"((^|(?<=[^./\w-]))(/[.\w-]+)+/?)"],
406408
"sha1": [r"\b(([a-fA-F0-9]{40}))\b"],
@@ -841,6 +843,7 @@ def extraction_function(s):
841843
"fqdn": "fully qualified domain name (FQDN)",
842844
"function": "function calls",
843845
"gitcommit": "git commit hash",
846+
"hexnumber": "hex number with 0x prefix",
844847
"hexcolor": "hex color code",
845848
"ipv4": "IPv4 address",
846849
"ipv4_port": "IPv4 address:port",
@@ -850,6 +853,7 @@ def extraction_function(s):
850853
"jwt": "JSON Web Token (JWT)",
851854
"mac": "MAC address",
852855
"md5": "MD5 hash",
856+
"number": "number (integer or float)",
853857
"path": "Unix file path",
854858
"oauth": "OAuth token",
855859
"sha1": "SHA-1 hash",
@@ -3809,6 +3813,19 @@ def quoting_type(text):
38093813
}.get(text, None)
38103814

38113815

3816+
def csv_of_choices(choices):
    """Build an argparse ``type=`` callable for comma-separated choice lists.

    The returned function splits its argument on commas, strips whitespace
    around each item (so ``"uuid, md5"`` is accepted like ``"uuid,md5"``)
    and checks every item against *choices*.

    Args:
        choices: Container of allowed item names.

    Returns:
        A function mapping the raw option string to a list of validated
        items, raising ``argparse.ArgumentTypeError`` on the first item
        that is not in *choices*.
    """

    def validate(value):
        # Strip so that spaces after commas do not produce bogus
        # "Invalid choice:  md5" errors.
        items = [item.strip() for item in value.split(",")]
        for item in items:
            if item not in choices:
                raise argparse.ArgumentTypeError(
                    f"Invalid choice: {item}. Allowed: {choices}"
                )
        return items  # Validated list, in the order given

    return validate
3827+
3828+
38123829
def csv_type(text):
    """Split a comma-separated option value into a list (``None`` -> ``[]``)."""
    if text is None:
        return []
    return text.split(",")
38143831

@@ -4458,6 +4475,19 @@ def parse_args():
44584475
help="replace common patterns with placeholders and deduplicate events",
44594476
)
44604477

4478+
processing.add_argument(
4479+
"--normalize-level",
4480+
choices=["min", "default", "max"],
4481+
help="level of normalization (min, default, max)",
4482+
)
4483+
processing.add_argument(
4484+
"--normalize-patterns",
4485+
metavar="PATTERNS",
4486+
type=csv_of_choices(sorted(list(BUILTIN_REGEXES) + ["timestamp"])),
4487+
default=[],
4488+
help="comma-separated list of builtin regex patterns (or 'timestamp') to use for normalization",
4489+
)
4490+
44614491
output = parser.add_argument_group("output format options")
44624492
output.add_argument(
44634493
"--output-format",
@@ -4835,6 +4865,10 @@ def parse_args():
48354865
query = f"CREATE TABLE IF NOT EXISTS {args.output_tablename} ({columns})"
48364866
args.cursor.execute(query)
48374867

4868+
# Allow --normalize to be used alone
4869+
if args.normalize and not (args.normalize_level or args.normalize_patterns):
4870+
args.normalize_level = "default"
4871+
48384872
args.add_ts = "_klp_ts" in args.keys
48394873
args.add_ts_delta = "_klp_timedelta" in args.keys
48404874

@@ -5192,40 +5226,68 @@ def matches_any_pattern(line, patterns):
51925226
yield current_block, start_line_number, i
51935227

51945228

5195-
def normalize_patterns(text: str) -> str:
5229+
def get_patterns_for_level(level: str) -> List[str]:
    """Return the ordered builtin pattern names for a normalization level.

    Args:
        level: One of ``"min"``, ``"default"`` or ``"max"``. Any other
            value (including ``None``) selects no patterns.

    Returns:
        A new list of pattern names in the order they should be applied.
        Order matters: more specific patterns come before more general
        ones (``ipv4_port`` before ``ipv4``, ``hexnumber`` before
        ``number``).
    """
    minimum = [
        "uuid",
        "md5",
        "sha1",
        "sha256",
        "oauth",
        "hexcolor",
    ]
    default = [
        "ipv4_port",  # With colon and port number, test before ipv4 alone
        "ipv4",  # Very reliable, clear format with dot-separated octets
        "ipv6",  # Clear format with colons/hex digits
        "email",  # Standard format with @ and domain
        "uuid",  # Fixed format with specific dash positions
        "mac",  # Clear format with : or . separators
        "url",  # Reliable due to protocol prefix (http://, etc)
        "fqdn",  # Multi-part domain names
        "md5",  # Exactly 32 hex chars
        "sha1",  # Exactly 40 hex chars
        "sha256",  # Exactly 64 hex chars
        "path",  # Unix paths starting with /
        "oauth",  # Google OAuth token format
        "function",  # Function calls with parentheses
        "hexcolor",  # CSS/HTML hex color codes
        "version",  # Version strings starting with v/V
    ]
    maximum = default + [
        "hexnumber",  # place this before "number", because it's more specific
        "number",
    ]
    patterns_by_level = {
        "min": minimum,
        "default": default,
        "max": maximum,
    }
    # The lists are rebuilt on every call, so callers get their own copy.
    return patterns_by_level.get(level, [])
5268+
5269+
5270+
def normalize_patterns(text: str, level: str, patterns: List[str]) -> str:
    """Replace common patterns with placeholders and return normalized text.

    Args:
        text: The string to normalize.
        level: Normalization level passed to ``get_patterns_for_level``
            (may be ``None`` when only explicit patterns are used).
        patterns: Explicitly requested pattern names. They are applied
            before the patterns selected by *level*. This list is never
            modified.

    Returns:
        *text* with every match of an active ``BUILTIN_REGEXES`` pattern
        replaced by ``<pattern_name>``. Names without an entry in
        ``BUILTIN_REGEXES`` (e.g. ``"timestamp"``) are ignored here.
    """
    # Build a fresh list instead of ``patterns += ...``: the in-place form
    # extended the caller's list (args.normalize_patterns) on every call,
    # so it grew without bound. dict.fromkeys de-duplicates while keeping
    # the first occurrence, preserving "explicit first, then level" order.
    requested = dict.fromkeys([*(patterns or []), *get_patterns_for_level(level)])
    active = [name for name in requested if name in BUILTIN_REGEXES]
    if not active:
        return text

    # Matches every placeholder this function can insert, so that later,
    # more general patterns (e.g. "number") do not rewrite the digits in
    # "<ipv4>", "<md5>", "<sha256>", ... into "<ipv<number>>".
    placeholder_re = re.compile(
        "<(?:" + "|".join(re.escape(name) for name in active) + ")>"
    )

    normalized = text
    for pattern_name in active:
        placeholder = f"<{pattern_name}>"
        for regex in BUILTIN_REGEXES[pattern_name]:
            protected = [m.span() for m in placeholder_re.finditer(normalized)]

            def substitute(match, protected=protected, placeholder=placeholder):
                start, end = match.span()
                # Leave matches that lie entirely inside an existing
                # placeholder untouched.
                for lo, hi in protected:
                    if lo <= start and end <= hi:
                        return match.group(0)
                return placeholder

            normalized = re.sub(regex, substitute, normalized)

    return normalized
52265286

52275287

5228-
def normalize_and_deduplicate_event(event: Dict[str, Any]) -> Optional[Dict[str, Any]]:
5288+
def normalize_and_deduplicate_event(
5289+
event: Dict[str, Any], level: str, patterns: List[str]
5290+
) -> Optional[Dict[str, Any]]:
52295291
"""Process a single event, normalizing patterns and checking for duplicates."""
52305292
global _seen_normalized_events
52315293

@@ -5239,15 +5301,22 @@ def normalize_and_deduplicate_event(event: Dict[str, Any]) -> Optional[Dict[str,
52395301
for key, value in filtered_event.items():
52405302
if isinstance(value, str):
52415303
# Special handling for timestamp keys
5242-
if key.lower() in TS_KEYS or (args.ts_key and key == args.ts_key):
5304+
normalize_timestamp = "timestamp" in patterns or level in (
5305+
"default",
5306+
"max",
5307+
)
5308+
is_timestamp_key = key.lower() in TS_KEYS or (
5309+
args.ts_key and key == args.ts_key
5310+
)
5311+
if normalize_timestamp and is_timestamp_key:
52435312
try:
52445313
if guess_datetime(value) is not None:
52455314
normalized_event[key] = "<timestamp>"
52465315
continue
52475316
except (ValueError, TypeError):
52485317
pass
52495318
# Other patterns for non-timestamp fields
5250-
normalized_event[key] = normalize_patterns(value)
5319+
normalized_event[key] = normalize_patterns(value, level, patterns)
52515320
else:
52525321
normalized_event[key] = value
52535322

@@ -5264,15 +5333,17 @@ def normalize_and_deduplicate_event(event: Dict[str, Any]) -> Optional[Dict[str,
52645333
return None
52655334

52665335

5267-
def visible_with_normalize(event: Dict[str, Any]) -> bool:
5336+
def visible_with_normalize(
    event: Dict[str, Any], level: str, patterns: List[str]
) -> bool:
    """Visibility check that optionally applies normalization and dedup.

    When neither *level* nor *patterns* is set, this is exactly
    ``visible(event)``. Otherwise the event must pass ``visible`` and
    ``normalize_and_deduplicate_event`` must not return ``None`` for it.
    """
    shown = visible(event)

    # Normalization not requested: plain visibility result.
    if not (level or patterns):
        return shown

    if not shown:
        return False

    deduplicated = normalize_and_deduplicate_event(event, level, patterns)
    return deduplicated is not None
52765347

52775348

52785349
def events_from_linebased(
@@ -5892,7 +5963,9 @@ def main():
58925963

58935964
if args.add_ts:
58945965
event["_klp_ts"] = now_rfc3339()
5895-
if visible_with_normalize(event):
5966+
if visible_with_normalize(
5967+
event, args.normalize_level, args.normalize_patterns
5968+
):
58965969
if args.fuse is not None or args.mark_gaps is not None:
58975970
ts_datetime = get_timestamp_datetime(event)
58985971
if ts_datetime is None:

0 commit comments

Comments
 (0)