-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpostsxml_sql_extract_datetime.py
47 lines (39 loc) · 1.88 KB
/
postsxml_sql_extract_datetime.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import re
import html
import os
import sys
import xml.etree.ElementTree as ET
if len(sys.argv) < 2:
print("Please provide a path as a command line argument.")
sys.exit()
path = sys.argv[1]
if not os.path.exists(path):
print(f"The path '{path}' does not exist.")
sys.exit()
select_from_regex = r'<row.*?CreationDate="(.*?)'
original_sql_regex = r'<code>SELECT\s.*?FROM\s.*?</code>'
counter = 0
with open(path, 'r', encoding='utf-8') as f:
with open('sqlcommands.txt', 'w', encoding='utf-8') as outfile: # opening a new file to write output
for line in f:
try:
# matches_creation_date = re.findall(select_from_regex, line)
creation_date_regex = r'CreationDate="([^"]+)"'
creation_date_match = re.search(creation_date_regex, line)
# if creation_date_match:
# creation_date_value = creation_date_match.group(1)
# print(f"Creation Date: {creation_date_value}")
# else:
# print("Creation Date not found.")
matches_original_sql = re.findall(original_sql_regex, line)
for match in matches_original_sql:
counter += 1
if (counter % 10000) == 0:
print(counter)
without_comments = re.sub('--.*?
', '', match)
output_string = html.unescape(html.unescape(without_comments.replace('
', ' '))).replace(
'<code>', '').replace('</code>', '')
# outfile.write(output_string + '\n') # writing output to file for original SQL SELECT FROM pattern
outfile.write(f"{creation_date_match.group(1)[:10]}, {output_string}\n") # writing output to file
except UnicodeDecodeError as e:
print(f"UnicodeDecodeError: {e}")