-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest-bs.py
64 lines (48 loc) · 1.44 KB
/
test-bs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import logging
import sys
from bs4 import BeautifulSoup
# Module-level logger; handlers are attached only when run as a script.
logger = logging.getLogger(__name__)

# Tag names that can appear as the outermost element of a fetched document.
tag_names_of_possible_root_elements = ["feed", "html", "math", "rss", "svg"]

# Additional tag names listed as candidates for deletion.
other_tags_to_delete = ["iframe", "link", "meta", "noscript", "script", "style"]
def delete_specified_tag_elements(
    content: str, tags_to_delete, parser_to_use="lxml"
) -> str:
    """Return *content* with every element named in *tags_to_delete* removed.

    The markup is parsed with BeautifulSoup, each matching element (together
    with its descendants) is decomposed, and the remaining tree is serialized
    with ``prettify()``. Because the result is re-serialized, its whitespace
    can differ from the input even when nothing was deleted.

    Args:
        content: Markup text to parse.
        tags_to_delete: An iterable of tag names, or a single tag name given
            as a plain string.
        parser_to_use: Name of the parser handed to BeautifulSoup.

    Returns:
        The prettified markup that remains after the deletions.
    """
    # A bare string is itself iterable, so "html" would otherwise be treated
    # as the four tag names "h", "t", "m", "l". Treat it as one tag name.
    if isinstance(tags_to_delete, str):
        tags_to_delete = [tags_to_delete]

    soup = BeautifulSoup(content, parser_to_use)
    for tag_name in tags_to_delete:
        for tag in soup.find_all(tag_name):
            tag.decompose()
    return soup.prettify()
if __name__ == "__main__":
    # Script entry point: read the file named on the command line, strip the
    # requested tags from it, and print the resulting markup.

    # Send this module's DEBUG-and-above records to the console, with
    # millisecond-resolution timestamps.
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        "%(asctime)s.%(msecs)03d %(levelname)-8s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    if len(sys.argv) < 2:
        # Report the name this script was actually invoked with instead of a
        # hard-coded (and previously incorrect) file name.
        print(f"Usage: python {sys.argv[0]} <local_file>")
        sys.exit(1)

    local_file = sys.argv[1]

    with open(local_file, mode="r", encoding="utf-8") as f:
        content = f.read()

    # Pass the tag name inside a list: the function iterates over
    # tags_to_delete, and iterating a bare string yields single characters.
    content = delete_specified_tag_elements(
        content=content, tags_to_delete=["html"]
    )
    print(content)