-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhackathon.py
117 lines (85 loc) · 3.33 KB
/
hackathon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import json
# Path of the HTML file to parse. The script reads this file and writes its
# output next to it, to the same path with ".result.json" appended.
file_name = 'example.html'
def get_node_text(node):
    """Return the inner text of a tag, including the text of its children.

    For a ``<p>`` tag every newline in the text is replaced by a single
    space, so the paragraph comes back as one line. For any other tag the
    text is returned exactly as ``node.get_text()`` produces it.

    Example input: ``<p>text<a>text2</a>text3</p>``

    Args:
        node: A tag-like object exposing ``name`` and ``get_text()``.

    Returns:
        The extracted text as a string.
    """
    text = node.get_text()
    if node.name == 'p':
        # Same result as ' '.join(text.split('\n')), in a single pass.
        return text.replace('\n', ' ')
    # Splitting on '\n' and re-joining with '\n' is a no-op, so return as is.
    return text
def next_p_texts(node):
    """Collect the text of ``node`` and of the ``<p>``/``<ul>`` siblings after it.

    Starting at ``node``, the walk moves forward through the following
    ``<p>`` and ``<ul>`` siblings at the same level and stops at the first
    heading (``<h2>`` to ``<h6>``) or when there are no more siblings.

    Args:
        node: The tag to start from, or ``None``. If it is itself a heading,
            nothing is collected.

    Returns:
        A list of strings, one per visited tag, in document order. Empty
        when ``node`` is falsy or is a heading.
    """
    headings = ('h2', 'h3', 'h4', 'h5', 'h6')
    texts = []
    # Iterative on purpose: a recursive walk needs one stack frame per
    # sibling and copies the list at every step, which fails with
    # RecursionError on long runs of paragraphs.
    while node and node.name not in headings:
        texts.append(get_node_text(node))
        # Headings are part of the search so the walk stops at them instead
        # of jumping over them to a later paragraph.
        node = node.find_next_sibling(['p', 'ul', 'h2', 'h3', 'h4', 'h5', 'h6'])
    return texts
def next_h_tags(node):
    """Return ``node`` and the following sub-headings up to the next ``<h2>``.

    Starting at ``node``, the walk moves forward through the heading
    siblings (``<h2>`` to ``<h6>``) at the same level and stops at the
    first ``<h2>`` or when there are no more headings.

    Args:
        node: The heading tag to start from, or ``None``.

    Returns:
        A list of the visited tags in document order. Empty when ``node``
        is falsy or is itself an ``<h2>``.
    """
    found = []
    # Iterative on purpose: recursion would need one stack frame per
    # heading and would copy the list on every step.
    while node and node.name != 'h2':
        found.append(node)
        node = node.find_next_sibling(['h2', 'h3', 'h4', 'h5', 'h6'])
    return found
def has_subsection(h2_node):
    """Tell whether a sub-heading follows ``h2_node`` before the next ``<h2>``.

    Args:
        h2_node: The ``<h2>`` tag whose section is inspected, or ``None``.

    Returns:
        A tuple ``(has_sub, node)``:

        * ``(True, heading)`` when the next heading sibling is an
          ``<h3>`` to ``<h6>``; ``heading`` is that first sub-heading.
        * ``(False, next_h2)`` when the next heading sibling is an ``<h2>``.
        * ``(False, None)`` when no heading follows or ``h2_node`` is falsy.
    """
    if not h2_node:
        return False, None
    # Ask directly for the next heading sibling. Hopping over siblings two
    # at a time breaks when a sibling is missing (attribute access on None)
    # and overlooks headings that are not separated by exactly one node.
    following = h2_node.find_next_sibling(['h2', 'h3', 'h4', 'h5', 'h6'])
    if following and following.name != 'h2':
        return True, following
    return False, following
def get_subsections(node):
    """Build the subsection entries that belong to an ``<h2>`` section.

    Args:
        node: The ``<h2>`` tag whose subsections are wanted.

    Returns:
        A list with one dict per sub-heading found between ``node`` and the
        next ``<h2>``, in document order. Each dict has the keys
        ``"subsection"`` (the heading text) and ``"paragraphs"`` (the texts
        of the ``<p>``/``<ul>`` tags directly under that heading). The list
        is empty when the section has no sub-headings.
    """
    # has_subsection returns a (flag, node) tuple; it must be unpacked,
    # because the tuple itself is always truthy.
    found, first_heading = has_subsection(node)
    if not found:
        return []
    # Headings are included in the lookup so that a sub-heading without any
    # text of its own yields no paragraphs instead of picking up the
    # paragraphs of a later heading.
    text_or_heading = ['p', 'ul', 'h2', 'h3', 'h4', 'h5', 'h6']
    return [
        {
            "subsection": heading.get_text(),
            "paragraphs": next_p_texts(heading.find_next_sibling(text_or_heading)),
        }
        for heading in next_h_tags(first_heading)
    ]
def extract_headers(parent_node):
    """Extract the outline ("table of contents") of a content container.

    Args:
        parent_node: The container tag to scan (the script passes the div
            with class "body searchable-content"), or ``None`` when that
            container was not found.

    Returns:
        A JSON-serialisable dict with these optional keys:

        * ``"paragraphs"``: texts of the leading ``<p>``/``<ul>`` tags,
          present only when a ``<p>`` is found before the first ``<h2>``.
        * ``"sections"``: one dict per ``<h2>``, with ``"section"`` (the
          heading text), ``"paragraphs"`` and, when the section has
          sub-headings, ``"subsections"``.

        An empty dict is returned when ``parent_node`` is ``None``.
    """
    result = {}
    if parent_node is None:
        # Nothing to scan; avoid failing on attribute access below.
        return result

    # Level-0 paragraphs: only present when a <p> comes before any <h2>.
    first_node = parent_node.find(['h2', 'p'])
    if first_node and first_node.name == 'p':
        result["paragraphs"] = next_p_texts(first_node)

    h2_tags = parent_node.find_all('h2')
    if h2_tags:
        result["sections"] = []

    # Headings are included in the lookup so that a section without text of
    # its own yields no paragraphs instead of picking up the paragraphs of a
    # later heading.
    text_or_heading = ['p', 'ul', 'h2', 'h3', 'h4', 'h5', 'h6']
    for h2 in h2_tags:
        section = {
            "section": h2.get_text(),
            "paragraphs": next_p_texts(h2.find_next_sibling(text_or_heading)),
        }
        subsections = get_subsections(h2)
        if subsections:
            section["subsections"] = subsections
        result["sections"].append(section)
    return result
try:
    # Load the whole HTML document into memory.
    with open(file_name, encoding='utf-8') as f:
        contents = f.read()

    # Parse it and locate the container that holds the content to outline.
    soup = BeautifulSoup(contents, 'html.parser')
    parent_node = soup.find('div', class_="body searchable-content")

    # Write the extracted outline next to the input file, keeping non-ASCII
    # characters unescaped.
    with open(file_name + '.result.json', 'w', encoding='utf8') as outfile:
        json.dump(extract_headers(parent_node), outfile, ensure_ascii=False)
except IOError:
    print('Please change "example.html" with the correct file name.')