import errno
import logging
import os
import pickle
import re
import time
import urllib.parse

import markdown
from bs4 import BeautifulSoup
from rdflib import URIRef

import _config as config

APP_DIR = os.path.dirname(os.path.abspath(__file__))

def render_concept_tree(html_doc):
    """
    Add the CSS hooks needed to render a collapsible concept hierarchy from a
    nested <ul> tree.
    """
    soup = BeautifulSoup(html_doc, "html.parser")
    # concept_hierarchy = soup.find(id='concept-hierarchy')
    uls = soup.find_all("ul")
    for i, ul in enumerate(uls):
        # Don't add the HTML class "nested" to the first (top-level) <ul>.
        if i != 0:
            ul["class"] = "nested"
        if ul.parent.name == "li":
            # Move the parent <li>'s link inside a new <span class="caret">
            # so the node can be expanded and collapsed.
            temp = BeautifulSoup(str(ul.parent.a.extract()), "html.parser")
            ul.parent.insert(
                0, BeautifulSoup('<span class="caret">', "html.parser")
            )
            ul.parent.span.insert(0, temp)
    return soup
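
# Illustrative sketch of the transformation above on a hypothetical two-level
# hierarchy (markup wrapped for readability):
#
#   in:  <ul><li><a href="/c/A">A</a><ul><li><a href="/c/B">B</a></li></ul></li></ul>
#   out: <ul><li><span class="caret"><a href="/c/A">A</a></span>
#            <ul class="nested"><li><a href="/c/B">B</a></li></ul></li></ul>
#
# i.e. every non-top-level <ul> gets the "nested" class and its parent link is
# wrapped in a <span class="caret">, presumably toggled by the front-end tree script.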

def url_encode(s):
    try:
        return urllib.parse.quote(s)
    except Exception:
        # On failure, fall through and return None.
        pass


def url_decode(s):
    try:
        return urllib.parse.unquote(s)
    except Exception:
        pass
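
# Illustrative round trip (hypothetical value; quoting follows urllib.parse
# defaults, where "/" is left unescaped):
#   url_encode("http://example.org/voc term")     -> "http%3A//example.org/voc%20term"
#   url_decode("http%3A//example.org/voc%20term") -> "http://example.org/voc term"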

def make_title(s):
    # make title from URI
    title = " ".join(s.split("#")[-1].split("/")[-1].split("_")).title()

    # replace dashes and periods with whitespace
    title = re.sub("[-.]+", " ", title).title()

    return title
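
# Illustrative example (hypothetical URI): the last path segment is split on
# underscores, dashes and periods are replaced, and the result is title-cased:
#   make_title("http://example.org/voc/hydro_bore-hole") -> "Hydro Bore Hole"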

def parse_markdown(s):
    return markdown.markdown(s)

def is_email(email):
    """
    Check if the email is a valid email address.

    :param email: The email to be tested.
    :return: True if the email matches the static regular expression, else False.
    :rtype: bool
    """
    pattern = r"[a-z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?"
    return re.search(pattern, email) is not None

def strip_mailto(email):
    # Remove a leading "mailto:" (7 characters) from the value.
    return email[7:]


def contains_mailto(email):
    return email[:7] == "mailto:"
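
# Illustrative use of the three helpers together (hypothetical address):
#   contains_mailto("mailto:jane@example.org") -> True
#   strip_mailto("mailto:jane@example.org")    -> "jane@example.org"
#   is_email("jane@example.org")               -> True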

def is_url(url):
    """
    Check if the url is a valid URL.

    :param url: The url to be tested.
    :type url: str
    :return: True if the url passes the validation, else False.
    :rtype: bool
    """
    if isinstance(url, URIRef):
        return True

    pattern = re.compile(
        r"^(?:http|ftp)s?://"  # http:// or https://
        r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
        r"localhost|"  # localhost...
        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or IP
        r"(?::\d+)?"  # optional port
        r"(?:/?|[/?]\S+)$",
        re.IGNORECASE,
    )
    return re.search(pattern, url) is not None
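
# Illustrative examples (hypothetical values):
#   is_url("http://example.org/vocab/term") -> True
#   is_url("not a url")                     -> False
#   is_url(URIRef("http://example.org/x"))  -> True  (rdflib URIRefs short-circuit)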

def cache_read(cache_file_name):
    """
    Read an object from the cache if the cache file is younger than
    config.VOCAB_CACHE_HOURS. Returns None on failure.
    """
    cache_seconds = config.VOCAB_CACHE_HOURS * 3600
    cache_file_path = os.path.join(config.VOCAB_CACHE_DIR, cache_file_name)

    if os.path.isfile(cache_file_path):
        cache_file_age = time.time() - os.stat(cache_file_path).st_mtime
        logging.debug("Cache file age: {0:.2f} hours".format(cache_file_age / 3600))
        # If the cache file is older than VOCAB_CACHE_HOURS, ignore it.
        if cache_file_age <= cache_seconds:
            try:
                with open(cache_file_path, "rb") as f:
                    cache_object = pickle.load(f)
                if cache_object:  # ignore an empty file
                    logging.debug(
                        "Read cache file {}".format(os.path.abspath(cache_file_path))
                    )
                    return cache_object
            except Exception as e:
                logging.debug(
                    "Unable to read cache file {}: {}".format(
                        os.path.abspath(cache_file_path), e
                    )
                )
        else:
            logging.debug(
                "Ignoring old cache file {}".format(os.path.abspath(cache_file_path))
            )
    return None

def cache_write(cache_object, cache_file_name):
    """
    Write an object to the cache unless a cache file younger than
    config.VOCAB_CACHE_HOURS already exists.
    """
    cache_seconds = config.VOCAB_CACHE_HOURS * 3600
    cache_file_path = os.path.join(config.VOCAB_CACHE_DIR, cache_file_name)

    if os.path.isfile(cache_file_path):
        cache_file_age = time.time() - os.stat(cache_file_path).st_mtime
        # If the existing cache file is older than VOCAB_CACHE_HOURS, delete it;
        # otherwise keep it and skip the write.
        if cache_seconds and cache_file_age > cache_seconds:
            logging.debug(
                "Removing old cache file {}".format(os.path.abspath(cache_file_path))
            )
            os.remove(cache_file_path)
        else:
            logging.debug(
                "Retaining recent cache file {}".format(
                    os.path.abspath(cache_file_path)
                )
            )
            return  # Don't do anything - cache file is too young to die

    try:
        os.makedirs(config.VOCAB_CACHE_DIR)
        logging.debug(
            "Cache directory {} created".format(os.path.abspath(config.VOCAB_CACHE_DIR))
        )
    except OSError as exc:
        # The directory already existing is fine; re-raise anything else.
        if exc.errno == errno.EEXIST and os.path.isdir(config.VOCAB_CACHE_DIR):
            pass
        else:
            raise

    if cache_object:  # don't write an empty object
        with open(cache_file_path, "wb") as cache_file:
            pickle.dump(cache_object, cache_file)
        logging.debug("Cache file {} written".format(cache_file_path))
    else:
        logging.debug("Empty object ignored")