Commit 9568521

Merge pull request #125 from m-appel/121-add-informative-reference-url-and-data-modification-time
Add informative reference url and data modification time
2 parents de1f514 + fd6f8be commit 9568521

37 files changed (+515 −185 lines)

iyp/__init__.py

+46 −7
@@ -8,6 +8,7 @@
 from typing import Optional

 import requests
+from github import Github
 from neo4j import GraphDatabase

 BATCH_SIZE = 50000
@@ -79,6 +80,34 @@ def dict2str(d, eq=':', pfx=''):
     return '{' + ','.join(data) + '}'


+def get_commit_datetime(repo, file_path):
+    """Get the datetime of the latest commit modifying a file in a GitHub repository.
+
+    repo: The name of the repository in org/repo format, e.g.,
+        "InternetHealthReport/internet-yellow-pages"
+    file_path: The path to the file relative to the repository root, e.g.,
+        "iyp/__init__.py"
+    """
+    return Github().get_repo(repo).get_commits(path=file_path)[0].commit.committer.date
+
+
+def set_modification_time_from_last_modified_header(reference, response):
+    """Set the reference_time_modification field of the specified reference dict to the
+    datetime parsed from the Last-Modified header of the specified response if
+    possible."""
+    try:
+        last_modified_str = response.headers['Last-Modified']
+        # All HTTP dates are in UTC:
+        # https://www.rfc-editor.org/rfc/rfc2616#section-3.3.1
+        last_modified = datetime.strptime(last_modified_str,
+                                          '%a, %d %b %Y %H:%M:%S %Z').replace(tzinfo=timezone.utc)
+        reference['reference_time_modification'] = last_modified
+    except KeyError:
+        logging.warning('No Last-Modified header; will not set modification time.')
+    except ValueError as e:
+        logging.error(f'Failed to parse Last-Modified header "{last_modified_str}": {e}')
+
+
 class RequestStatusError(requests.HTTPError):
     def __init__(self, message):
         self.message = message
@@ -109,6 +138,12 @@ def __init__(self, message):
         super().__init__(self.message)


+class DataNotAvailableError(Exception):
+    def __init__(self, message):
+        self.message = message
+        super().__init__(self.message)
+
+
 class IYP(object):

     def __init__(self):
@@ -548,9 +583,9 @@ def add_links(self, src_node, links):
         for i, (type, dst_node, prop) in enumerate(links):

             assert 'reference_org' in prop
-            assert 'reference_url' in prop
+            assert 'reference_url_data' in prop
             assert 'reference_name' in prop
-            assert 'reference_time' in prop
+            assert 'reference_time_fetch' in prop

             prop = format_properties(prop)

@@ -589,10 +624,12 @@ def __init__(self):
         """IYP and references initialization."""

         self.reference = {
-            'reference_org': 'Internet Yellow Pages',
-            'reference_url': 'https://iyp.iijlab.net',
             'reference_name': 'iyp',
-            'reference_time': datetime.combine(datetime.utcnow(), time.min, timezone.utc)
+            'reference_org': 'Internet Yellow Pages',
+            'reference_url_data': 'https://iyp.iijlab.net',
+            'reference_url_info': str(),
+            'reference_time_fetch': datetime.combine(datetime.utcnow(), time.min, timezone.utc),
+            'reference_time_modification': None
         }

         # connection to IYP database
@@ -617,8 +654,10 @@ def __init__(self, organization, url, name):
         self.reference = {
             'reference_name': name,
             'reference_org': organization,
-            'reference_url': url,
-            'reference_time': datetime.combine(datetime.utcnow(), time.min, timezone.utc)
+            'reference_url_data': url,
+            'reference_url_info': str(),
+            'reference_time_fetch': datetime.combine(datetime.utcnow(), time.min, timezone.utc),
+            'reference_time_modification': None
         }

         # connection to IYP database

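As an aside, a minimal sketch (not part of this commit) of what set_modification_time_from_last_modified_header stores for a typical Last-Modified value; the header string is made up, and the strptime format is the one used above:

# Hypothetical header value; HTTP dates are always expressed in GMT/UTC.
from datetime import datetime, timezone

last_modified_str = 'Tue, 07 Nov 2023 01:30:00 GMT'
last_modified = datetime.strptime(last_modified_str,
                                  '%a, %d %b %Y %H:%M:%S %Z').replace(tzinfo=timezone.utc)
print(last_modified)  # 2023-11-07 01:30:00+00:00
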
iyp/crawlers/alice_lg/__init__.py

+56 −14
@@ -1,7 +1,6 @@
 import ipaddress
 import logging
 import os
-import sys
 from collections import defaultdict
 from concurrent.futures import as_completed
 from datetime import datetime
@@ -84,6 +83,10 @@ def __init__(self,

         # URLs to the API
         url = url.rstrip('/')
+        if url.endswith('/api/v1'):
+            self.reference['reference_url_info'] = url[:-len('/api/v1')]
+        else:
+            logging.warning(f'Data URL does not end with "/api/v1", will not set info URL: {url}')
         self.urls = {
             'routeservers': f'{url}/routeservers',
             'neighbors': url + '/routeservers/{rs}/neighbors',
@@ -97,6 +100,8 @@ def __init__(self,
         # List of neighbor dicts. Each dict contains information about the route server,
         # so we do not keep track of that separately.
         self.neighbors = list()
+        # Dict mapping routeserver_id to the cache time of that server.
+        self.routeserver_cached_at = dict()
         # Dict mapping (routeserver_id, neighbor_id) tuple to a list of route dicts.
         self.routes = dict()
         # If routes should be fetched or not.
@@ -123,8 +128,6 @@ def decode_json(resp: Response, *args, **kwargs) -> None:
             try:
                 resp.data = resp.json()
             except JSONDecodeError as e:
-                print(f'Failed to retrieve data for {resp.url}', file=sys.stderr)
-                print(f'Error while reading json data: {e}', file=sys.stderr)
                 logging.error(f'Error while reading json data: {e}')
                 logging.error(resp.status_code)
                 logging.error(resp.headers)
@@ -160,8 +163,6 @@ def fetch_urls(self, urls: list, additional_data=list()) -> Iterable:
             except Exception as e:
                 logging.error(f'Failed to retrieve data for {future}')
                 logging.error(e)
-                print(f'Failed to retrieve data for {future}', file=sys.stderr)
-                print(e, file=sys.stderr)
                 return False, dict(), None

     def fetch_url(self, url: str) -> Tuple[bool, dict]:
@@ -177,7 +178,6 @@ def __fetch_routeservers(self) -> None:
             logging.info('Using cached route server information.')
             self.routeservers = self.cache_handler.load_cached_object(routeserver_object_name)
         else:
-            print(f'Fetching route servers from {self.urls["routeservers"]}')
             logging.info(f'Fetching route servers from {self.urls["routeservers"]}')
             is_ok, routeservers_root = self.fetch_url(self.urls['routeservers'])
             if not is_ok:
@@ -190,28 +190,61 @@ def __fetch_neighbors(self) -> None:
         neighbor_object_name = 'neighbors'
         if self.cache_handler.cached_object_exists(neighbor_object_name):
             logging.info('Using cached neighbor information.')
-            self.neighbors = self.cache_handler.load_cached_object(neighbor_object_name)
+            neighbor_object = self.cache_handler.load_cached_object(neighbor_object_name)
+            self.routeserver_cached_at = neighbor_object['routeserver_cached_at']
+            self.neighbors = neighbor_object['neighbors']
         else:
-            print(f'Fetching neighbor information from {len(self.routeservers)} route servers.')
             logging.info(f'Fetching neighbor information from {len(self.routeservers)} route servers.')
             neighbor_urls = [self.urls['neighbors'].format(rs=rs['id']) for rs in self.routeservers]
             failed_routeservers = list()
-            for is_ok, neighbor_list_root, routeserver_id in self.fetch_urls(neighbor_urls,
-                                                                             additional_data=self.routeservers):
+            for is_ok, neighbor_list_root, routeserver in self.fetch_urls(neighbor_urls,
+                                                                          additional_data=self.routeservers):
+                routeserver_id = routeserver['id']
                 if not is_ok:
                     failed_routeservers.append(routeserver_id)
                     continue
+                try:
+                    cached_at_str = neighbor_list_root['api']['cache_status']['cached_at']
+                except KeyError:
+                    cached_at_str = str()
+                if cached_at_str:
+                    cached_at = None
+                    # Alice-LG uses nanosecond-granularity timestamps, which are not
+                    # valid ISO format...
+                    try:
+                        pre, suf = cached_at_str.rsplit('.', maxsplit=1)
+                        if suf.endswith('Z'):
+                            # UTC
+                            frac_seconds = suf[:-1]
+                            tz_suffix = '+00:00'
+                        elif '+' in suf:
+                            # Hopefully a timezone identifier of form +HH:MM
+                            frac_seconds, tz_suffix = suf.split('+')
+                            tz_suffix = '+' + tz_suffix
+                        else:
+                            raise ValueError(f'Failed to get timezone from timestamp :{cached_at_str}')
+                        if not frac_seconds.isdigit():
+                            raise ValueError(f'Fractional seconds are not digits: {cached_at_str}')
+                        # Reduce to six digits (ms).
+                        frac_seconds = frac_seconds[:6]
+                        cached_at_str = f'{pre}.{frac_seconds}{tz_suffix}'
+                        cached_at = datetime.fromisoformat(cached_at_str)
+                    except ValueError as e:
+                        logging.warning(f'Failed to get cached_at timestamp for routeserver "{routeserver_id}": {e}')
+                    if cached_at:
+                        self.routeserver_cached_at[routeserver_id] = cached_at
                 # Spelling of neighbors/neighbours field is not consistent...
                 if 'neighbors' in neighbor_list_root:
                     neighbor_list = neighbor_list_root['neighbors']
                 elif 'neighbours' in neighbor_list_root:
                     neighbor_list = neighbor_list_root['neighbours']
                 else:
                     logging.error(f'Missing "neighbors"/"neighbours" field in reply: {neighbor_list_root}')
-                    print(f'Missing "neighbors"/"neighbours" field in reply: {neighbor_list_root}', file=sys.stderr)
                     continue
                 self.neighbors += neighbor_list
-            self.cache_handler.save_cached_object(neighbor_object_name, self.neighbors)
+            neighbor_object = {'routeserver_cached_at': self.routeserver_cached_at,
+                               'neighbors': self.neighbors}
+            self.cache_handler.save_cached_object(neighbor_object_name, neighbor_object)
             if failed_routeservers:
                 logging.warning(f'Failed to get neighbor information for {len(failed_routeservers)} routeservers: '
                                 f'{failed_routeservers}')
@@ -343,7 +376,15 @@ def run(self) -> None:
             if ('details:route_changes' in flattened_neighbor
                     and isinstance(flattened_neighbor['details:route_changes'], flatdict.FlatDict)):
                 flattened_neighbor.pop('details:route_changes')
-            self.reference['reference_url'] = self.urls['neighbors'].format(rs=neighbor['routeserver_id'])
+            routeserver_id = neighbor['routeserver_id']
+            self.reference['reference_url_data'] = self.urls['neighbors'].format(rs=routeserver_id)
+            if routeserver_id in self.routeserver_cached_at:
+                self.reference['reference_time_modification'] = self.routeserver_cached_at[routeserver_id]
+            else:
+                logging.info(f'No modification time for routeserver: {routeserver_id}')
+                # Set to None to not reuse value of previous loop iteration.
+                self.reference['reference_time_modification'] = None
+
             member_of_rels.append({'src_id': member_asn,  # Translate to QID later.
                                    'dst_id': n.data['ixp_qid'],
                                    'props': [flattened_neighbor, self.reference.copy()]})
@@ -354,7 +395,8 @@ def run(self) -> None:
         if self.fetch_routes:
             logging.info('Iterating routes.')
             for (routeserver_id, neighbor_id), routes in self.routes.items():
-                self.reference['reference_url'] = self.urls['routes'].format(rs=routeserver_id, neighbor=neighbor_id)
+                self.reference['reference_url_data'] = self.urls['routes'].format(rs=routeserver_id,
+                                                                                  neighbor=neighbor_id)
                 for route in routes:
                     prefix = ipaddress.ip_network(route['network']).compressed
                     origin_asn = route['bgp']['as_path'][-1]

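As a standalone illustration of the cached_at handling above: Alice-LG reports nanosecond-precision timestamps, which datetime.fromisoformat rejects on Python 3.10 and earlier (as it does the trailing 'Z'), so the fractional seconds are truncated to microseconds first. A sketch with a made-up timestamp:

from datetime import datetime

# Made-up Alice-LG style cache timestamp with nanosecond precision.
cached_at_str = '2023-11-02T08:43:11.123456789Z'

pre, suf = cached_at_str.rsplit('.', maxsplit=1)
if suf.endswith('Z'):
    frac_seconds, tz_suffix = suf[:-1], '+00:00'
elif '+' in suf:
    frac_seconds, tz_suffix = suf.split('+')
    tz_suffix = '+' + tz_suffix
else:
    raise ValueError(f'Failed to get timezone from timestamp: {cached_at_str}')

# Truncate to six fractional digits (microseconds) so fromisoformat accepts it.
cached_at = datetime.fromisoformat(f'{pre}.{frac_seconds[:6]}{tz_suffix}')
print(cached_at)  # 2023-11-02 08:43:11.123456+00:00
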
iyp/crawlers/bgpkit/__init__.py

+7 −4
@@ -3,23 +3,26 @@

 import requests

-from iyp import BaseCrawler, RequestStatusError
+from iyp import (BaseCrawler, RequestStatusError,
+                 set_modification_time_from_last_modified_header)


 class AS2RelCrawler(BaseCrawler):
     def __init__(self, organization, url, name, af):
         """Initialization: set the address family attribute (af)"""
-
-        self.af = af
         super().__init__(organization, url, name)
+        self.af = af
+        self.reference['reference_url_info'] = 'https://data.bgpkit.com/as2rel/README.txt'

     def run(self):
         """Fetch the AS relationship file from BGPKIT website and process lines one by
         one."""

         req = requests.get(self.url, stream=True)
         if req.status_code != 200:
-            raise RequestStatusError('Error while fetching AS relationships')
+            raise RequestStatusError(f'Error while fetching AS relationships: {req.status_code}')
+
+        set_modification_time_from_last_modified_header(self.reference, req)

         rels = []
         asns = set()

iyp/crawlers/bgpkit/peerstats.py

+5 −1
@@ -17,6 +17,9 @@


 class Crawler(BaseCrawler):
+    def __init__(self, organization, url, name):
+        super().__init__(organization, url, name)
+        self.reference['reference_url_info'] = 'https://data.bgpkit.com/peer-stats/README.md'

     def run(self):
         """Fetch peer stats for each collector."""
@@ -49,6 +52,7 @@ def run(self):
             prev_day -= timedelta(days=1)
             logging.warning("Today's data not yet available!")

+        self.reference['reference_time_modification'] = self.now
         for collector in collectors:
             url = URL.format(collector=collector, year=self.now.year,
                              month=self.now.month, day=self.now.day,
@@ -65,7 +69,7 @@ def run(self):
                 'BGPCollector',
                 {'name': stats['collector'], 'project': stats['project']}
             )
-            self.reference['reference_url'] = url
+            self.reference['reference_url_data'] = url

             asns = set()

iyp/crawlers/bgpkit/pfx2asn.py

+7 −4
@@ -7,7 +7,8 @@

 import requests

-from iyp import BaseCrawler, RequestStatusError
+from iyp import (BaseCrawler, RequestStatusError,
+                 set_modification_time_from_last_modified_header)

 URL = 'https://data.bgpkit.com/pfx2as/pfx2as-latest.json.bz2'
 ORG = 'BGPKIT'
@@ -22,7 +23,9 @@ def run(self):

         req = requests.get(URL, stream=True)
         if req.status_code != 200:
-            raise RequestStatusError('Error while fetching pfx2as relationships')
+            raise RequestStatusError(f'Error while fetching pfx2as relationships: {req.status_code}')
+
+        set_modification_time_from_last_modified_header(self.reference, req)

         entries = []
         asns = set()
@@ -35,7 +38,7 @@ def run(self):

         req.close()

-        logging.info('Pushing nodes to neo4j...\n')
+        logging.info('Pushing nodes to neo4j...')
         # get ASNs and prefixes IDs
         self.asn_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', asns)
         self.prefix_id = self.iyp.batch_get_nodes_by_single_prop('Prefix', 'prefix', prefixes)
@@ -48,7 +51,7 @@ def run(self):

             links.append({'src_id': asn_qid, 'dst_id': prefix_qid, 'props': [self.reference, entry]})  # Set AS name

-        logging.info('Pushing links to neo4j...\n')
+        logging.info('Pushing links to neo4j...')
         # Push all links to IYP
         self.iyp.batch_add_links('ORIGINATE', links)

iyp/crawlers/bgptools/anycast_prefixes.py

+13 −3
@@ -6,7 +6,8 @@

 import requests

-from iyp import BaseCrawler, ConnectionError, RequestStatusError
+from iyp import (BaseCrawler, ConnectionError, RequestStatusError,
+                 get_commit_datetime)

 # Organization name and URL to data
 ORG = 'BGP.Tools'
@@ -38,6 +39,12 @@ def fetch_dataset(url: str):
 class Crawler(BaseCrawler):
     # Base Crawler provides access to IYP via self.iyp
     # and setup a dictionary with the org/url/today's date in self.reference
+    def __init__(self, organization, url, name):
+        super().__init__(organization, url, name)
+        self.repo = 'bgptools/anycast-prefixes'
+        self.v4_file = 'anycatch-v4-prefixes.txt'
+        self.v6_file = 'anycatch-v6-prefixes.txt'
+        self.reference['reference_url_info'] = 'https://bgp.tools/kb/anycatch'

     def run(self):
         ipv4_prefixes_url = get_dataset_url(URL, 4)
@@ -51,13 +58,16 @@ def run(self):
         ipv6_prefixes_filename = os.path.join(tmpdir, 'anycast_ipv6_prefixes.txt')

         # Fetch data and push to IYP.
-        self.reference['reference_url'] = ipv4_prefixes_url  # Overriding the reference_url according to prefixes
+        # Overriding the reference_url_data according to prefixes
+        self.reference['reference_url_data'] = ipv4_prefixes_url
+        self.reference['reference_time_modification'] = get_commit_datetime(self.repo, self.v4_file)
         ipv4_prefixes_response = fetch_dataset(ipv4_prefixes_url)
         logging.info('IPv4 prefixes fetched successfully.')
         self.update(ipv4_prefixes_response, ipv4_prefixes_filename)
         logging.info('IPv4 prefixes pushed to IYP.')

-        self.reference['reference_url'] = ipv6_prefixes_url
+        self.reference['reference_url_data'] = ipv6_prefixes_url
+        self.reference['reference_time_modification'] = get_commit_datetime(self.repo, self.v6_file)
         ipv6_prefixes_response = fetch_dataset(ipv6_prefixes_url)
         logging.info('IPv6 prefixes fetched successfully.')
         self.update(ipv6_prefixes_response, ipv6_prefixes_filename)

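For reference, get_commit_datetime can also be called on its own to check the modification time the anycast prefixes crawler will record; this is a small sketch using unauthenticated GitHub API access via PyGithub, so it is subject to rate limits:

from iyp import get_commit_datetime

# Datetime of the latest commit touching the IPv4 anycast prefix list.
print(get_commit_datetime('bgptools/anycast-prefixes', 'anycatch-v4-prefixes.txt'))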