-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdeltajson.py
executable file
·124 lines (103 loc) · 4.37 KB
/
deltajson.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/python
import json
import sys
import logging
import time
# native variant (needs native yajl lib installation)
import ijson.backends.yajl2_cffi as ijson
# pure python variant (ca. 66% slower overall):
# import ijson
# for using md5:
# import hashlib
# for using MurmurHash:
import mmh3
import binascii
from jsonpath_rw import jsonpath, parse
# https://pypi.python.org/pypi/jsonpath-rw
# TODO dockerize the JSON variant (needs native libs)
# Wall-clock start, used for the duration report at the end of the script.
startTime = time.time()

# Command line:
#   argv[1]  name of the new (full) JSON file                        (mandatory)
#   argv[2]  name of the top-level property holding the entry array  (mandatory)
#   argv[3]  jsonpath expression locating the ID inside each entry   (mandatory)
#   argv[4]  name of the previous run's fingerprints file            (optional)
# Three positional arguments are read unconditionally below, so argv must
# hold at least four items (script name + 3); otherwise argv[3] would raise
# an IndexError.
if len(sys.argv) <= 3:
    print('new json file name, entries property and ID jsonpath parameters are mandatory')
    # Non-zero status so calling shells/pipelines can detect the failure.
    sys.exit(1)

fullfile_name = sys.argv[1]
entriesProperty = sys.argv[2]
idJsonPath = sys.argv[3]
idJsonParser = parse(idJsonPath)

# Output file names are derived from the input file name.
deltafile_name = fullfile_name + '.changes.json'
fingerprintsfile_new_name = fullfile_name + '.fingerprints.json'

fingerprintsfile_old_name = ""
if len(sys.argv) > 4:
    fingerprintsfile_old_name = sys.argv[4]
    # The new fingerprints file is opened for writing later on; if it were
    # the same file as the old one, the old fingerprints would be truncated
    # before they could be read.
    if fingerprintsfile_new_name == fingerprintsfile_old_name:
        print(
            'ERROR: last fingerprints file name must differ from new name ' + fingerprintsfile_new_name)
        sys.exit(1)
def _load_old_fingerprints(path):
    """Return the ID -> fingerprint mapping of the previous run.

    An empty ``path`` means no previous run is known; an empty dict is
    returned so that every object of the new file is reported as changed.
    If the file cannot be opened or does not contain valid JSON, an error is
    printed and the script terminates with exit status 1.
    """
    if not path:
        print('INFO: no old fingerprints file name passed, starting from scratch')
        return dict()
    try:
        # Context manager so the handle is closed once the JSON is loaded.
        with open(path, 'r') as fingerprintsfile_old:
            return json.load(fingerprintsfile_old)
    except IOError:
        print('ERROR: could not open file ' + path)
        sys.exit(1)
    except ValueError:
        # json.load signals malformed content with ValueError (or a subclass).
        print('ERROR: could not parse fingerprints file ' + path)
        sys.exit(1)


# Load the old fingerprints BEFORE any output file is created/truncated, so
# that a failure here does not leave empty output files behind.
fingerprints_old = _load_old_fingerprints(fingerprintsfile_old_name)

# NOTE(review): the delta file is opened in binary mode but written with str
# objects, which looks like it targets Python 2 -- confirm the interpreter.
with open(fullfile_name, 'rb') as fullfile_new, \
        open(deltafile_name, 'wb') as deltafile, \
        open(fingerprintsfile_new_name, 'w') as fingerprintsfile_new:
    fingerprints_new = dict()
    idSet = set()  # to check uniqueness. Faster than using a list or dict.
    objCount = 0
    deltacount = 0
    duplicateIds = list()

    # Stream the entries one by one instead of loading the whole file.
    jsonObjects = ijson.items(fullfile_new, entriesProperty + '.item')
    deltafile.write('{"' + entriesProperty + '":[\n')
    for obj in jsonObjects:
        objCount += 1
        try:
            # uses first match of the jsonPath as ID
            objId = str(idJsonParser.find(obj)[0].value)
        except Exception:
            # Exception (not a bare except) so that SystemExit and
            # KeyboardInterrupt are not swallowed here.
            logging.exception('could not extract an ID using jsonpath %s', idJsonPath)
            print(str(obj))
            sys.exit(1)

        if objId in idSet:  # ignore and remember duplicate ids
            duplicateIds.append(objId)
            continue
        idSet.add(objId)

        objJsonString = json.dumps(obj)
        # mmh3 on test file: 57 secs
        # md5 on test file: 58 secs
        # -> no difference -> choose mmh3 for collision avoidance, md5 for portability
        # objDigest = hashlib.md5(objJsonString).hexdigest()
        objDigest = binascii.hexlify(mmh3.hash_bytes(objJsonString))
        fingerprints_new[objId] = objDigest

        # if the obj is new or the obj has changed, write delta.
        # (removes items from old fingerprints to find implicit deletions)
        if (objId not in fingerprints_old) or (fingerprints_old.pop(objId) != objDigest):
            if deltacount > 0:
                deltafile.write('\n,')
            deltacount += 1
            deltafile.write(objJsonString)
    deltafile.write('\n]}')

    # Whatever is still left in fingerprints_old was not seen in the new file.
    print('DONE: processed ' + '{:,}'.format(objCount) + ' JSON objects, ' + '{:,}'.format(
        len(idSet)) + ' unique IDs, found ' + '{:,}'.format(deltacount) + ' changed and ' + '{:,}'.format(
        len(fingerprints_old)) + ' removed entries.')

    # log duplicate ids
    if duplicateIds:
        print('WARN: ' + '{:,}'.format(
            len(duplicateIds)) + ' duplicate IDs found. Used only first occurrences, writing to file')
        with open(fullfile_name + '.duplicateIds.json', 'w') as duplicateIds_file:
            json.dump(duplicateIds, duplicateIds_file, indent=2)

    # write deleted fingerprints if some remained:
    if fingerprints_old:
        print('INFO: some entries have disappeared since the last file. Writing IDs to file')
        with open(fullfile_name + '.removedIds.json', 'w') as removedIds_file:
            json.dump(fingerprints_old, removedIds_file, indent=2)

    # persist new fingerprints and deltafile:
    deltafile.flush()
    print('wrote delta file')
    json.dump(fingerprints_new, fingerprintsfile_new)
    print('wrote new fingerprints file')

print('duration: ' + str(time.time() - startTime) + ' seconds')