-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrc2csv.py
118 lines (89 loc) · 2.63 KB
/
rc2csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import midi
import urllib
import urllib2
import json
import math
import datetime
import sys
import os.path
# Base URL of the Wikidata API; query parameters are appended to it per request.
wikiApiUrl = 'https://www.wikidata.org/w/api.php'

# The script needs exactly two command-line arguments after the program name.
# Anything else prints the usage line and terminates with exit status 2.
if not len( sys.argv ) == 3:
    print( "Usage: {0} <title> <output-dir>".format( sys.argv[0] ) )
    raise SystemExit( 2 )
def fetchRecentChanges( title, **params ):
    """Fetch the revision list of one page from the API at wikiApiUrl.

    title  -- page title, sent as the 'titles' query parameter.
    params -- extra query parameters. 'rvlimit' (default 500) and 'rvprop'
              may be overridden by the caller; 'titles', 'action', 'prop',
              'rvdir' and 'format' are always set by this function.

    Returns the 'revisions' value of the first page in the API response.
    Only a single request is made, so no more than one batch of revisions
    is returned.

    Raises Exception if the HTTP status is >= 400, the response is not
    JSON, the API reports an error, or the page has no revisions.
    """
    # Caller-supplied values win for the two tunable parameters.
    params.setdefault( 'rvlimit', 500 )
    params.setdefault( 'rvprop', 'ids|timestamp|user|userid|flags|tags|size|comment' )

    # These parameters define the query itself and are not overridable.
    params['titles'] = title
    params['action'] = 'query'
    params['prop'] = 'revisions'
    params['rvdir'] = 'newer'
    params['format'] = 'json'

    url = wikiApiUrl + '?' + urllib.urlencode( params )
    print( "Fetching data from %s" % url )

    response = urllib2.urlopen( url )
    try:
        status = response.getcode()
        if status >= 400:
            # getcode() returns an int, so it must be converted before concatenation.
            raise Exception( 'HTTP error: ' + str( status ) + "\nFrom: " + response.geturl() )

        if response.info().getsubtype() != 'json':
            raise Exception( 'Not JSON: ' + response.info().gettype() + "\nFrom: " + response.geturl() )

        jsonString = response.read()
    finally:
        # Release the connection whether or not the checks above passed.
        response.close()

    apiResponse = json.loads( jsonString )

    if 'error' in apiResponse:
        raise Exception( 'API error: ' + json.dumps( apiResponse['error'] ) + "\nFrom: " + url )

    pages = apiResponse['query']['pages']
    if not pages:
        raise Exception( 'No page data returned for title: ' + title )

    # Only one title is requested, so the first (and only) entry is the one we want.
    pageData = next( iter( pages.values() ) )

    if 'revisions' not in pageData:
        raise Exception( 'No revisions found for title: ' + title )

    return pageData['revisions']
def isoTimeDelta( aTime, bTime ):
    """Return aTime minus bTime as a whole number of seconds.

    Both arguments are timestamp strings of the form
    'YYYY-MM-DDTHH:MM:SSZ'. The result is negative when aTime is the
    earlier of the two.
    """
    isoFormat = "%Y-%m-%dT%H:%M:%SZ"
    parse = datetime.datetime.strptime
    elapsed = parse( aTime, isoFormat ) - parse( bTime, isoFormat )
    return int( elapsed.total_seconds() )
def addDeltas( revisions ):
    """Return copies of the given revisions with 'delta-time' and 'delta-size' added.

    revisions -- iterable of dicts, each with a 'size' entry and (for all
                 but the first) a 'timestamp' entry readable by isoTimeDelta.

    For the first revision 'delta-time' is 0 and 'delta-size' is its own
    size. For every later revision the values are the differences to the
    revision immediately before it. The input dicts are not modified.
    """
    revisionsWithDeltas = []
    prev = None

    for rev in revisions:
        # Shallow copy, so the caller's revision dicts are left untouched.
        newRow = dict( rev )

        if prev is None:
            newRow['delta-time'] = 0
            newRow['delta-size'] = rev['size']
        else:
            newRow['delta-time'] = isoTimeDelta( rev['timestamp'], prev['timestamp'] )
            newRow['delta-size'] = rev['size'] - prev['size']

        prev = rev
        revisionsWithDeltas.append( newRow )

    return revisionsWithDeltas
def writeCsv( filename, rc ):
    """Write revision rows to a UTF-8 encoded, tab-separated text file.

    filename -- path of the file to create (an existing file is overwritten).
    rc       -- iterable of dicts with the keys 'revid', 'timestamp',
                'delta-time', 'size' and 'delta-size'; 'userid', 'user'
                and 'comment' are optional and written as empty fields
                when absent.

    The first line of the file is a header naming the eight columns.
    Despite the function name, fields are separated by tabs, not commas,
    and values are written without any quoting or escaping.
    """
    columns = (
        'revid',
        'timestamp',
        'delta-time',
        'size',
        'delta-size',
        'userid',
        'user',
        'comment',
    )
    rowFormat = u"%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n"

    # The with-block closes the file even if a row fails to format or write.
    with open( filename, 'wb' ) as f:
        f.write( ( rowFormat % columns ).encode('utf8') )

        for row in rc:
            fields = (
                row['revid'],
                row['timestamp'],
                row['delta-time'],
                row['size'],
                row['delta-size'],
                # User details and comment may be missing from a revision.
                row.get( 'userid', "" ),
                row.get( 'user', "" ),
                row.get( 'comment', "" ),
            )
            f.write( ( rowFormat % fields ).encode('utf8') )
# Script driver: fetch the revisions of the given page, add the deltas and
# write the result out.
title = sys.argv[1]
csvfile = sys.argv[2]

# The second argument may name a directory; in that case the output file is
# placed inside it and named after the page title.
if os.path.isdir( csvfile ):
    # A title can contain path separators, which would otherwise point into
    # a sub-directory that does not exist; replace them for the file name.
    safeTitle = title
    for separator in ( '/', os.sep ):
        safeTitle = safeTitle.replace( separator, '_' )
    csvfile = os.path.join( csvfile, safeTitle + ".csv" )

rc = fetchRecentChanges( title )
rc = addDeltas( rc )

writeCsv( csvfile, rc )
# The doubled space reproduces the output of the original print statement.
print( "Events written to  " + csvfile )