syncProjectMemberEdits.py
#! /usr/bin/env python
# Copyright 2013 Mdgilbert
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
FOR FINAL PAPER - MAY NOT USE THIS, SEE getCoordinationEdits.py -
This script is intended to grab the details of revisions project members
make during their membership.
Membership is defined as any user who has a link to their user page from a
project page, sub-page, or template transcluded on either of those pages
(excluding talk pages).
"""
# Import local tools
from pycommon.util.util import *
from pycommon.db.db import db
# Make sure we're dealing with utf-8 strings
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
# Allow threading
import Queue
import threading
import time
# We'll need to grab user links from Wikipedia API
import urllib2
# Need the unquote_plus function
import urllib
# And BeautifulSoup to parse the returned html
from bs4 import BeautifulSoup
# Regular expressions needed to parse user links
import re
# To print the stack trace if we error out
import traceback
# From mako
## THIS IS MAGIC I FOUND ON THE INTERNET
import urlparse  # re is already imported above
def urlEncodeNonAscii(b):
    return re.sub('[\x80-\xFF]', lambda c: '%%%02x' % ord(c.group(0)), b)

def iriToUri(iri):
    parts = urlparse.urlparse(iri)
    return urlparse.urlunparse(
        part.encode('idna') if parti == 1 else urlEncodeNonAscii(part.encode('utf-8'))
        for parti, part in enumerate(parts))
## END MAGIC
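# Usage sketch for the helper above (assumes Python 2 byte/unicode semantics;
# the page title is illustrative). It percent-encodes non-ASCII characters so
# the result is safe to hand to urllib2:
#   iriToUri(u'http://en.wikipedia.org/wiki/Café')
#   # -> 'http://en.wikipedia.org/wiki/Caf%c3%a9'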
debug = 0
threads = 6
queue = Queue.Queue(threads)
ww = get_ww()
localDb = "reflex_relations_2014"
remoteDb = "enwiki_p_local"
class syncMemberEdits(threading.Thread):
    """ Threaded approach to syncing member edits """
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue
        # Per-thread connections to the local and remote databases
        self.ldb = db(localDb, self.getName())
        self.rdb = db(remoteDb, self.getName())
    def run(self):
        """
        Gets all edits project members make, records them locally
        """
        while True:
            project = self.queue.get()
            # We're done
            out("[%s] Completed inserting member edits" % (project["p_title"]))
            self.queue.task_done()
def main():
    ldb = db(localDb)
    # Spawn a pool of daemon worker threads
    for i in range(threads):
        m = syncMemberEdits(queue)
        m.setDaemon(True)
        m.start()
    # Fetch the projects we're interested in
    query = 'select * from project where p_title in ("WikiProject_Feminism", "WikiProject_Piracy", "WikiProject_Medicine", "WikiProject_Plants", "WikiProject_Chemistry", "WikiProject_Spoken_Wikipedia", "WikiProject_Countering_systemic_bias", "WikiProject_Copyright_Cleanup", "WikiProject_Missing_encyclopedic_articles", "WikiProject_Outreach")'
    lc = ldb.execute(query)
    rows = lc.fetchall()
    for row in rows:
        queue.put(row)
    # Wait on the queue until everything is done
    queue.join()
    ldb.close()

if __name__ == "__main__":
    main()
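# Invocation sketch (assumes the pycommon package is importable and the
# "reflex_relations_2014" and "enwiki_p_local" databases are reachable;
# Python 2 only, given reload(sys), Queue, urllib2, and urlparse):
#   $ python syncProjectMemberEdits.py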