-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgz_mongo.py
217 lines (206 loc) · 8.76 KB
/
gz_mongo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
import re
from random import random
import gz2
import gz3
import gz4_sloan_ukidss as sloan_ukidss
import gz4_ferengi as ferengi
import gz4_candels as candels
from pprint import pprint
from bson import objectid
#=====================================================
#function to split a list on one or more values; used to pull out
#the "anything odd" nodes
#taken from http://stackoverflow.com/questions/4322705/split-a-list-into-nested-lists-on-a-value
def ssplit2(seq, splitters):
    """Split ``seq`` into sub-lists at every element found in ``splitters``.

    seq       -- any iterable; it is copied into a list first
    splitters -- iterable of values that act as separators

    Returns a list of lists. Separator elements are dropped and empty
    pieces (leading, trailing or between adjacent separators) are not
    emitted. If ``seq`` or ``splitters`` is empty, or none of the
    splitters occurs in ``seq``, the result is ``[list(seq)]``.
    """
    seq = list(seq)
    if not (splitters and seq):
        return [seq]
    # keep only the separators that actually occur in seq
    present = set(splitters).intersection(seq)
    if not present:
        return [seq]
    result = []
    begin = 0  # index where the piece currently being collected starts
    for end, item in enumerate(seq):
        if item in present:
            if end > begin:
                result.append(seq[begin:end])
            begin = end + 1
    # whatever follows the last separator
    if begin < len(seq):
        result.append(seq[begin:])
    return result
#=====================================================
class GZ_base:
    """Shared machinery for turning one subject's classifications into node links.

    Concrete surveys set these attributes:
      survey_name  -- value matched against 'metadata.survey' of a subject
      survey       -- object providing ``convert``, ``valid_path`` and ``nodes``
      connect      -- object whose ``db`` attribute exposes the collections
                      queried below
      anything_odd -- annotation key whose answer is handled as an
                      "anything odd" answer in get_links_mongo
      get_links    -- the link builder to use (get_links_mongo or get_links_sql)
    """

    def __init__(self):
        self.survey_name = ''
        self.survey = None
        self.connect = None
        self.debug = False
        self.anything_odd = ''
        self.get_links = self.get_links_mongo

    def get_links_mongo(self, m_id, weight=False):
        """Tally links for classifications whose annotations are key/value dicts.

        m_id   -- the mongo "_id" value of the galaxy subject
        weight -- accepted so the signature matches get_links_sql; it is not
                  used here, every classification counts as 1

        Sets ``self.links`` (dicts with 'source', 'target', 'value') and
        ``self.odd_list`` (dicts with 'name', 'value', largest value first).
        """
        # find all classifications with the proper subject id
        classifications = self.connect.db.galaxy_zoo_classifications.find({'subject_ids': m_id})
        vote_dict = {}
        odd_dict = {}
        for classification in classifications:
            w = 1
            votes = [0]  # every path starts at node 0
            for annotation in classification['annotations']:
                # only the first (key, answer) pair of an annotation is read
                vote = next(iter(annotation.items()))
                if (vote[0] == self.anything_odd) and (not isinstance(vote[1], list)):
                    # normalise a single "anything odd" answer to a list
                    vote = (vote[0], [vote[1]])
                if isinstance(vote[1], list):
                    # this is the answer to "anything odd"
                    for answer in vote[1]:
                        if (answer[0] == 'x') or (self.survey_name == 'candels'):
                            key = self.survey.convert[answer]
                            odd_dict[key] = odd_dict.get(key, 0) + w
                else:
                    if vote[0] not in ['lang', 'user_agent']:
                        votes.append(self.survey.convert[vote])
            is_valid = self.survey.valid_path(votes)
            if self.debug:
                print('%s %s' % (votes, is_valid))
            if is_valid:
                votes = [v for v in votes if v != -1]
                for key in zip(votes[:-1], votes[1:]):
                    vote_dict[key] = vote_dict.get(key, 0) + w
        odd_list = [list(item) for item in odd_dict.items()]
        if self.debug:
            print(odd_list)
        # ascending sort followed by a reversal: largest count first
        odd_list.sort(key=lambda x: x[1])
        odd_list = odd_list[::-1]
        self.odd_list = [{'name': k[0], 'value': k[1]} for k in odd_list]
        self.links = [{'source': k[0], 'target': k[1], 'value': v} for k, v in vote_dict.items()]

    def get_links_sql(self, m_id, weight=False):
        """Tally links for SQL-converted classifications (annotations are node lists).

        m_id   -- the mongo "_id" value of the galaxy subject
        weight -- when true, each classification counts as
                  min(1, (k_mean / 0.6) ** 8.5) using the user's 'k_mean'
                  record; users without a record count as 1

        Sets ``self.links``, ``self.odd_list`` (same layout as in
        get_links_mongo) and ``self.user_id_sql`` (user id -> accepted path).
        """
        # The annotations field is different for the SQL converted subjects
        classifications = self.connect.db.galaxy_zoo_classifications.find({'subject_ids': m_id})
        path_dict = {}
        odd_dict = {}
        self.user_id_sql = {}
        for classification in classifications:
            w = 1
            path = classification['annotations']
            if weight:
                user_query = {'user_id_sql': classification['user_id_sql']}
                if self.survey_name == 'GZ2':
                    k_mean = self.connect.db.gz2_sql_users.find_one(user_query)
                else:
                    k_mean = self.connect.db.gz3_sql_users.find_one(user_query)
                # sometimes a vote path is empty and is not in the user_id list
                if k_mean is not None:
                    w = min(1, (k_mean['k_mean'] / 0.6) ** 8.5)
            # detect the indices answering the first question in a classification;
            # this avoids the issue when people hit the reset button
            idx_first = [pos for pos, node in enumerate(path) if node in (1, 2, 3)]
            # only take the final time through the tree
            if idx_first:  # sometimes the first node is missing!
                path = path[idx_first[-1]:]
            # NOTE(review): node 0 is prepended unconditionally here - confirm
            # against the original indentation of this statement
            path = [0] + path
            # check that all votes make a valid path through the tree
            # (no missing or repeated nodes!)
            if self.survey.valid_path(path):
                self.user_id_sql[classification['user_id_sql']] = path
                if 14 in path:
                    # could be more than 1 anything odd answer; the unpacking
                    # expects exactly two pieces around the 14 and raises
                    # ValueError otherwise
                    before_odd, odd_answers = ssplit2(path, [14])
                    for odd_answer in odd_answers:
                        odd_dict[odd_answer] = odd_dict.get(odd_answer, 0) + w
                    path = before_odd
                elif 15 in path:
                    # drop the final entry of the path
                    path = path[:-1]
                for key in zip(path[:-1], path[1:]):
                    path_dict[key] = path_dict.get(key, 0) + w
        odd_list = [list(item) for item in odd_dict.items()]
        # ascending sort followed by a reversal: largest count first
        odd_list.sort(key=lambda x: x[1])
        odd_list = odd_list[::-1]
        self.odd_list = [{'name': k[0], 'value': k[1]} for k in odd_list]
        self.links = [{'source': k[0], 'target': k[1], 'value': v} for k, v in path_dict.items()]

    def _merge_weighted(self, raw, weighted, fields):
        """Add '_value' = [raw value, weighted value] to every entry of ``raw``.

        raw, weighted -- lists of dicts that each carry a 'value'
        fields        -- names of the keys that identify an entry

        Entries are matched on ``fields`` rather than on list position: the
        odd lists are sorted by value, so the raw and the weighted list need
        not be in the same order. An entry with no weighted counterpart
        reuses its raw value. The dicts of ``raw`` are updated in place and
        returned in their original order.
        """
        weighted_by_id = {}
        for entry in weighted:
            weighted_by_id[tuple(entry[f] for f in fields)] = entry['value']
        merged = []
        for entry in raw:
            ident = tuple(entry[f] for f in fields)
            entry['_value'] = [entry['value'], weighted_by_id.get(ident, entry['value'])]
            merged.append(entry)
        return merged

    def run(self, argv='180 0'):
        """Look up one subject and return the data describing its vote paths.

        argv -- 'random' picks a subject of this survey through its 'random'
                field; a single token is used as a zooniverse id; two tokens
                are read as "ra dec" and the subject nearest to
                [ra-180, dec] in this survey is used

        Returns a dict with the keys 'nodes', 'links', 'links_weight',
        'image_url', 'ra', 'dec', 'gal_name', 'odd_list', 'metadata', 'talk'
        and 'exam'. If the query matches no subject the lookup result cannot
        be subscripted and the call fails.
        """
        subjects = self.connect.db.galaxy_zoo_subjects
        if argv == 'random':
            s = subjects.find_one({'metadata.survey': self.survey_name, 'random': {'$gt': random()}})
        else:
            tokens = re.findall(r"[\w.-]+", argv)
            if len(tokens) == 1:
                s = subjects.find_one({'zooniverse_id': tokens[0]})
            else:
                ra, dec = map(float, tokens)
                s = subjects.find_one({'location_geo': {'$near': {'$geometry': {'type': 'Point', 'coordinates': [ra - 180, dec]}}}, 'metadata.survey': self.survey_name})
        if self.debug:
            pprint(s)
        # get the weighted links
        self.get_links(s['_id'], weight=True)
        self.links_weight = self.links[:]
        self.odd_list_weight = self.odd_list[:]
        # get the raw links
        self.get_links(s['_id'], weight=False)
        # add weighted votes into links, matching entries by identity
        all_links = self._merge_weighted(self.links, self.links_weight, ('source', 'target'))
        all_odd = self._merge_weighted(self.odd_list, self.odd_list_weight, ('name',))
        hidden = ['_id', 'project_id', 'location_geo', 'random', 'zooniverse_id', 'group_id', 'workflow_ids', 'group', 'follower_ids']
        s_strip = {k: v for k, v in s.items() if k not in hidden}
        metadata = scrub_dict(s_strip)
        if self.survey_name in ['sloan', 'ukidss', 'ferengi', 'candels']:
            talk = 'http://talk.galaxyzoo.org/#/subjects/' + s['zooniverse_id']
            exam = 'http://www.galaxyzoo.org/#/examine/' + s['zooniverse_id']
        else:
            talk = ''
            exam = ''
        return {
            'nodes': self.survey.nodes,
            'links': all_links,
            'links_weight': self.links_weight,
            'image_url': s['location']['standard'],
            'ra': s['coords'][0],
            'dec': s['coords'][1],
            'gal_name': s['zooniverse_id'],
            'odd_list': all_odd,
            'metadata': metadata,
            'talk': talk,
            'exam': exam,
        }
def scrub_dict(d):
    """Return a copy of ``d`` without placeholder values and ObjectIds.

    Entries whose value equals -9999, None, "null" or -999, or whose value
    is a bson ObjectId, are left out. Values that are dicts are scrubbed
    recursively; dicts nested inside lists are not visited. ``d`` itself is
    not modified.
    """
    placeholders = [-9999, None, "null", -999]
    cleaned = {}
    for key, value in d.items():
        if value in placeholders or isinstance(value, objectid.ObjectId):
            continue
        # build a fresh dict instead of assigning into the one being iterated
        cleaned[key] = scrub_dict(value) if isinstance(value, dict) else value
    return cleaned
class GZ2(GZ_base):
    """Configuration for the 'GZ2' survey: gz2 tree module, SQL-style link builder."""

    def __init__(self, connect, debug=False):
        # survey identity
        self.survey_name = 'GZ2'
        self.survey = gz2
        self.anything_odd = None
        # runtime wiring
        self.connect, self.debug = connect, debug
        self.get_links = self.get_links_sql
class GZ3(GZ_base):
    """Configuration for the 'Hubble' survey: gz3 tree module, SQL-style link builder."""

    def __init__(self, connect, debug=False):
        # survey identity
        self.survey_name = 'Hubble'
        self.survey = gz3
        self.anything_odd = None
        # runtime wiring
        self.connect, self.debug = connect, debug
        self.get_links = self.get_links_sql
class GZ4_sloan(GZ_base):
    """Configuration for the 'sloan' survey: sloan_ukidss tree module, mongo-style link builder."""

    def __init__(self, connect, debug=False):
        # survey identity
        self.survey_name = 'sloan'
        self.survey = sloan_ukidss
        self.anything_odd = 'sloan-6'
        # runtime wiring
        self.connect, self.debug = connect, debug
        self.get_links = self.get_links_mongo
class GZ4_ukidss(GZ_base):
    """Configuration for the 'ukidss' survey: sloan_ukidss tree module, mongo-style link builder."""

    def __init__(self, connect, debug=False):
        # survey identity
        self.survey_name = 'ukidss'
        self.survey = sloan_ukidss
        self.anything_odd = 'ukidss-6'
        # runtime wiring
        self.connect, self.debug = connect, debug
        self.get_links = self.get_links_mongo
class GZ4_ferengi(GZ_base):
    """Configuration for the 'ferengi' survey: ferengi tree module, mongo-style link builder."""

    def __init__(self, connect, debug=False):
        # survey identity
        self.survey_name = 'ferengi'
        self.survey = ferengi
        self.anything_odd = 'ferengi-17'
        # runtime wiring
        self.connect, self.debug = connect, debug
        self.get_links = self.get_links_mongo
class GZ4_candels(GZ_base):
    """Configuration for the 'candels' survey: candels tree module, mongo-style link builder."""

    def __init__(self, connect, debug=False):
        # survey identity
        self.survey_name = 'candels'
        self.survey = candels
        self.anything_odd = 'candels-16'
        # runtime wiring
        self.connect, self.debug = connect, debug
        self.get_links = self.get_links_mongo