Skip to content

Commit b4a76fd

Browse files
committed
fix combineAllJson
1 parent 37bde48 commit b4a76fd

6 files changed

+77
-62
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
config.json
2-
old
2+
old
3+
*.pyc
File renamed without changes.

combined/combinedAllLabels.json

+1
Large diffs are not rendered by default.

combined/combinedTokenLabels.json

+1
Large diffs are not rendered by default.

main.py

+72-40
Original file line numberDiff line numberDiff line change
@@ -11,56 +11,59 @@ def login():
1111
driver.get('https://etherscan.io/login')
1212
driver.implicitly_wait(5)
1313
driver.find_element("id",
14-
"ContentPlaceHolder1_txtUserName").send_keys(config['ETHERSCAN_USER'])
14+
"ContentPlaceHolder1_txtUserName").send_keys(config['ETHERSCAN_USER'])
1515
driver.find_element(
16-
"id","ContentPlaceHolder1_txtPassword").send_keys(config['ETHERSCAN_PASS'])
16+
"id", "ContentPlaceHolder1_txtPassword").send_keys(config['ETHERSCAN_PASS'])
1717

1818
input("Press enter once logged in")
1919

2020

2121
# Retrieves label information and saves it as JSON/CSV
2222
def getLabel(label, label_type="account", input_type='single'):
23-
baseUrl = 'https://etherscan.io/{}s/label/{}?subcatid={}&size=100&start={}' # https://etherscan.io/tokens/label/gaming?subcatid=undefined&size=100&start=0
23+
baseUrl = 'https://etherscan.io/{}s/label/{}?subcatid={}&size=100&start={}'
2424
index = 0 # Initialize start index at 0
2525
table_list = []
2626

27-
driver.get(baseUrl.format(label_type, label, 'undefined',index))
27+
driver.get(baseUrl.format(label_type, label, 'undefined', index))
2828
driver.implicitly_wait(5)
29-
29+
3030
# Find all elements using driver.find_elements where class matches "nav-link"
3131
# This is used to find all subcategories
32-
elems = driver.find_elements("class name","nav-link")
32+
elems = driver.find_elements("class name", "nav-link")
3333
subcat_id_list = []
3434

3535
# Loop through elems and append all values to subcat_id_list
3636
for elem in elems:
3737
elemVal = elem.get_attribute("val")
38-
#print(elem.text,elemVal,elem.get_attribute("class")) # Used for debugging elements returned
39-
if (elemVal is not None): subcat_id_list.append(elemVal)
38+
# print(elem.text,elemVal,elem.get_attribute("class")) # Used for debugging elements returned
39+
if (elemVal is not None):
40+
subcat_id_list.append(elemVal)
4041

41-
print(label,'subcat_values:',subcat_id_list)
42+
print(label, 'subcat_values:', subcat_id_list)
4243

4344
# Bug fix: when no subcat ids are found (aka ONLY MAIN), we manually add 'undefined' to subcat_id_list
44-
if (len(subcat_id_list) == 0): subcat_id_list.append('undefined')
45+
if (len(subcat_id_list) == 0):
46+
subcat_id_list.append('undefined')
4547

46-
for table_index,subcat_id in enumerate(subcat_id_list):
48+
for table_index, subcat_id in enumerate(subcat_id_list):
4749
index = 0 # Initialize start index at 0
4850
driver.implicitly_wait(5)
49-
driver.get(baseUrl.format(label_type, label, subcat_id,index))
50-
time.sleep(5) #TODO: allow customization by args
51+
driver.get(baseUrl.format(label_type, label, subcat_id, index))
52+
time.sleep(5) # TODO: allow customization by args
5153

5254
while (True):
53-
print('Index:', index,'Subcat:',subcat_id)
55+
print('Index:', index, 'Subcat:', subcat_id)
5456

5557
try:
5658
# Select relevant table from multiple tables in the page, based on current table index
5759
curTable = pd.read_html(driver.page_source)[table_index]
5860
if label_type == "account":
59-
curTable = curTable[:-1] # Remove last item which is just sum
61+
# Remove last item which is just sum
62+
curTable = curTable[:-1]
6063
print(curTable)
6164

6265
# Retrieve all addresses from table
63-
elems = driver.find_elements("xpath","//tbody//a[@href]")
66+
elems = driver.find_elements("xpath", "//tbody//a[@href]")
6467
addressList = []
6568
addrIndex = len('https://etherscan.io/address/')
6669
for elem in elems:
@@ -70,7 +73,7 @@ def getLabel(label, label_type="account", input_type='single'):
7073

7174
# Replace address column in newTable dataframe with addressList
7275
curTable['Address'] = addressList
73-
except Exception as e:
76+
except Exception as e:
7477
print(e)
7578
print(label, "Skipping label due to error")
7679
return
@@ -81,17 +84,19 @@ def getLabel(label, label_type="account", input_type='single'):
8184
if (len(curTable.index) == 100):
8285
if label_type == "account":
8386
index += 100
84-
driver.get(baseUrl.format(label_type, label, subcat_id,index))
87+
driver.get(baseUrl.format(
88+
label_type, label, subcat_id, index))
8589
if label_type == "token":
86-
next_icon_elems = driver.find_elements("class name", "fa-chevron-right")
90+
next_icon_elems = driver.find_elements(
91+
"class name", "fa-chevron-right")
8792
next_icon_elems[0].click()
88-
time.sleep(5) #TODO: allow customization by args
93+
time.sleep(5) # TODO: allow customization by args
8994
else:
9095
break
9196

9297
df = pd.concat(table_list) # Combine all dataframes
9398
df.fillna('', inplace=True) # Replace NaN as empty string
94-
df.index = range(len(df.index)) # Fix index for df
99+
df.index = range(len(df.index)) # Fix index for df
95100

96101
# Print the length and save as a CSV
97102
print(label, 'Df length:', len(df.index))
@@ -100,10 +105,10 @@ def getLabel(label, label_type="account", input_type='single'):
100105
# Save as json object with mapping address:nameTag
101106
if label_type == "account":
102107
addressNameDict = dict([(address, nameTag)
103-
for address, nameTag in zip(df.Address, df['Name Tag'])])
108+
for address, nameTag in zip(df.Address, df['Name Tag'])])
104109
if label_type == "token":
105110
addressNameDict = dict([(address, nameTag)
106-
for address, nameTag in zip(df.Address, df['Token Name'])])
111+
for address, nameTag in zip(df.Address, df['Token Name'])])
107112
with open('{}s/{}.json'.format(label_type, label), 'w', encoding='utf-8') as f:
108113
json.dump(addressNameDict, f, ensure_ascii=True)
109114

@@ -117,7 +122,7 @@ def getLabel(label, label_type="account", input_type='single'):
117122

118123
# Combines all account and token JSON into combinedAccountLabels.json, combinedTokenLabels.json and combinedAllLabels.json
119124
def combineAllJson():
120-
combinedJSON = {}
125+
combinedAccountJSON = {}
121126

122127
# iterating over all files
123128
for files in os.listdir('./accounts'):
@@ -126,21 +131,45 @@ def combineAllJson():
126131
with open('./accounts/{}'.format(files)) as f:
127132
dictData = json.load(f)
128133
for address, nameTag in dictData.items():
129-
if address not in combinedJSON:
130-
combinedJSON[address] = {'name': nameTag, 'labels': []}
131-
combinedJSON[address]['labels'].append(files[:-5])
134+
if address not in combinedAccountJSON:
135+
combinedAccountJSON[address] = {
136+
'name': nameTag, 'labels': []}
137+
combinedAccountJSON[address]['labels'].append(files[:-5])
138+
else:
139+
continue
140+
141+
combinedTokenJSON = {}
142+
for files in os.listdir('./tokens'):
143+
if files.endswith('json'):
144+
print(files) # printing file name of desired extension
145+
with open('./tokens/{}'.format(files)) as f:
146+
dictData = json.load(f)
147+
for address, nameTag in dictData.items():
148+
if address not in combinedTokenJSON:
149+
combinedTokenJSON[address] = {
150+
'name': nameTag, 'labels': []}
151+
combinedTokenJSON[address]['labels'].append(files[:-5])
132152
else:
133153
continue
134154

135-
with open('combined/combinedLabels.json', 'w', encoding='utf-8') as f:
136-
json.dump(combinedJSON, f, ensure_ascii=True)
155+
combinedAllJSON = {
156+
key: { **combinedAccountJSON.get(key, {}), **combinedTokenJSON.get(key, {}) }
157+
for key in set(list(combinedAccountJSON.keys())+list(combinedTokenJSON.keys()))
158+
}
159+
160+
with open('combined/combinedAccountLabels.json', 'w', encoding='utf-8') as f:
161+
json.dump(combinedAccountJSON, f, ensure_ascii=True)
162+
with open('combined/combinedTokenLabels.json', 'w', encoding='utf-8') as f:
163+
json.dump(combinedTokenJSON, f, ensure_ascii=True)
164+
with open('combined/combinedAllLabels.json', 'w', encoding='utf-8') as f:
165+
json.dump(combinedAllJSON, f, ensure_ascii=True)
137166

138167
# Retrieves all labels from labelcloud and saves as JSON/CSV
139168
def getAllLabels():
140169
driver.get('https://etherscan.io/labelcloud')
141170
driver.implicitly_wait(5)
142171

143-
elems = driver.find_elements("xpath","//a[@href]")
172+
elems = driver.find_elements("xpath", "//a[@href]")
144173
labels = []
145174
labelIndex = len('https://etherscan.io/accounts/label/')
146175
for elem in elems:
@@ -174,19 +203,22 @@ def getAllLabels():
174203
# Proceed to combine all addresses into single JSON after retrieving all.
175204
combineAllJson()
176205

206+
177207
# Large size: Eth2/gnosis, Bugged: Liqui, NoData: Remaining labels
178208
ignore_list = ['eth2-depositor', 'gnosis-safe-multisig', 'liqui.io', 'education', 'electronics',
179-
'flashbots', 'media', 'music', 'network', 'prediction-market', 'real-estate', 'vpn', 'beacon-depositor','uniswap']
209+
'flashbots', 'media', 'music', 'network', 'prediction-market', 'real-estate', 'vpn', 'beacon-depositor', 'uniswap']
180210
with open('config.json', 'r') as f:
181211
config = json.load(f)
182212

183-
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
184213

185-
login()
186-
retrievalType = input('Enter retrieval type (single/all): ')
187-
if (retrievalType == 'all'):
188-
getAllLabels()
189-
else:
190-
singleLabel = input('Enter label of interest: ')
191-
label_type = input('Enter label type (account/token): ')
192-
getLabel(singleLabel, label_type)
214+
if __name__ == "__main__":
215+
driver = webdriver.Chrome(service=ChromeService(
216+
ChromeDriverManager().install()))
217+
login()
218+
retrievalType = input('Enter retrieval type (single/all): ')
219+
if (retrievalType == 'all'):
220+
getAllLabels()
221+
else:
222+
singleLabel = input('Enter label of interest: ')
223+
label_type = input('Enter label type (account/token): ')
224+
getLabel(singleLabel, label_type)

simpleCombineAllJson.py

+1-21
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,6 @@
11

22
import json
33
import os.path
4-
5-
# Combines all JSON into a single file combinedLabels.json
6-
def combineAllJson():
7-
combinedJSON = {}
8-
9-
# iterating over all files
10-
for files in os.listdir('./data'):
11-
if files.endswith('json'):
12-
print(files) # printing file name of desired extension
13-
with open('./data/{}'.format(files)) as f:
14-
dictData = json.load(f)
15-
for address, nameTag in dictData.items():
16-
if address not in combinedJSON:
17-
combinedJSON[address] = {'name': nameTag, 'labels': []}
18-
combinedJSON[address]['labels'].append(files[:-5])
19-
else:
20-
continue
21-
22-
with open('combined/combinedLabels.json', 'w', encoding='utf-8') as f:
23-
json.dump(combinedJSON, f, ensure_ascii=True)
24-
4+
from main import combineAllJson
255

266
combineAllJson()

0 commit comments

Comments
 (0)