#!/usr/bin/env python3
"""
Walks the submission directory and creates a parallel directory of
the hashed tokens.
"""
import argparse
import datetime
import hashlib
import json
import os
from pathlib import Path

import humanize


def parse_args():
    parser = argparse.ArgumentParser(description="Hash the tokenized submission files")
    parser.add_argument("basepath")
    return parser.parse_args()
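

# A sketch of what hasher() below computes: it slides a window of `hash_size`
# consecutive token values over a file's token stream, joins each window into
# one string, MD5-hashes it, and keeps only the first 8 hex digits (see the
# FIXME about truncation), yielding num - hash_size + 1 overlapping
# fingerprints per file -- the usual k-gram fingerprinting step in MOSS-style
# plagiarism detection. For example, with hash_size = 3 and token values
# ["a", "b", "c", "d"], the windows are "abc" and "bcd", and
# hashlib.md5(b"abc").hexdigest()[0:8] == "90015098".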
def hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file):
    language = lichen_run_config["language"]
    hash_size = int(lichen_run_config["hash_size"])

    data_json_path = Path(Path(__file__).resolve().parent.parent,
                          "tokenizer", "tokenizer_config.json")
    with open(data_json_path) as token_data_file:
        token_data = json.load(token_data_file)

    with open(my_tokenized_file, 'r', encoding='ISO-8859-1') as my_tf:
        with open(my_hashes_file, 'w') as my_hf:
            tokens = json.load(my_tf)
            # write an empty hashes file if the tokens file was empty (such as
            # when there is no provided code)
            if tokens is not None:
                token_values = [str(x[token_data[language]["token_value"]]) for x in tokens]
                num = len(tokens)
                # FIXME: this truncation should be adjusted after testing
                token_hashed_values = [
                    hashlib.md5(''.join(token_values[x:x + hash_size]).encode())
                    .hexdigest()[0:8]
                    for x in range(0, num - hash_size + 1)
                ]

                if len(token_hashed_values) > lichen_config["max_sequences_per_file"]:
                    token_hashed_values = token_hashed_values[:lichen_config["max_sequences_per_file"]]  # noqa: E501
                    print(f"File {my_hashes_file} truncated after exceeding max sequence limit")

                my_hf.write('\n'.join(token_hashed_values))
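

# Expected layout under <basepath>, as implied by the walk in main() below:
#   config.json
#   users/<user>/<version>/tokens.json            (hashes.txt written alongside)
#   other_gradeables/<gradeable>/<user>/<version>/tokens.json
#   provided_code/tokens.json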


def main():
    start_time = datetime.datetime.now()
    args = parse_args()

    with open(Path(args.basepath, "config.json")) as lichen_run_config_file:
        lichen_run_config = json.load(lichen_run_config_file)

    with open(Path(Path(__file__).resolve().parent.parent,
                   'bin', 'lichen_config.json')) as lichen_config_file:
        lichen_config = json.load(lichen_config_file)

    print("HASH ALL:", flush=True)
    print("[0%                      25%                     50%                     75%                    100%]\n[", end="", flush=True)  # noqa: E501

    users_dir = os.path.join(args.basepath, "users")
    if not os.path.isdir(users_dir):
        raise SystemExit("ERROR: Unable to find users directory")

    other_gradeables_dir = os.path.join(args.basepath, "other_gradeables")
    if not os.path.isdir(other_gradeables_dir):
        raise SystemExit("ERROR: Unable to find other gradeables directory")

    total_users = len(os.listdir(users_dir))
    for other_dir in os.listdir(other_gradeables_dir):
        total_users += len(os.listdir(os.path.join(other_gradeables_dir, other_dir)))

    users_hashed = 0
    percent_progress = 0
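
    # The bar prints one '|' per percentage point: users_hashed / total_users
    # is converted to a whole percent, and the delta since the last update is
    # emitted immediately (flush=True) so progress is visible while hashing.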

    # ==========================================================================
    # walk the subdirectories of this gradeable
    for user in sorted(os.listdir(users_dir)):
        user_dir = Path(users_dir, user)
        if not os.path.isdir(user_dir):
            continue

        for version in sorted(os.listdir(user_dir)):
            my_dir = Path(user_dir, version)
            if not os.path.isdir(my_dir):
                continue

            my_tokenized_file = Path(my_dir, "tokens.json")
            my_hashes_file = Path(my_dir, "hashes.txt")
            hasher(lichen_config, lichen_run_config, my_tokenized_file, my_hashes_file)

        users_hashed += 1
        if int((users_hashed / total_users) * 100) > percent_progress:
            new_percent_progress = int((users_hashed / total_users) * 100)
            print("|" * (new_percent_progress - percent_progress), end="", flush=True)
            percent_progress = new_percent_progress

    # ==========================================================================
    # walk the subdirectories of the other gradeables
    for other_gradeable in sorted(os.listdir(other_gradeables_dir)):
        other_gradeable_dir = Path(other_gradeables_dir, other_gradeable)
        if not os.path.isdir(other_gradeable_dir):
            continue

        for other_user in sorted(os.listdir(other_gradeable_dir)):
            other_user_dir = Path(other_gradeable_dir, other_user)
            if not os.path.isdir(other_user_dir):
                continue

            for other_version in sorted(os.listdir(other_user_dir)):
                other_version_dir = Path(other_user_dir, other_version)
                if not os.path.isdir(other_version_dir):
                    continue

                other_tokenized_file = Path(other_version_dir, "tokens.json")
                other_hashes_file = Path(other_version_dir, "hashes.txt")
                hasher(lichen_config, lichen_run_config, other_tokenized_file, other_hashes_file)

            users_hashed += 1
            if int((users_hashed / total_users) * 100) > percent_progress:
                new_percent_progress = int((users_hashed / total_users) * 100)
                print("|" * (new_percent_progress - percent_progress), end="", flush=True)
                percent_progress = new_percent_progress

    # ==========================================================================
    # hash the provided code
    provided_code_tokenized = Path(args.basepath, "provided_code", "tokens.json")
    provided_code_hashed = Path(args.basepath, "provided_code", "hashes.txt")
    hasher(lichen_config, lichen_run_config, provided_code_tokenized, provided_code_hashed)
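
    # Hashing the provided/starter code as well presumably lets later stages
    # discount fingerprints that every submission legitimately shares; that
    # rationale is inferred from the "no provided code" comment in hasher(),
    # not stated by this script.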

    # ==========================================================================
    print("]\nHashing done in", humanize.precisedelta(start_time, format="%1.f"))


if __name__ == "__main__":
    main()
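
# Typical invocation (the path is illustrative, not taken from this repo):
#   python3 hash_all.py /path/to/lichen_run
# where the run directory provides config.json plus the users/,
# other_gradeables/, and provided_code/ subtrees walked above.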