Skip to content

Commit 96cabca

Browse files
authored
Fixes o19s#4 - Enhancement/explicitize (o19s#37)
* Remove TMDB from top level rebuild
* add repr for Judgment
* Which tmdb json for movies
* make downloads explicit per directory
* Forgot download fixes
* Take file obj, not filename
* setup to explicit reset, create feature set
* Fix notebook judgment opening
* Missed a setup -> explicit call
* Fix netfix notebooks judg. access
* Stupid paren
* Fix feature creation on need a bigger bot
* change feature creation in netfix ES
* wrong featureset used
* filename to file obj
* correct feature store name
* Correct feature config name
* Download ranky before training
* Remove breakpoint
* Remove special download functions
1 parent 75769e5 commit 96cabca

33 files changed

+636
-345
lines changed

ltr/__init__.py

-1
Original file line number | Diff line number | Diff line change
@@ -3,5 +3,4 @@
33
from .download import download
44
from .evaluate import evaluate, rre_table
55
from .train import train
6-
from .setup import setup
76
from .search import search

ltr/client/base_client.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ def reset_ltr(self, index):
3535
pass
3636

3737
@abstractmethod
38-
def create_featureset(self, index, name, config):
38+
def create_featureset(self, index, name, ftr_config):
3939
pass
4040

4141
@abstractmethod

ltr/client/elastic_client.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -93,8 +93,8 @@ def reset_ltr(self, index):
9393
resp = requests.put(self.elastic_ep)
9494
resp_msg(msg="Initialize Default LTR feature store".format(), resp=resp)
9595

96-
def create_featureset(self, index, name, config):
97-
resp = requests.post('{}/_featureset/{}'.format(self.elastic_ep, name), json=config)
96+
def create_featureset(self, index, name, ftr_config):
97+
resp = requests.post('{}/_featureset/{}'.format(self.elastic_ep, name), json=ftr_config)
9898
resp_msg(msg="Create {} feature set".format(name), resp=resp)
9999

100100
def log_query(self, index, featureset, ids, params={}):

ltr/client/solr_client.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -90,10 +90,10 @@ def validate_featureset(self, name, config):
9090
if 'store' not in feature or feature['store'] != name:
9191
raise ValueError("Feature {} needs to be created with \"store\": \"{}\" ".format(feature['name'], name))
9292

93-
def create_featureset(self, index, name, config):
94-
self.validate_featureset(name, config)
93+
def create_featureset(self, index, name, ftr_config):
94+
self.validate_featureset(name, ftr_config)
9595
resp = requests.put('{}/{}/schema/feature-store'.format(
96-
self.solr_base_ep, index, name), json=config)
96+
self.solr_base_ep, index, name), json=ftr_config)
9797
resp_msg(msg='Created {} feature store under {}:'.format(name, index), resp=resp)
9898

9999

ltr/date_genre_judgments.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -98,8 +98,8 @@ def synthesize(client, judgmentsOutFile='genre_by_date_judgments.txt', autoNegat
9898
docId=movie['id'],
9999
keywords=negGenre)
100100
judgments.append(judgment)
101-
102-
judgments_to_file(judgmentsOutFile, judgmentsList=judgments)
101+
with open(judgmentsOutFile, 'w') as f:
102+
judgments_to_file(f, judgmentsList=judgments)
103103

104104
print('Done')
105105
return judgments

ltr/download.py

+14-35
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,22 @@
11
import requests
22
from os import path
33

4-
def download_uri(uri):
4+
def download_one(uri, dest='data/', force=False):
55
import os
66

7-
if not os.path.exists('data'):
8-
os.makedirs('data')
7+
if not os.path.exists(dest):
8+
os.makedirs(dest)
9+
10+
if not os.path.isdir(dest):
11+
raise ValueError("dest {} is not a directory".format(dest))
912

1013
filename = uri[uri.rfind('/') + 1:]
11-
filepath = 'data/{}'.format(filename)
14+
filepath = os.path.join(dest, filename)
1215
if path.exists(filepath):
13-
print(filepath + ' already exists')
14-
return
16+
if not force:
17+
print(filepath + ' already exists')
18+
return
19+
print("exists but force=True, Downloading anyway")
1520

1621
with open(filepath, 'wb') as out:
1722
print('GET {}'.format(uri))
@@ -20,32 +25,6 @@ def download_uri(uri):
2025
if chunk:
2126
out.write(chunk)
2227

23-
24-
def download():
25-
resources = [
26-
'http://es-learn-to-rank.labs.o19s.com/tmdb.json',
27-
'http://es-learn-to-rank.labs.o19s.com/blog.jsonl',
28-
'http://es-learn-to-rank.labs.o19s.com/osc_judgments.txt',
29-
'http://es-learn-to-rank.labs.o19s.com/RankyMcRankFace.jar',
30-
'http://es-learn-to-rank.labs.o19s.com/title_judgments.txt',
31-
'http://es-learn-to-rank.labs.o19s.com/title_judgments_binary.txt',
32-
'http://es-learn-to-rank.labs.o19s.com/genome_judgments.txt',
33-
'http://es-learn-to-rank.labs.o19s.com/sample_judgments_train.txt'
34-
]
35-
36-
for uri in resources:
37-
download_uri(uri)
38-
39-
print('Done.')
40-
41-
def download_msmarco():
42-
resources = [
43-
'https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz',
44-
'https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs-lookup.tsv.gz',
45-
'https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-qrels.tsv.gz',
46-
'https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz']
47-
for uri in resources:
48-
download_uri(uri)
49-
50-
print('Done.')
51-
28+
def download(uris, dest='data/', force=False):
29+
for uri in uris:
30+
download_one(uri=uri, dest=dest, force=force)

ltr/helpers/movies.py

+7-10
Original file line number | Diff line number | Diff line change
@@ -13,23 +13,22 @@ def __call__(self, *args):
1313
return self.memo[args]
1414

1515
@Memoize
16-
def load_movies():
17-
return json.load(open('data/tmdb.json'))
16+
def load_movies(json_path):
17+
return json.load(open(json_path))
1818

19-
def get_movie(tmdb_id):
20-
movies = load_movies()
19+
def get_movie(tmdb_id, movies='data/tmdb.json'):
20+
movies = load_movies(movies)
2121
tmdb_id=str(tmdb_id)
2222
return movies[tmdb_id]
2323

24-
2524
def noop(src_movie, base_doc):
2625
return base_doc
2726

2827

29-
def indexable_movies(enrich=noop):
28+
def indexable_movies(enrich=noop, movies='data/tmdb.json'):
3029
""" Generates TMDB movies, similar to how ES Bulk indexing
31-
uses a generator to generate bulk index/update actions """
32-
movies = load_movies()
30+
uses a generator to generate bulk index/update actions"""
31+
movies = load_movies(movies)
3332
idx = 0
3433
for movieId, tmdbMovie in movies.items():
3534
try:
@@ -61,5 +60,3 @@ def indexable_movies(enrich=noop):
6160
idx += 1
6261
except KeyError as k: # Ignore any movies missing these attributes
6362
continue
64-
65-

ltr/index.py

-5
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,3 @@ def rebuild(client, index, doc_src):
1717
doc_src=doc_src)
1818

1919
print('Done')
20-
21-
22-
def rebuild_tmdb(client, enrich=noop):
23-
movies=indexable_movies(enrich=enrich)
24-
rebuild(client, index='tmdb', doc_src=movies)

ltr/injectTypos.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,8 @@
88

99

1010
def typoIt(judgmentInFile, judgmentOutFile, rounds=100):
11-
currJudgments = [judg for judg in judgments_from_file(judgmentInFile)]
11+
with open(judgmentInFile) as f:
12+
currJudgments = [judg for judg in judgments_from_file(f)]
1213
lastQid = currJudgments[-1].qid
1314
judgDict = judgments_by_qid(currJudgments)
1415

@@ -32,7 +33,8 @@ def typoIt(judgmentInFile, judgmentOutFile, rounds=100):
3233
currJudgments.append(typoJudg)
3334
existingTypos.add(keywordsWTypo)
3435

35-
judgments_to_file(filename=judgmentOutFile, judgmentsList=currJudgments)
36+
with open(judgmentOutFile, 'w') as f:
37+
judgments_to_file(f, judgmentsList=currJudgments)
3638

3739

3840
if __name__ == "__main__":

ltr/judgments.py

+27-21
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,9 @@ def has_features(self):
1818
def __str__(self):
1919
return "grade:%s qid:%s (%s) docid:%s" % (self.grade, self.qid, self.keywords, self.docId)
2020

21+
def __repr__(self):
22+
return "Judgment(grade={grade},qid={qid},keywords={keywords},docId={docId},features={features},weight={weight}".format(**vars(self))
23+
2124
def toRanklibFormat(self):
2225
featuresAsStrs = ["%s:%s" % (idx+1, feature) for idx, feature in enumerate(self.features)]
2326
comment = "# %s\t%s" % (self.docId, self.keywords)
@@ -110,30 +113,33 @@ def _judgmentsFromBody(lines):
110113
#print("Not Recognized as Judgment %s" % line)
111114

112115

113-
def judgments_from_file(filename):
114-
with open(filename) as f:
115-
qidToKeywords = _queriesFromHeader(f)
116-
with open(filename) as f:
117-
lastQid = -1
118-
for grade, qid, docId, features in _judgmentsFromBody(f):
119-
if lastQid != qid and qid % 100 == 0:
120-
print("Parsing QID %s" % qid)
121-
yield Judgment(grade=grade, qid=qid,
122-
keywords=qidToKeywords[qid][0],
123-
weight=qidToKeywords[qid][1],
124-
docId=docId,
125-
features=features)
126-
lastQid = qid
127-
128-
129-
def judgments_to_file(filename, judgmentsList):
116+
def judgments_from_file(f):
117+
""" Read judgments from a SVMRank File
118+
f is a file object
119+
"""
120+
qidToKeywords = _queriesFromHeader(f)
121+
lastQid = -1
122+
for grade, qid, docId, features in _judgmentsFromBody(f):
123+
if lastQid != qid and qid % 100 == 0:
124+
print("Parsing QID %s" % qid)
125+
yield Judgment(grade=grade, qid=qid,
126+
keywords=qidToKeywords[qid][0],
127+
weight=qidToKeywords[qid][1],
128+
docId=docId,
129+
features=features)
130+
lastQid = qid
131+
132+
133+
def judgments_to_file(f, judgmentsList):
134+
""" Write judgments from a SVMRank File
135+
f is a file object
136+
"""
130137
judgToQid = judgments_by_qid(judgmentsList) #Pretty hideosly slow stuff
131138
fileHeader = _queriesToHeader({qid: (judgs[0].keywords, judgs[0].weight) for qid, judgs in judgToQid.items()})
132139
judgByQid = sorted(judgmentsList, key=lambda j: j.qid)
133-
with open(filename, 'w+') as f:
134-
f.write(fileHeader)
135-
for judg in judgByQid:
136-
f.write(judg.toRanklibFormat() + '\n')
140+
f.write(fileHeader)
141+
for judg in judgByQid:
142+
f.write(judg.toRanklibFormat() + '\n')
137143

138144

139145

ltr/log.py

+6-3
Original file line number | Diff line number | Diff line change
@@ -57,8 +57,10 @@ def log_features(client, index, judgments_by_qid, featureSet):
5757
def judgments_to_training_set(client, judgmentInFile, featureSet, trainingOutFile='judgments_wfeatures.txt', index='tmdb'):
5858
from .judgments import judgments_to_file, judgments_from_file, judgments_by_qid
5959

60-
judgments = judgments_from_file(judgmentInFile)
61-
judgments = judgments_by_qid(judgments)
60+
judgments = []
61+
with open(judgmentInFile) as f:
62+
judgments = judgments_from_file(f)
63+
judgments = judgments_by_qid(judgments)
6264
log_features(client, index, judgments, featureSet=featureSet)
6365

6466
judgmentsAsList = []
@@ -71,5 +73,6 @@ def judgments_to_training_set(client, judgmentInFile, featureSet, trainingOutFil
7173
discarded.append(judgment)
7274
print("Discarded %s Keep %s" % (len(discarded), len(judgmentsAsList)))
7375

74-
judgments_to_file(filename=trainingOutFile, judgmentsList=judgmentsAsList)
76+
with open(trainingOutFile, 'w+') as f:
77+
judgments_to_file(f, judgmentsList=judgmentsAsList)
7578
return judgments

ltr/setup.py

-3
This file was deleted.

ltr/train.py

+12-2
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,14 @@
11
import os
22
from ltr.helpers.ranklib_result import parse_training_log
3+
from ltr import download
4+
5+
def check_for_rankymcrankface():
6+
""" Ensure ranky jar is in a temp dir somewhere..."""
7+
ranky_url='http://es-learn-to-rank.labs.o19s.com/RankyMcRankFace.jar'
8+
import tempfile
9+
tempdir = tempfile.gettempdir()
10+
download([ranky_url], dest=tempdir, force=False)
11+
return os.path.join(tempdir, 'RankyMcRankFace.jar')
312

413

514
def trainModel(training, out, features=None, kcv=None, ranker=6,
@@ -15,8 +24,9 @@ def trainModel(training, out, features=None, kcv=None, ranker=6,
1524
srate - what proportion of the queries should be examined for each ensemble
1625
"""
1726

18-
cmd = 'java -jar data/RankyMcRankFace.jar -ranker {} -shrinkage {} -metric2t {} -tree {} -bag {} -leaf {} -frate {} -srate {} -train {} -save {} '.format(
19-
ranker, shrinkage, metric2t, trees, bag, leafs, frate, srate, training, out)
27+
ranky_loc = check_for_rankymcrankface()
28+
cmd = 'java -jar {} -ranker {} -shrinkage {} -metric2t {} -tree {} -bag {} -leaf {} -frate {} -srate {} -train {} -save {} '.format(
29+
ranky_loc, ranker, shrinkage, metric2t, trees, bag, leafs, frate, srate, training, out)
2030

2131
if features is not None:
2232
with open('data/features.txt', 'w') as f:

notebooks/elasticsearch/osc-blog/osc-blog.ipynb

+7-4
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,10 @@
1414
"outputs": [],
1515
"source": [
1616
"from ltr import download\n",
17-
"download();"
17+
"corpus='http://es-learn-to-rank.labs.o19s.com/blog.jsonl'\n",
18+
"judgments='http://es-learn-to-rank.labs.o19s.com/osc_judgments.txt'\n",
19+
"\n",
20+
"download([corpus, judgments], dest='data/');"
1821
]
1922
},
2023
{
@@ -89,6 +92,8 @@
8992
"metadata": {},
9093
"outputs": [],
9194
"source": [
95+
"client.reset_ltr(index='tmdb')\n",
96+
"\n",
9297
"config = {\n",
9398
" \"featureset\": {\n",
9499
" \"features\": [\n",
@@ -226,9 +231,7 @@
226231
" }\n",
227232
"}\n",
228233
"\n",
229-
"\n",
230-
"from ltr import setup\n",
231-
"setup(client, config=config, index='blog', featureset='test')"
234+
"client.create_featureset(index='blog', name='test', ftr_config=config)"
232235
]
233236
},
234237
{

notebooks/elasticsearch/tmdb/es-ltr-basics-project.ipynb

+12-7
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,10 @@
1717
"outputs": [],
1818
"source": [
1919
"from ltr import download\n",
20-
"download();"
20+
"corpus='http://es-learn-to-rank.labs.o19s.com/tmdb.json'\n",
21+
"judgments='http://es-learn-to-rank.labs.o19s.com/title_judgments.txt'\n",
22+
"\n",
23+
"download([corpus, judgments], dest='data/');"
2124
]
2225
},
2326
{
@@ -26,8 +29,11 @@
2629
"metadata": {},
2730
"outputs": [],
2831
"source": [
29-
"from ltr.index import rebuild_tmdb\n",
30-
"rebuild_tmdb(client)"
32+
"from ltr.index import rebuild\n",
33+
"from ltr.helpers.movies import indexable_movies\n",
34+
"\n",
35+
"movies=indexable_movies(movies='data/tmdb.json')\n",
36+
"rebuild(client, index='tmdb', doc_src=movies)"
3137
]
3238
},
3339
{
@@ -36,6 +42,8 @@
3642
"metadata": {},
3743
"outputs": [],
3844
"source": [
45+
"client.reset_ltr(index='tmdb')\n",
46+
"\n",
3947
"config = {\"validation\": {\n",
4048
" \"index\": \"tmdb\",\n",
4149
" \"params\": {\n",
@@ -63,10 +71,7 @@
6371
" }}\n",
6472
"\n",
6573
"\n",
66-
"\n",
67-
"\n",
68-
"from ltr import setup\n",
69-
"setup(client, config=config, index='tmdb', featureset='movies')"
74+
"client.create_featureset(index='tmdb', name='movies', ftr_config=config)"
7075
]
7176
},
7277
{

notebooks/elasticsearch/tmdb/evaluation.ipynb

+1-1
Original file line number | Diff line number | Diff line change
@@ -91,7 +91,7 @@
9191
"name": "python",
9292
"nbconvert_exporter": "python",
9393
"pygments_lexer": "ipython3",
94-
"version": "3.6.5"
94+
"version": "3.6.1"
9595
}
9696
},
9797
"nbformat": 4,

0 commit comments

Comments
 (0)