Skip to content

Commit 2373dd6

Browse files
committed
fixing mlflow code and adding functionalities
1 parent b477b26 commit 2373dd6

7 files changed

+74
-74
lines changed

initialize.py

+19-11
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22
import pandas as pd
33
import lightgbm as lgb
44
import pickle
5-
import pdb
65
import warnings
6+
import argparse
77
import os
8+
import pdb
89

910
from pathlib import Path
10-
from train.train_hyperopt import LGBOptimizer
1111
from utils.preprocess_data import build_train
1212

1313

@@ -50,24 +50,32 @@ def download_data():
5050
df_test.to_csv(test_path)
5151

5252

53-
5453
def create_data_processor():
5554
print("creating preprocessor...")
5655
dataprocessor = build_train(TRAIN_PATH/'train.csv', DATAPROCESSORS_PATH)
5756

5857

59-
60-
def create_model():
58+
def create_model(hyper):
6159
print("creating model...")
6260
init_dataprocessor = 'dataprocessor_0_.p'
6361
dtrain = pickle.load(open(DATAPROCESSORS_PATH/init_dataprocessor, 'rb'))
62+
if hyper == "hyperopt":
63+
# from train.train_hyperopt import LGBOptimizer
64+
from train.train_hyperopt_mlflow import LGBOptimizer
65+
elif hyper == "hyperparameterhunter":
66+
# from train.train_hyperparameterhunter import LGBOptimizer
67+
from train.train_hyperparameterhunter_mlfow import LGBOptimizer
6468
LGBOpt = LGBOptimizer(dtrain, MODELS_PATH)
65-
LGBOpt.optimize(maxevals=10)
66-
# LGBOpt = LGBOptimizer(dtrain, str(MODELS_PATH))
67-
# LGBOpt.optimize('f1_score', StratifiedKFold, n_splits=3, maxevals=10)
69+
LGBOpt.optimize(maxevals=2)
70+
6871

6972
if __name__ == '__main__':
70-
# create_folders()
71-
# download_data()
73+
74+
parser = argparse.ArgumentParser()
75+
76+
parser.add_argument("--hyper", type=str, default="hyperopt")
77+
args = parser.parse_args()
78+
create_folders()
79+
download_data()
7280
create_data_processor()
73-
# create_model()
81+
create_model(args.hyper)

predictor.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
MODELS_PATH = PATH/'models'
1414
DATAPROCESSORS_PATH = PATH/'dataprocessors'
1515
MESSAGES_PATH = PATH/'messages'
16-
RETRAIN_EVERY = 20
16+
RETRAIN_EVERY = 25
1717
EXTRA_MODELS_TO_KEEP = 1
1818

1919
column_order = pickle.load(open(DATAPROCESSORS_PATH/'column_order.p', 'rb'))

train/train_hyperopt.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def __init__(self, trainDataset, out_dir):
8888
categorical_feature = self.categorical_columns,
8989
free_raw_data=False)
9090

91-
def optimize(self, maxevals=50, model_id=0):
91+
def optimize(self, maxevals=200, model_id=0):
9292

9393
param_space = self.hyperparameter_space()
9494
objective = self.get_objective(self.lgtrain)

train/train_hyperopt_mlflow.py

+11-16
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def __init__(self, trainDataset, out_dir):
9191
categorical_feature = self.categorical_columns,
9292
free_raw_data=False)
9393

94-
def optimize(self, maxevals=50, model_id=0):
94+
def optimize(self, maxevals=200, model_id=0, reuse_experiment=False):
9595

9696
param_space = self.hyperparameter_space()
9797
objective = self.get_objective(self.lgtrain)
@@ -106,22 +106,17 @@ def optimize(self, maxevals=50, model_id=0):
106106
best['num_leaves'] = int(best['num_leaves'])
107107
best['verbose'] = -1
108108

109-
# The next few lines are the only ones related to mlflow. One
110-
# "annoying" behaviour of mlflow is that when you instantiate a client
111-
# it creates the 'mlruns' dir by default as well as the first
112-
# experiment and there does not seem to be a way I can change this
113-
# behaviour without changing the source code. The solution is the
114-
# following hack:
109+
# The next few lines are the only ones related to mlflow.
115110
if not Path('mlruns').exists():
116-
client = MlflowClient()
117-
else:
118-
client = MlflowClient()
119-
n_experiments = len(client.list_experiments())
120-
experiment_name = 'experiment_' + str(n_experiments)
121-
client.create_experiment(name=experiment_name)
122-
experiments = client.list_experiments()
123-
with mlflow.start_run(experiment_id=experiments[-1].experiment_id) as run:
124-
# with mlflow.start_run() as run:
111+
# here set the tracking_uri. If None then http://localhost:5000
112+
client = MlflowClient()
113+
n_experiments=0
114+
elif not reuse_experiment:
115+
client = MlflowClient()
116+
n_experiments = len(client.list_experiments())
117+
experiment_name = 'experiment_' + str(n_experiments)
118+
client.create_experiment(name=experiment_name)
119+
with mlflow.start_run(experiment_id=n_experiments):
125120
model = lgb.LGBMClassifier(**best)
126121
model.fit(self.lgtrain.data,
127122
self.lgtrain.label,

train/train_hyperparameterhunter.py

+11-15
Original file line numberDiff line numberDiff line change
@@ -36,26 +36,29 @@ def __init__(self, trainDataset, out_dir):
3636
Path to the output directory
3737
"""
3838

39-
self.PATH = out_dir
39+
self.PATH = str(out_dir)
4040
self.data = trainDataset.data
4141
self.data['target'] = trainDataset.target
4242
self.colnames = trainDataset.colnames
4343
self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns
4444

45-
def optimize(self, metrics, cv_type, n_splits, maxevals=200, do_predict_proba=None):
45+
def optimize(self, metrics='f1_score', n_splits=3, cv_type=StratifiedKFold,
46+
maxevals=200, do_predict_proba=None, model_id=0):
4647

4748
params = self.hyperparameter_space()
4849
extra_params = self.extra_setup()
4950

5051
env = Environment(
5152
train_dataset=self.data,
52-
results_path=self.PATH,
53+
results_path='HyperparameterHunterAssets',
54+
# results_path=self.PATH,
5355
metrics=[metrics],
5456
do_predict_proba = do_predict_proba,
5557
cv_type=cv_type,
5658
cv_params=dict(n_splits=n_splits),
5759
)
5860

61+
# optimizer = opt.GradientBoostedRegressionTreeOptimization(iterations=maxevals)
5962
optimizer = opt.BayesianOptimization(iterations=maxevals)
6063
optimizer.set_experiment_guidelines(
6164
model_initializer=lgb.LGBMClassifier,
@@ -66,8 +69,7 @@ def optimize(self, metrics, cv_type, n_splits, maxevals=200, do_predict_proba=No
6669
# there are a few fixes on its way and the next few lines will soon be
6770
# one. At the moment, to access to the best parameters one has to read
6871
# from disc and access them
69-
best_experiment = self.PATH+\
70-
'/HyperparameterHunterAssets/Experiments/Descriptions/'+\
72+
best_experiment = 'HyperparameterHunterAssets/Experiments/Descriptions/'+\
7173
optimizer.best_experiment+'.json'
7274
with open(best_experiment) as best:
7375
best = json.loads(best.read())['hyperparameters']['model_init_params']
@@ -77,10 +79,11 @@ def optimize(self, metrics, cv_type, n_splits, maxevals=200, do_predict_proba=No
7779
feature_name=self.colnames,
7880
categorical_feature=self.categorical_columns
7981
)
80-
pickle.dump(model, open(self.PATH+'/HHmodel.p', 'wb'))
81-
pickle.dump(optimizer, open(self.PATH+'/HHoptimizer.p', 'wb'))
82+
model_fname = 'model_{}_.p'.format(model_id)
83+
best_experiment_fname = 'best_experiment_{}_.p'.format(model_id)
84+
pickle.dump(model, open('/'.join([self.PATH,model_fname]), 'wb'))
85+
pickle.dump(optimizer, open('/'.join([self.PATH,best_experiment_fname]), 'wb'))
8286

83-
return
8487

8588
def hyperparameter_space(self, param_space=None):
8689

@@ -114,10 +117,3 @@ def extra_setup(self, extra_setup=None):
114117
return extra_setup
115118
else:
116119
return extra_params
117-
118-
# if __name__ == '__main__':
119-
120-
# MD_PATH = Path('data/models/')
121-
# dtrain = pickle.load(open(MD_PATH/'preprocessor_0_.p', 'rb'))
122-
# HHOpt = HHOptimizer(dtrain, str(MD_PATH))
123-
# optimizer = HHOpt.optimize('f1_score', StratifiedKFold, n_splits=3, maxevals=3)

train/train_hyperparameterhunter_mlfow.py

+14-22
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,14 @@ def __init__(self, trainDataset, out_dir):
4040
Path to the output directory
4141
"""
4242

43-
self.PATH = out_dir
43+
self.PATH = str(out_dir)
4444
self.data = trainDataset.data
4545
self.data['target'] = trainDataset.target
4646
self.colnames = trainDataset.colnames
4747
self.categorical_columns = trainDataset.categorical_columns + trainDataset.crossed_columns
4848

49-
def optimize(self, metrics, cv_type, n_splits, maxevals=200, do_predict_proba=None):
49+
def optimize(self, metrics='f1_score', n_splits=3, cv_type=StratifiedKFold,
50+
maxevals=200, do_predict_proba=None, model_id=0, reuse_experiment=False):
5051

5152
params = self.hyperparameter_space()
5253
extra_params = self.extra_setup()
@@ -78,20 +79,17 @@ def optimize(self, metrics, cv_type, n_splits, maxevals=200, do_predict_proba=No
7879
with open(best_experiment) as best:
7980
best = json.loads(best.read())['hyperparameters']['model_init_params']
8081

81-
# The next few lines are the only ones related to mlflow. One
82-
# "annoying" behaviour of mlflow is that when you instantiate a client
83-
# it creates the 'mlruns' dir by default as well as the first
84-
# experiment and there does not seem to be a way I can change this
85-
# behaviour without changing the source code. The solution is the
86-
# following hack:
82+
# The next few lines are the only ones related to mlflow
8783
if not Path('mlruns').exists():
84+
# here set the tracking_uri. If None then http://localhost:5000
8885
client = MlflowClient()
89-
else:
86+
n_experiments=0
87+
elif not reuse_experiment:
9088
client = MlflowClient()
9189
n_experiments = len(client.list_experiments())
92-
client.create_experiment(name=str(n_experiments))
93-
experiments = client.list_experiments()
94-
with mlflow.start_run(experiment_id=experiments[-1].experiment_id) as run:
90+
experiment_name = 'experiment_' + str(n_experiments)
91+
client.create_experiment(name=experiment_name)
92+
with mlflow.start_run(experiment_id=n_experiments):
9593
model = lgb.LGBMClassifier(**best)
9694
X, y = self.data.drop('target',axis=1), self.data.target
9795
model.fit(X,y,
@@ -103,10 +101,11 @@ def optimize(self, metrics, cv_type, n_splits, maxevals=200, do_predict_proba=No
103101
mlflow.log_metric('f1_score', -optimizer.optimizer_result.fun)
104102
mlflow.sklearn.log_model(model, "model")
105103

106-
pickle.dump(model, open(self.PATH+'/HHmodel.p', 'wb'))
107-
pickle.dump(optimizer, open(self.PATH+'/HHoptimizer.p', 'wb'))
104+
model_fname = 'model_{}_.p'.format(model_id)
105+
best_experiment_fname = 'best_experiment_{}_.p'.format(model_id)
106+
pickle.dump(model, open('/'.join([self.PATH,model_fname]), 'wb'))
107+
pickle.dump(optimizer, open('/'.join([self.PATH,best_experiment_fname]), 'wb'))
108108

109-
return
110109

111110
def hyperparameter_space(self, param_space=None):
112111

@@ -139,10 +138,3 @@ def extra_setup(self, extra_setup=None):
139138
return extra_setup
140139
else:
141140
return extra_params
142-
143-
# if __name__ == '__main__':
144-
145-
# MD_PATH = Path('data/models/')
146-
# dtrain = pickle.load(open(MD_PATH/'preprocessor_0_.p', 'rb'))
147-
# HHOpt = HHOptimizer(dtrain, str(MD_PATH))
148-
# optimizer = HHOpt.optimize('f1_score', StratifiedKFold, n_splits=3, maxevals=3)

trainer.py

+17-8
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,13 @@
22
import json
33
import pandas as pd
44
import pickle
5+
import argparse
56

67
from pathlib import Path
78
from kafka import KafkaConsumer
89

910
from utils.messages_utils import publish_traininig_completed
1011
from utils.preprocess_data import build_train
11-
from train.train_hyperopt import LGBOptimizer
12-
# uncomment line below to use hyperparameterhunter
13-
# from train.train_hyperparameterhunter import LGBOptimizer
1412

1513

1614
KAFKA_HOST = 'localhost:9092'
@@ -22,15 +20,21 @@
2220
MESSAGES_PATH = PATH/'messages'
2321

2422

25-
def train(model_id, messages):
23+
def train(model_id, messages, hyper):
2624
print("RETRAINING STARTED (model id: {})".format(model_id))
2725
dtrain = build_train(TRAIN_DATA, DATAPROCESSORS_PATH, model_id, messages)
26+
if hyper == "hyperopt":
27+
# from train.train_hyperopt import LGBOptimizer
28+
from train.train_hyperopt_mlflow import LGBOptimizer
29+
elif hyper == "hyperparameterhunter":
30+
# from train.train_hyperparameterhunter import LGBOptimizer
31+
from train.train_hyperparameterhunter_mlfow import LGBOptimizer
2832
LGBOpt = LGBOptimizer(dtrain, MODELS_PATH)
29-
LGBOpt.optimize(maxevals=10, model_id=model_id)
33+
LGBOpt.optimize(maxevals=2, model_id=model_id)
3034
print("RETRAINING COMPLETED (model id: {})".format(model_id))
3135

3236

33-
def start():
37+
def start(hyper):
3438
consumer = KafkaConsumer(RETRAIN_TOPIC, bootstrap_servers=KAFKA_HOST)
3539

3640
for msg in consumer:
@@ -41,9 +45,14 @@ def start():
4145
message_fname = 'messages_{}_.txt'.format(batch_id)
4246
messages = MESSAGES_PATH/message_fname
4347

44-
train(model_id, messages)
48+
train(model_id, messages, hyper)
4549
publish_traininig_completed(model_id)
4650

4751

4852
if __name__ == '__main__':
49-
start()
53+
parser = argparse.ArgumentParser()
54+
55+
parser.add_argument("--hyper", type=str, default="hyperopt")
56+
args = parser.parse_args()
57+
58+
start(args.hyper)

0 commit comments

Comments (0)