-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
91 lines (68 loc) · 3.06 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import argparse
import os
import pandas as pd
import time
import mlflow
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
if __name__ == "__main__":
    """Train a RandomForest no-show classifier on the Doctolib dataset and log to MLflow."""
    print("training model...")

    # Time the whole run so we can report total training duration at the end.
    start_time = time.time()

    # Enable MLflow autologging for sklearn (params, metrics, model artifact).
    mlflow.sklearn.autolog()

    # Hyperparameters are passed in from the shell script that launches this job.
    # type=int performs the conversion at parse time (the original cast with int() later).
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_estimators", type=int, default=1)
    parser.add_argument("--min_samples_split", type=int, default=2)
    args = parser.parse_args()

    # Load the dataset straight from S3.
    df = pd.read_csv("https://full-stack-assets.s3.eu-west-3.amazonaws.com/Deployment/doctolib_simplified_dataset_01.csv")

    # X, y split: skip the first 3 id-like columns; the last column is the
    # "No-show" target, binarized to 0 ("No") / 1 (anything else).
    X = df.iloc[:, 3:-1]
    y = df.iloc[:, -1].apply(lambda label: 0 if label == "No" else 1)

    # Train / test split. random_state makes runs reproducible (the original
    # split was unseeded, so metrics varied between runs).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(df.columns)  # NOTE(review): debug leftover — kept to preserve output

    def date_processing(frame):
        """Replace the two datetime columns with their day difference.

        Parses ScheduledDay / AppointmentDay, engineers the number of days
        between scheduling and the appointment, then drops the raw columns.
        Returns a new DataFrame; the input is not mutated.
        """
        frame = frame.copy()
        frame["ScheduledDay"] = pd.to_datetime(frame["ScheduledDay"], yearfirst=True)
        frame["AppointmentDay"] = pd.to_datetime(frame["AppointmentDay"], yearfirst=True)
        frame["time_difference_between_scheduled_and_appointment"] = (
            frame["AppointmentDay"] - frame["ScheduledDay"]
        ).dt.days
        return frame.drop(["ScheduledDay", "AppointmentDay"], axis=1)

    date_preprocessor = FunctionTransformer(date_processing)

    # Column types must be inferred AFTER date processing so the engineered
    # numeric column is picked up and the dropped datetime columns are not.
    X_train_after_date_processing = date_processing(X_train)
    categorical_features = X_train_after_date_processing.select_dtypes("object").columns
    numerical_features = X_train_after_date_processing.select_dtypes("number").columns

    feature_preprocessor = ColumnTransformer(
        transformers=[
            # NOTE(review): drop='first' will raise on categories unseen at fit
            # time; acceptable here since we only score splits of the same file.
            ("categorical_transformer", OneHotEncoder(drop="first"), categorical_features),
            ("numerical_transformer", StandardScaler(), numerical_features),
        ]
    )

    # Full pipeline: date engineering -> encoding/scaling -> classifier.
    model = Pipeline(steps=[
        ("Dates_preprocessing", date_preprocessor),
        ("features_preprocessing", feature_preprocessor),
        ("Regressor", RandomForestClassifier(
            n_estimators=args.n_estimators,
            min_samples_split=args.min_samples_split,
        )),
    ])

    # Log the experiment to MLflow. Fix over the original: it discarded
    # model.predict(X_train) and never touched the held-out split; we now
    # report accuracy on both train and test sets.
    with mlflow.start_run() as run:
        model.fit(X_train, y_train)
        train_accuracy = model.score(X_train, y_train)
        test_accuracy = model.score(X_test, y_test)
        print(f"Train accuracy: {train_accuracy:.4f}")
        print(f"Test accuracy: {test_accuracy:.4f}")

    print("...Done!")
    print(f"---Total training time: {time.time()-start_time}")