training lightgbm [baseline model]
ronylpatil committed Oct 5, 2024
1 parent e281db2 commit 122754c
Showing 4 changed files with 92 additions and 9 deletions.
14 changes: 7 additions & 7 deletions dvc.lock
@@ -94,8 +94,8 @@ stages:
      size: 1932798
    - path: ./src/models/train_model.py
      hash: md5
      md5: 90f6f48a40e04b569de4dfe36fc48c71
      size: 16665
      md5: b082ca07b5418b81dee5bd4293f88c85
      size: 20270
    params:
      params.yaml:
        base.target: Time_taken(min)
@@ -120,8 +120,8 @@ stages:
          max_features: 0.7
          max_leaf_nodes: 50
          n_iter_no_change: 13
        train_model.model_name: model_xgb
        train_model.model_to_train: xgb
        train_model.model_name: model_lgbm
        train_model.model_to_train: lightgbm
        train_model.random_forest:
          n_estimators: 100
          criterion: squared_error
@@ -141,10 +141,10 @@ stages:
          lambda: 1
          alpha: 0
    outs:
    - path: ./models/model_xgb.joblib
    - path: ./models/model_lgbm.joblib
      hash: md5
      md5: 5b3624e0f22c4ef3ad891b53e5aac0fc
      size: 480723
      md5: 7838430e1973c7ca9572297fef4387d4
      size: 287673
  tune_model:
    cmd: python ./src/models/tune_model.py
    deps:
1 change: 1 addition & 0 deletions models/.gitignore
@@ -1,3 +1,4 @@
/model_rf.joblib
/xgb_tunned.joblib
/model_xgb.joblib
/model_lgbm.joblib
4 changes: 2 additions & 2 deletions params.yaml
@@ -30,8 +30,8 @@ mlflow:

train_model:
  # {decision_tree: 'dt', random_forest: 'rf', gradient boost: 'gb', xgboost: 'xgb', lightgbm: 'lightgbm', all models: 'all'}
  model_to_train: xgb
  model_name: model_xgb # IMP PLEASE CHANGE NAME
  model_to_train: lightgbm
  model_name: model_lgbm # IMP PLEASE CHANGE NAME

  decision_tree:
    criterion: squared_error
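For orientation, a minimal sketch of how train_model.py might consume this block, assuming params.yaml is read with PyYAML and the script branches on model_to_train (the actual loading code is not part of this commit and may differ, e.g. dvc.api):

import yaml  # assumption: PyYAML is used to read params.yaml

with open("params.yaml") as f:
    params = yaml.safe_load(f)["train_model"]

model_to_train = params["model_to_train"]  # "lightgbm" after this commit
model_name = params["model_name"]          # "model_lgbm", used for the .joblib filename

if model_to_train == "lightgbm":
    print(f"training LightGBM, output will be saved as {model_name}.joblib")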
82 changes: 82 additions & 0 deletions src/models/train_model.py
@@ -13,6 +13,7 @@
import inspect
import pathlib
import pandas as pd
import lightgbm as lgb
from datetime import datetime
from sklearn.base import BaseEstimator
from xgboost import XGBRegressor # type: ignore
@@ -350,6 +351,87 @@ def train_model(
            model=model_xgb, model_dir=model_dir, model_name=params["model_name"]
        )

    if model_to_train == "lightgbm":

        try:
            model_lgbm = lgb.LGBMRegressor() # ___change___
            model_lgbm.fit(x_train, y_train)
            infologger.info("[STEP-1] LightGBM fitted successfully.") # ___change___
        except Exception as e:
            infologger.error( # ___change___
                f"Failed to initialize LightGBM. [Error: {e}]. \n[File: {pathlib.Path(__file__)}]\n[Method: {inspect.currentframe().f_code.co_name}]"
            )
            raise  # re-raise so training does not continue with an undefined model

        y_pred = model_lgbm.predict(x_test)
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2_ = r2_score(y_test, y_pred)
        adj_r2 = adj_r2_score(r2_, x_train.shape[0], x_train.shape[1])

        # setting up the MLflow experiment
        mlflow.set_experiment("DTE [LGBM]") # ___change___
        experiment_description = "Training lightgbm regressor." # adding experiment description # ___change___
        mlflow.set_experiment_tag("mlflow.note.content", experiment_description)

        with mlflow.start_run(description="Training LightGBM - ronil"): # ___change___
            # mlflow.log_params(params["xgb"])
            mlflow.log_metrics(
                {
                    "mae": round(mae, 3),
                    "mse": round(mse, 3),
                    "r2_score": round(r2_, 3),
                    "adj_r2": round(adj_r2, 3),
                }
            )

            curr_time = datetime.now().strftime("%d%m%y-%H%M%S")
            file_name = f"{home_dir}/figures/{curr_time}.png"
            residual_plot(y_test=y_test, y_pred=y_pred, path=file_name)
            mlflow.log_artifact(file_name, "residual plot")
            mlflow.log_artifact(__file__)  # logging code with mlflow

            # custom model signature
            input_schema = Schema(
                [
                    ColSpec("integer", "Age"),
                    ColSpec("float", "Ratings"),
                    ColSpec("integer", "Weatherconditions"),
                    ColSpec("integer", "Road_traffic_density"),
                    ColSpec("integer", "Vehicle_condition"),
                    ColSpec("integer", "Type_of_order"),
                    ColSpec("integer", "Type_of_vehicle"),
                    ColSpec("integer", "multiple_deliveries"),
                    ColSpec("integer", "Festival"),
                    ColSpec("integer", "City"),
                    ColSpec("float", "haversine_dist"),
                    ColSpec("float", "estm_time"),
                    ColSpec("float", "time_lag"),
                    ColSpec("float", "hour"),
                    ColSpec("integer", "day"),
                    ColSpec("integer", "is_weekend"),
                    ColSpec("integer", "is_rush"),
                ]
            )

            # Define a custom output schema
            output_schema = Schema([ColSpec("float", "Time_taken(min)")])

            # Create a signature
            signature = ModelSignature(inputs=input_schema, outputs=output_schema)

            mlflow.sklearn.log_model(
                model_lgbm, "lightgbm", signature=signature
            ) # ___change___
            mlflow.set_tag("developer", "ronil")
            mlflow.set_tag("model", "lightgbm") # ___change___
            infologger.info("[STEP-2] Experiment tracked successfully.")

        save_model(
            model=model_lgbm,
            model_dir=model_dir,
            model_name=params["model_name"], # ___change___
        )


def save_model(model: BaseEstimator, model_dir: str, model_name: str) -> None:
    try:
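The helper adj_r2_score and the remainder of save_model are outside the hunk shown above. A minimal sketch of what they plausibly look like, assuming the standard adjusted-R² formula and joblib serialization (consistent with the .joblib outputs tracked in dvc.lock, but not confirmed by this diff):

import pathlib
import joblib
from sklearn.base import BaseEstimator

def adj_r2_score(r2: float, n_samples: int, n_features: int) -> float:
    # standard adjusted R^2: penalizes R^2 for the number of predictors
    return 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)

def save_model(model: BaseEstimator, model_dir: str, model_name: str) -> None:
    # assumption: models are persisted with joblib, matching models/model_lgbm.joblib above
    joblib.dump(model, pathlib.Path(model_dir) / f"{model_name}.joblib")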

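Once a run is tracked, the logged LightGBM model can be pulled back from MLflow for a quick sanity check. A usage sketch, with the run ID as a hypothetical placeholder taken from the MLflow UI:

import mlflow.sklearn

run_id = "<run-id-from-the-mlflow-ui>"  # hypothetical placeholder, not a real run
model = mlflow.sklearn.load_model(f"runs:/{run_id}/lightgbm")  # "lightgbm" is the artifact path used in log_model above
print(model)  # model.predict expects a DataFrame with the 17 columns declared in the input schema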