Commit 45274d9

Implementation of gamma forecasting. (#77)

- Implement Gamma regression with two-parameter prediction.
- Add special functions.
1 parent cebfd36 commit 45274d9

20 files changed: +846 -59 lines changed
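A rough usage sketch of the new Gamma objective, assuming the lb.GammaObjective accessors (mean, shape, scale) and the scikit-learn style fit/predict calls exercised in the diffs below; the synthetic data and hyperparameters here are illustrative only, not part of this commit:

    import numpy as np
    import legateboost as lb

    # synthetic, strictly positive, right-skewed targets (illustrative only)
    rng = np.random.default_rng(0)
    X = rng.uniform(size=(1000, 5))
    y = rng.gamma(shape=2.0, scale=np.exp(X[:, 0]))

    obj = lb.GammaObjective()
    model = lb.LBRegressor(objective=obj, n_estimators=50, random_state=0)
    model.fit(X, y)

    pred = model.predict(X)   # two distribution parameters per sample
    mean = obj.mean(pred)     # expected value (shape * scale)
    shape, scale = obj.shape(pred), obj.scale(pred)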

docs/source/api/metrics.rst

+4 -1

@@ -1,5 +1,5 @@
 Metrics
-====================
+=======

 .. autoclass:: legateboost.BaseMetric
    :members:
@@ -16,6 +16,9 @@ Metrics
 .. autoclass:: legateboost.GammaDevianceMetric
    :members:

+.. autoclass:: legateboost.GammaLLMetric
+   :members:
+
 .. autoclass:: legateboost.QuantileMetric
    :members:

docs/source/api/objectives.rst

+3

@@ -13,6 +13,9 @@ Objectives
 .. autoclass:: legateboost.GammaDevianceObjective
    :members:

+.. autoclass:: legateboost.GammaObjective
+   :members:
+
 .. autoclass:: legateboost.QuantileObjective
    :members:

examples/probabalistic_regression/README.md

+1 -1

@@ -8,4 +8,4 @@ Running the example produces a gif animation. This animation shows the boosting

 The shaded area on the left hand figure shows the 95% confidence interval. The right hand example shows different quantile values.

-Notice that the normal distribution is symmetric about the mean, while the data is somewhat skewed. To better fit the data we can also use quantile regression, which is able to model the skewed distribution of the data.
+Notice that the normal distribution is symmetric about the mean, while the data is somewhat skewed. To better fit the data we can also use quantile regression, which is able to model the skewed distribution of the data. A Gamma distribution is another possibility that can fit well for skewed, strictly positive data.
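For reference, a Gamma random variable with shape $k > 0$ and scale $\theta > 0$ satisfies

$$
y \in (0, \infty), \qquad \mathbb{E}[y] = k\theta, \qquad \operatorname{Var}[y] = k\theta^{2}, \qquad \text{skewness} = \frac{2}{\sqrt{k}},
$$

so it is restricted to positive values and right-skewed, matching the house-value data described above.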

examples/probabalistic_regression/probabilistic_regression.py

+64 -14

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import os
 from pathlib import Path

@@ -6,9 +8,11 @@
 import pandas as pd
 import seaborn as sns
 from matplotlib import animation
-from scipy.stats import norm
+from matplotlib.figure import Figure
+from scipy.stats import gamma, norm
 from sklearn.datasets import fetch_california_housing

+import cunumeric as cn
 import legateboost as lb

 sns.set()
@@ -26,23 +30,38 @@
 n_frames = 2 if os.environ.get("CI") else 40


-def fit_normal_distribution():
+def fit_normal_distribution() -> tuple[lb.LBRegressor, list[cn.ndarray]]:
+    obj = lb.NormalObjective()
     model = lb.LBRegressor(
         verbose=True,
         init="average",
         base_models=(lb.models.Tree(max_depth=2),),
         n_estimators=n_estimators,
         learning_rate=0.1,
         random_state=rs,
-        objective="normal",
+        objective=obj,
+    )
+    return model, [model.partial_fit(X, y).predict(X_test) for _ in range(n_frames)]
+
+
+def fit_gamma_distribution() -> tuple[lb.LBRegressor, list[cn.ndarray]]:
+    obj = lb.GammaObjective()
+    model = lb.LBRegressor(
+        verbose=True,
+        init="average",
+        base_models=(lb.models.Tree(max_depth=2),),
+        n_estimators=n_estimators,
+        learning_rate=0.1,
+        random_state=rs,
+        objective=obj,
     )
     return model, [model.partial_fit(X, y).predict(X_test) for _ in range(n_frames)]


 quantiles = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])


-def fit_quantile_regression():
+def fit_quantile_regression() -> tuple[lb.LBRegressor, list]:
     model = lb.LBRegressor(
         verbose=True,
         base_models=(lb.models.Tree(max_depth=2),),
@@ -55,12 +74,15 @@ def fit_quantile_regression():


 normal_model, normal_preds = fit_normal_distribution()
+gamma_mode, gamma_preds = fit_gamma_distribution()
 quantile_model, quantile_preds = fit_quantile_regression()

-fig, ax = plt.subplots(1, 2, figsize=(12, 6))
+fig, ax = plt.subplots(1, 3, figsize=(12, 6))
+

+def animate(i: int) -> tuple[Figure]:
+    lower, upper = -0.5, 6.5

-def animate(i):
     fig.suptitle(
         "Distribution of House Values: Boosting iterations {}".format(
             (i + 1) * n_estimators
@@ -70,12 +92,13 @@ def animate(i):
     # Plot the normal distribution
     ax[0].cla()
     ax[0].set_title("Normal Distribution - 95% Confidence Interval")
+    norm_obj = lb.NormalObjective()
     data = pd.DataFrame(
         {
             feature_name: X_test[:, 0],
             "y": y_test,
-            "Predicted house value": normal_preds[i][:, 0],
-            "sigma": np.exp(normal_preds[i][:, 1]),
+            "Predicted house value": norm_obj.mean(normal_preds[i]),
+            "sigma": norm_obj.var(normal_preds[i]),
         }
     ).sort_values(by=feature_name)
     sns.lineplot(
@@ -89,15 +112,42 @@ def animate(i):
         0.95, loc=data["Predicted house value"], scale=data["sigma"]
     )
     ax[0].fill_between(data[feature_name], interval[0], interval[1], alpha=0.2)
-    ax[0].set_ylim(-0.5, 5.5)
+    ax[0].set_ylim(lower, upper)

     sns.scatterplot(
         x=feature_name, y="y", data=data, ax=ax[0], s=15, color=".2", alpha=0.2
     )

-    # Plot the quantile regression
+    # Plot the gamma distribution
     ax[1].cla()
-    ax[1].set_title("Quantile Regression")
+    ax[1].set_title("Gamma Distribution - 95% Confidence Interval")
+    gamma_obj = lb.GammaObjective()
+    data = pd.DataFrame(
+        {
+            feature_name: X_test[:, 0],
+            "y": y_test,
+            "Predicted house value": gamma_obj.mean(gamma_preds[i]),
+            "shape": gamma_obj.shape(gamma_preds[i]),
+            "scale": gamma_obj.scale(gamma_preds[i]),
+        }
+    ).sort_values(by=feature_name)
+    sns.lineplot(
+        x=feature_name,
+        y="Predicted house value",
+        data=data[[feature_name, "Predicted house value"]],
+        ax=ax[1],
+        errorbar=("sd", 0),
+    )
+    interval = gamma.interval(0.95, data["shape"], scale=data["scale"])
+    ax[1].fill_between(data[feature_name], interval[0], interval[1], alpha=0.2)
+    ax[1].set_ylim(lower, upper)
+    sns.scatterplot(
+        x=feature_name, y="y", data=data, ax=ax[1], s=15, color=".2", alpha=0.2
+    )
+
+    # Plot the quantile regression
+    ax[2].cla()
+    ax[2].set_title("Quantile Regression")

     data = {
         feature_name: X_test[:, 0],
@@ -115,14 +165,14 @@ def animate(i):
         y="Predicted house value",
         data=lines,
         style="quantile",
-        ax=ax[1],
+        ax=ax[2],
         dashes=dashes,
         errorbar=("sd", 0),
     )
-    ax[1].set_ylim(-0.5, 5.5)
+    ax[2].set_ylim(lower, upper)

     sns.scatterplot(
-        x=feature_name, y="y", data=data, ax=ax[1], s=15, color=".2", alpha=0.2
+        x=feature_name, y="y", data=data, ax=ax[2], s=15, color=".2", alpha=0.2
     )

     plt.tight_layout()

legateboost/__init__.py

+13 -15

@@ -1,25 +1,23 @@
-from .legateboost import LBRegressor, LBClassifier
+from .legateboost import LBClassifier, LBRegressor
 from .metrics import (
+    BaseMetric,
+    ExponentialMetric,
+    GammaDevianceMetric,
+    LogLossMetric,
     MSEMetric,
-    NormalLLMetric,
     NormalCRPSMetric,
-    GammaDevianceMetric,
+    NormalLLMetric,
+    GammaLLMetric,
     QuantileMetric,
-    LogLossMetric,
-    ExponentialMetric,
-    BaseMetric,
 )
 from .objectives import (
+    BaseObjective,
+    ExponentialObjective,
+    GammaDevianceObjective,
+    GammaObjective,
     LogLossObjective,
-    SquaredErrorObjective,
     NormalObjective,
-    GammaDevianceObjective,
     QuantileObjective,
-    ExponentialObjective,
-    BaseObjective,
-)
-from .utils import (
-    pick_col_by_idx,
-    set_col_by_idx,
-    mod_col_by_idx,
+    SquaredErrorObjective,
 )
+from .utils import mod_col_by_idx, pick_col_by_idx, set_col_by_idx

legateboost/legateboost.py

+1 -1

@@ -166,7 +166,7 @@ def _get_weighted_gradient(
         point summation.
         """
         # check input dimensions are consistent
-        assert y.ndim == pred.ndim == 2
+        assert y.ndim == pred.ndim == 2, (y.shape, pred.shape)
         g, h = self._objective_instance.gradient(
             y, self._objective_instance.transform(pred)
         )

legateboost/metrics.py

+31 -7

@@ -1,11 +1,11 @@
 from abc import ABC, abstractmethod
-from typing import Tuple
+from typing import Dict, Tuple, Type

-from typing_extensions import Self
+from typing_extensions import Self, override

 import cunumeric as cn

-from .special import erf
+from .special import erf, loggamma
 from .utils import pick_col_by_idx, sample_average, set_col_by_idx


@@ -75,7 +75,7 @@ def name(self) -> str:
         return "mse"


-def check_normal(y: cn.ndarray, pred: cn.ndarray) -> Tuple[cn.ndarray, cn.ndarray]:
+def check_dist_param(y: cn.ndarray, pred: cn.ndarray) -> Tuple[cn.ndarray, cn.ndarray]:
     """Checks for normal distribution inputs."""
     if y.size * 2 != pred.size:
         raise ValueError("Expected pred to contain mean and sd for each y_i")
@@ -98,7 +98,7 @@ class NormalLLMetric(BaseMetric):
     """ # noqa: E501

     def metric(self, y: cn.ndarray, pred: cn.ndarray, w: cn.ndarray) -> float:
-        y, pred = check_normal(y, pred)
+        y, pred = check_dist_param(y, pred)
         w_sum = w.sum()
         if w_sum == 0:
             return 0
@@ -119,6 +119,29 @@ def name(self) -> str:
         return "normal_neg_ll"


+class GammaLLMetric(BaseMetric):
+    """The mean negative log likelihood of the labels, given parameters
+    predicted by the model."""
+
+    @override
+    def metric(self, y: cn.ndarray, pred: cn.ndarray, w: cn.ndarray) -> float:
+        y, pred = check_dist_param(y, pred)
+
+        w_sum = w.sum()
+        if w_sum == 0:
+            return 0
+
+        k = pred[:, :, 0]
+        b = pred[:, :, 1]
+        error = -(k - 1) * cn.log(y) + y / (b + 1e-6) + k * cn.log(b) + loggamma(k)
+
+        return float(sample_average(error, w))
+
+    @override
+    def name(self) -> str:
+        return "gamma_neg_ll"
+
+
 def norm_cdf(x: cn.ndarray) -> cn.ndarray:
     """CDF function for standard normal distribution."""
     return 0.5 * (1.0 + erf(x / cn.sqrt(2.0)))
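The error expression in GammaLLMetric.metric above is the per-sample negative log likelihood of a Gamma distribution with shape k and scale b (the 1e-6 term only guards against division by zero):

$$
f(y \mid k, b) = \frac{y^{k-1} e^{-y/b}}{\Gamma(k)\, b^{k}}
\quad\Longrightarrow\quad
-\log f(y \mid k, b) = -(k-1)\log y + \frac{y}{b} + k\log b + \log\Gamma(k),
$$

which sample_average then averages over the weighted samples to give the reported gamma_neg_ll value.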
@@ -140,7 +163,7 @@ class NormalCRPSMetric(BaseMetric):
     """

     def metric(self, y: cn.ndarray, pred: cn.ndarray, w: cn.ndarray) -> float:
-        y, pred = check_normal(y, pred)
+        y, pred = check_dist_param(y, pred)
         loc = pred[:, :, 0]
         # `NormalObjective` outputs variance instead of scale.
         scale = cn.sqrt(pred[:, :, 1])
@@ -287,11 +310,12 @@ def name(self) -> str:
         return "exp"


-metrics = {
+metrics: Dict[str, Type[BaseMetric]] = {
     "log_loss": LogLossMetric,
     "mse": MSEMetric,
     "exp": ExponentialMetric,
     "normal_neg_ll": NormalLLMetric,
+    "gamma_neg_ll": GammaLLMetric,
     "normal_crps": NormalCRPSMetric,
     "deviance_gamma": GammaDevianceMetric,
 }
