Implement sample weight #324

Open

wants to merge 25 commits into base: master

Changes from 1 commit

Commits (25):
0bfe5f3
add manifest to simplify non-develop install
peterfoley Jan 23, 2019
87423f8
record the number of iterations and convergence status
peterfoley Mar 1, 2019
5ba9fab
test glmnet with nonzero reg_lambda, alpha
peterfoley Mar 5, 2019
34be299
recalculate z every iteration in GLM._cdfast
peterfoley Mar 5, 2019
2350c7e
flake8 fixes in test
peterfoley Mar 5, 2019
48d6759
don't cache z outside _cdfast
peterfoley Mar 6, 2019
42315f8
remove MANIFEST.in so it can be created properly in a later PR
peterfoley Mar 6, 2019
d80b11f
remove a dangling creation of z cache
peterfoley Mar 6, 2019
92238b2
resolved remaining flake8 issues by disabling checks
peterfoley Mar 6, 2019
2c06fbb
resolve flake8 indentation error
peterfoley Mar 6, 2019
a9209f5
update test_cdfast to remove z from _cdfast interface
peterfoley Mar 6, 2019
2674333
fail test_glmnet based on loss increase runlength
peterfoley Mar 6, 2019
5175d24
mkl dylibs are unavailable on travis
peterfoley Mar 6, 2019
2e29239
add a test that uses sample_weight parameter
peterfoley Dec 13, 2018
192d938
implement sample weights
peterfoley Dec 13, 2018
bc9b9df
update cheatsheet with weighted loss and grad/hess calculations
peterfoley Dec 14, 2018
832452b
typo fixes and formatting cleanup to reduce flake8 warnings/errors
peterfoley Jan 4, 2019
d26abe9
have setuptools build package list
peterfoley Jan 23, 2019
4e97389
remove math.inf for python 2.7 compatibility
peterfoley Mar 7, 2019
6c250a0
merging cdfast convergence fixes
peterfoley Mar 7, 2019
217dc5f
flake8 fixes
peterfoley Mar 7, 2019
cc0bd9c
resolve indentation flake8 errors
peterfoley Mar 7, 2019
8a6dec4
logger.warn is deprecated in favor of logger.warning
peterfoley Mar 7, 2019
e8f038a
use scipy.special.comb instead of removed scipy.misc.comb
May 17, 2019
8255617
Merge pull request #1 from peterfoley605/fix_scipy
May 17, 2019
implement sample weights
ensure _logL calls use z argument by name
peterfoley committed Mar 7, 2019

commit 192d938bde27396b4da21854209a15fdc673862a
30 changes: 18 additions & 12 deletions pyglmnet/metrics.py
@@ -3,8 +3,7 @@
import numpy as np
from .pyglmnet import _logL


-def deviance(y, yhat, distr):
+def deviance(y, yhat, sample_weight, distr):
"""Deviance metrics.
Parameters
@@ -15,6 +14,9 @@ def deviance(y, yhat, distr):
yhat : array
Predicted labels of shape (n_samples, )
+sample_weight : array
+Sample weights of shape (n_samples, )
distr: str
distribution
@@ -24,16 +26,15 @@ def deviance(y, yhat, distr):
Deviance of the predicted labels.
"""
if distr in ['softplus', 'poisson']:
-LS = _logL(distr, y, y)
+LS = _logL(distr, y, y, w=sample_weight)
else:
LS = 0

-L1 = _logL(distr, y, yhat)
+L1 = _logL(distr, y, yhat, w=sample_weight)
score = -2 * (L1 - LS)
return score


-def pseudo_R2(X, y, yhat, ynull_, distr):
+def pseudo_R2(X, y, yhat, ynull_, sample_weight, distr):
"""Pseudo-R2 metric.
Parameters
@@ -47,6 +48,9 @@ def pseudo_R2(X, y, yhat, ynull_, distr):
ynull_ : float
Mean of the target labels (null model prediction)
+sample_weight : array
+Sample weights of shape (n_samples, )
distr: str
distribution
@@ -56,21 +60,20 @@ def pseudo_R2(X, y, yhat, ynull_, distr):
Pseudo-R2 score.
"""
if distr in ['softplus', 'poisson']:
-LS = _logL(distr, y, y)
+LS = _logL(distr, y, y, w=sample_weight)
else:
LS = 0

-L0 = _logL(distr, y, ynull_)
-L1 = _logL(distr, y, yhat)
+L0 = _logL(distr, y, ynull_, w=sample_weight)
+L1 = _logL(distr, y, yhat, w=sample_weight)

if distr in ['softplus', 'poisson']:
score = (1 - (LS - L1) / (LS - L0))
else:
score = (1 - L1 / L0)
return score


-def accuracy(y, yhat):
+def accuracy(y, yhat, sample_weight):
"""Accuracy as ratio of correct predictions.
Parameters
@@ -81,9 +84,12 @@ def accuracy(y, yhat):
yhat : array
Predicted labels of shape (n_samples, )
+sample_weight : array
+Sample weights of shape (n_samples, )
Returns
-------
accuracy : float
Accuracy score.
"""
-return float(np.sum(y == yhat)) / yhat.shape[0]
+return float(np.dot(sample_weight, (y == yhat))) / np.sum(sample_weight)
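
The weighted accuracy above reduces to the plain ratio of correct predictions when all weights are equal. A minimal standalone sketch (toy data invented here, not from the PR):

    import numpy as np

    y = np.array([1, 0, 1, 1])
    yhat = np.array([1, 0, 0, 1])

    # uniform weights: identical to plain accuracy, 3/4
    w = np.ones_like(y, dtype=float)
    print(np.dot(w, y == yhat) / np.sum(w))   # 0.75

    # upweighting the misclassified third sample pulls the score down
    w = np.array([1.0, 1.0, 3.0, 1.0])
    print(np.dot(w, y == yhat) / np.sum(w))   # 0.5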
141 changes: 78 additions & 63 deletions pyglmnet/pyglmnet.py
@@ -112,33 +112,33 @@ def _grad_mu(distr, z, eta):
return grad_mu


-def _logL(distr, y, y_hat, z=None):
+def _logL(distr, y, y_hat, w, z=None):
"""The log likelihood."""
if distr in ['softplus', 'poisson']:
eps = np.spacing(1)
-logL = np.sum(y * np.log(y_hat + eps) - y_hat)
+logL = np.dot(w, y * np.log(y_hat + eps) - y_hat)
elif distr == 'gaussian':
-logL = -0.5 * np.sum((y - y_hat)**2)
+logL = -0.5 * np.dot(w, (y - y_hat)**2)
elif distr == 'binomial':

# prevents underflow
if z is not None:
-logL = np.sum(y * z - np.log(1 + np.exp(z)))
+logL = np.dot(w, y * z - np.log(1 + np.exp(z)))
# for scoring
else:
-logL = np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
+logL = np.dot(w, y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
elif distr == 'probit':
if z is not None:
pdfz, cdfz = norm.pdf(z), norm.cdf(z)
-logL = np.sum(y * _probit_g1(z, pdfz, cdfz) +
+logL = np.dot(w, y * _probit_g1(z, pdfz, cdfz) +
(1 - y) * _probit_g2(z, pdfz, cdfz))
else:
-logL = np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
+logL = np.dot(w, y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
elif distr == 'gamma':
# see
# https://www.statistics.ma.tum.de/fileadmin/w00bdb/www/czado/lec8.pdf
nu = 1. # shape parameter, exponential for now
-logL = np.sum(nu * (-y / y_hat - np.log(y_hat)))
+logL = np.dot(w, nu * (-y / y_hat - np.log(y_hat)))
return logL
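
Every branch of _logL swaps np.sum for np.dot with the weight vector w, so the unweighted log likelihood is recovered exactly when w is all ones. A quick sanity check for the gaussian branch (toy data invented here):

    import numpy as np

    rng = np.random.RandomState(0)
    y = rng.normal(size=5)
    y_hat = rng.normal(size=5)
    w = np.ones_like(y)

    # weighted form collapses to the original unweighted sum when w == 1
    assert np.isclose(-0.5 * np.dot(w, (y - y_hat) ** 2),
                      -0.5 * np.sum((y - y_hat) ** 2))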


@@ -183,30 +183,28 @@ def _L1penalty(beta, group=None):
L1penalty += np.linalg.norm(beta[group == 0], 1)
return L1penalty


-def _loss(distr, alpha, Tau, reg_lambda, X, y, eta, group, beta):
+def _loss(distr, alpha, Tau, reg_lambda, X, y, w, eta, group, beta):
"""Define the objective function for elastic net."""
n_samples = X.shape[0]
z = beta[0] + np.dot(X, beta[1:])
y_hat = _mu(distr, z, eta)
-L = 1. / n_samples * _logL(distr, y, y_hat, z)
+L = 1. / n_samples * _logL(distr, y, y_hat, w=w, z=z)
P = _penalty(alpha, beta[1:], Tau, group)
J = -L + reg_lambda * P
return J


-def _L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, group, beta):
+def _L2loss(distr, alpha, Tau, reg_lambda, X, y, w, eta, group, beta):
"""Define the objective function for elastic net."""
n_samples = X.shape[0]
z = beta[0] + np.dot(X, beta[1:])
y_hat = _mu(distr, z, eta)
-L = 1. / n_samples * _logL(distr, y, y_hat, z)
+L = 1. / n_samples * _logL(distr, y, y_hat, w=w, z=z)
P = 0.5 * (1 - alpha) * _L2penalty(beta[1:], Tau)
J = -L + reg_lambda * P
return J
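
Note that _loss and _L2loss still divide the log likelihood by n_samples rather than by sum(w). This stays consistent because fit() (further down) rescales the weights to have mean one, so sum(w) equals n_samples and 1/n_samples times dot(w, ...) is a true weighted mean. A sketch of that invariant:

    import numpy as np

    w_raw = np.array([2.0, 5.0, 1.0, 4.0])
    w = w_raw / np.mean(w_raw)   # the normalization applied in fit()

    n_samples = len(w)
    assert np.isclose(np.sum(w), n_samples)
    # hence (1. / n_samples) * np.dot(w, per_sample_logL) is a weighted mean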


-def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, beta):
+def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, w, eta, beta):
"""The gradient."""
n_samples = np.float(X.shape[0])

@@ -219,29 +217,29 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, beta):
grad_mu = _grad_mu(distr, z, eta)

if distr in ['poisson', 'softplus']:
-grad_beta0 = np.sum(grad_mu) - np.sum(y * grad_mu / mu)
-grad_beta = ((np.dot(grad_mu.T, X) -
-              np.dot((y * grad_mu / mu).T, X)).T)
+grad_beta0 = np.sum(w * grad_mu) - np.sum(w * y * grad_mu / mu)
+grad_beta = ((np.dot(w * grad_mu.T, X) -
+              np.dot((w * y * grad_mu / mu).T, X)).T)

elif distr == 'gaussian':
-grad_beta0 = np.sum((mu - y) * grad_mu)
-grad_beta = np.dot((mu - y).T, X * grad_mu[:, None]).T
+grad_beta0 = np.sum(w * (mu - y) * grad_mu)
+grad_beta = np.dot((w * (mu - y)).T, X * grad_mu[:, None]).T

elif distr == 'binomial':
-grad_beta0 = np.sum(mu - y)
-grad_beta = np.dot((mu - y).T, X).T
+grad_beta0 = np.sum(w * (mu - y))
+grad_beta = np.dot((w * (mu - y)).T, X).T

elif distr == 'probit':
grad_logl = (y * _probit_g3(z, grad_mu, mu) -
(1 - y) * _probit_g4(z, grad_mu, mu))
-grad_beta0 = -np.sum(grad_logl)
-grad_beta = -np.dot(grad_logl.T, X).T
+grad_beta0 = -np.sum(w * grad_logl)
+grad_beta = -np.dot((w * grad_logl).T, X).T

elif distr == 'gamma':
nu = 1.
grad_logl = (y / mu ** 2 - 1 / mu) * grad_mu
-grad_beta0 = -nu * np.sum(grad_logl)
-grad_beta = -nu * np.dot(grad_logl.T, X).T
+grad_beta0 = -nu * np.sum(w * grad_logl)
+grad_beta = -nu * np.dot((w * grad_logl).T, X).T

grad_beta0 *= 1. / n_samples
grad_beta *= 1. / n_samples
@@ -252,8 +250,7 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, beta):
g[1:] = grad_beta
return g


-def _gradhess_logloss_1d(distr, xk, y, z, eta):
+def _gradhess_logloss_1d(distr, xk, y, w, z, eta):
"""
Compute gradient (1st derivative)
and Hessian (2nd derivative)
@@ -280,41 +277,41 @@ def _gradhess_logloss_1d(distr, xk, y, z, eta):
if distr == 'softplus':
mu = _mu(distr, z, eta)
s = expit(z)
-gk = np.sum(s * xk) - np.sum(y * s / mu * xk)
+gk = np.sum(w * s * xk) - np.sum(w * y * s / mu * xk)

grad_s = s * (1 - s)
grad_s_by_mu = grad_s / mu - s / (mu ** 2)
-hk = np.sum(grad_s * xk ** 2) - np.sum(y * grad_s_by_mu * xk ** 2)
+hk = np.sum(w * grad_s * xk ** 2) - np.sum(w * y * grad_s_by_mu * xk ** 2)

elif distr == 'poisson':
mu = _mu(distr, z, eta)
s = expit(z)
-gk = np.sum((mu[z <= eta] - y[z <= eta]) *
+gk = np.sum(w[z <= eta] * (mu[z <= eta] - y[z <= eta]) *
xk[z <= eta]) + \
np.exp(eta) * \
-np.sum((1 - y[z > eta] / mu[z > eta]) *
+np.sum(w[z > eta] * (1 - y[z > eta] / mu[z > eta]) *
xk[z > eta])
-hk = np.sum(mu[z <= eta] * xk[z <= eta] ** 2) + \
+hk = np.sum(w[z <= eta] * mu[z <= eta] * xk[z <= eta] ** 2) + \
np.exp(eta) ** 2 * \
-np.sum(y[z > eta] / (mu[z > eta] ** 2) *
+np.sum(w[z > eta] * y[z > eta] / (mu[z > eta] ** 2) *
(xk[z > eta] ** 2))

elif distr == 'gaussian':
-gk = np.sum((z - y) * xk)
-hk = np.sum(xk * xk)
+gk = np.sum(w * (z - y) * xk)
+hk = np.sum(w * xk * xk)

elif distr == 'binomial':
mu = _mu(distr, z, eta)
-gk = np.sum((mu - y) * xk)
-hk = np.sum(mu * (1.0 - mu) * xk * xk)
+gk = np.sum(w * (mu - y) * xk)
+hk = np.sum(w * mu * (1.0 - mu) * xk * xk)

elif distr == 'probit':
pdfz = norm.pdf(z)
cdfz = norm.cdf(z)
-gk = -np.sum((y * _probit_g3(z, pdfz, cdfz) -
-             (1 - y) * _probit_g4(z, pdfz, cdfz)) * xk)
-hk = np.sum((y * _probit_g5(z, pdfz, cdfz) +
-            (1 - y) * _probit_g6(z, pdfz, cdfz)) * (xk * xk))
+gk = -np.sum(w * (y * _probit_g3(z, pdfz, cdfz) -
+             (1 - y) * _probit_g4(z, pdfz, cdfz)) * xk)
+hk = np.sum(w * (y * _probit_g5(z, pdfz, cdfz) +
+            (1 - y) * _probit_g6(z, pdfz, cdfz)) * (xk * xk))

elif distr == 'gamma':
raise NotImplementedError('cdfast is not implemented for Gamma '
@@ -598,7 +595,7 @@ def _prox(self, beta, thresh):

return result

-def _cdfast(self, X, y, z, ActiveSet, beta, rl):
+def _cdfast(self, X, y, w, z, ActiveSet, beta, rl):
"""
Perform one cycle of Newton updates for all coordinates.
@@ -610,6 +607,9 @@ def _cdfast(self, X, y, z, ActiveSet, beta, rl):
y : array
Labels to the data
n_samples x 1
+w : array
+Weights to the data
+n_samples x 1
z: array
n_samples x 1
beta[0] + X * beta[1:]
@@ -644,7 +644,7 @@ def _cdfast(self, X, y, z, ActiveSet, beta, rl):
xk = np.ones((n_samples, ))

# Calculate grad and hess of log likelihood term
-gk, hk = _gradhess_logloss_1d(self.distr, xk, y, z, self.eta)
+gk, hk = _gradhess_logloss_1d(self.distr, xk, y, w, z, self.eta)

# Add grad and hess of regularization term
if self.Tau is None:
@@ -662,7 +662,7 @@ def _cdfast(self, X, y, z, ActiveSet, beta, rl):
beta[k], z = beta[k] - update, z - update * xk
return beta, z

-def fit(self, X, y):
+def fit(self, X, y, sample_weight=None):
"""The fit function.
Parameters
@@ -697,6 +697,13 @@ def fit(self, X, y):

n_features = X.shape[1]

+# normalize weights so that X.shape[0] equals sum(sample_weight),
+# as assumed by the weighted-sum calculations in the loss functions
+if sample_weight is None:
+    sample_weight = np.ones_like(y)
+else:
+    sample_weight /= np.mean(sample_weight)

# Initialize parameters
beta = np.zeros((n_features + 1,))
if self.beta0_ is None and self.beta_ is None:
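
One caveat with the normalization block above: sample_weight /= np.mean(sample_weight) mutates the caller's array in place and raises on integer dtypes. A defensive variant (an assumption on my part, not what this PR does) would copy and cast first:

    import numpy as np

    def normalize_weights(y, sample_weight=None):
        """Return weights rescaled to mean 1; uniform weights by default."""
        if sample_weight is None:
            return np.ones_like(y, dtype=float)
        sample_weight = np.array(sample_weight, dtype=float)  # always copies
        return sample_weight / np.mean(sample_weight)

    w = normalize_weights(np.zeros(4), [1, 2, 3, 2])
    assert np.isclose(np.sum(w), 4)   # sum(w) == n_samples after rescaling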
@@ -725,7 +732,7 @@ def fit(self, X, y):
if self.solver == 'batch-gradient':
grad = _grad_L2loss(self.distr,
alpha, self.Tau,
-reg_lambda, X, y, self.eta,
+reg_lambda, X, y, sample_weight, self.eta,
beta)
# Converged if the norm(gradient) < tol
convergence_metric = np.linalg.norm(grad)
@@ -742,7 +749,7 @@ def fit(self, X, y):
elif self.solver == 'cdfast':
beta_old = deepcopy(beta)
beta, z = \
-self._cdfast(X, y, z, ActiveSet, beta, reg_lambda)
+self._cdfast(X, y, sample_weight, z, ActiveSet, beta, reg_lambda)
# Converged if the norm(update) < tol
convergence_metric = np.linalg.norm(beta - beta_old)
logger.info("convergence_metric: %f" % convergence_metric)
@@ -778,7 +785,7 @@ def fit(self, X, y):
# Update the estimated variables
self.beta0_ = beta[0]
self.beta_ = beta[1:]
-self.ynull_ = np.mean(y)
+self.ynull_ = np.sum(sample_weight * y) / np.sum(sample_weight)
return self

def predict(self, X):
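
The weighted null prediction above is simply the weighted mean of y; np.average computes the same quantity, which makes a handy cross-check (toy data invented here):

    import numpy as np

    y = np.array([0.0, 1.0, 1.0, 3.0])
    w = np.array([1.0, 2.0, 1.0, 4.0])
    assert np.isclose(np.sum(w * y) / np.sum(w), np.average(y, weights=w))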
@@ -838,7 +845,7 @@ def predict_proba(self, X):
yhat = np.asarray(yhat)
return yhat

-def fit_predict(self, X, y):
+def fit_predict(self, X, y, sample_weight):
"""Fit the model and predict on the same data.
Parameters
@@ -853,9 +860,9 @@ def fit_predict(self, X, y):
yhat : array
The predicted targets of shape (n_samples,).
"""
-return self.fit(X, y).predict(X)
+return self.fit(X, y, sample_weight).predict(X)

-def score(self, X, y):
+def score(self, X, y, sample_weight=None):
"""Score the model.
Parameters
@@ -890,6 +897,10 @@ def score(self, X, y):
'or multinomial distributions')

y = y.ravel()
+if sample_weight is None:
+    sample_weight = np.ones_like(y)
+else:
+    sample_weight /= np.mean(sample_weight)

if self.distr == 'binomial' and self.score_metric != 'accuracy':
yhat = self.predict_proba(X)
@@ -898,11 +909,11 @@ def score(self, X, y):

# Check whether we have a list of estimators or a single estimator
if self.score_metric == 'deviance':
-return metrics.deviance(y, yhat, self.distr)
+return metrics.deviance(y, yhat, sample_weight, self.distr)
elif self.score_metric == 'pseudo_R2':
-return metrics.pseudo_R2(X, y, yhat, self.ynull_, self.distr)
+return metrics.pseudo_R2(X, y, yhat, self.ynull_, sample_weight, self.distr)
if self.score_metric == 'accuracy':
-return metrics.accuracy(y, yhat)
+return metrics.accuracy(y, yhat, sample_weight)
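
Taken together, GLM.fit and GLM.score now accept an optional sample_weight. A usage sketch against this branch (toy data and parameter values invented here):

    import numpy as np
    from pyglmnet import GLM

    rng = np.random.RandomState(42)
    X = rng.normal(size=(100, 5))
    beta = np.array([0.2, 0.0, -0.1, 0.3, 0.0])
    y = rng.poisson(lam=np.exp(X.dot(beta)))
    w = rng.uniform(0.5, 2.0, size=100)   # per-sample weights

    glm = GLM(distr='poisson', score_metric='pseudo_R2')
    glm.fit(X, y, sample_weight=w)
    print(glm.score(X, y, sample_weight=w))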


class GLMCV(object):
@@ -1077,7 +1088,7 @@ def copy(self):
"""
return deepcopy(self)

def fit(self, X, y):
def fit(self, X, y, sample_weight = None):
"""The fit function.
Parameters
----------
@@ -1094,7 +1105,11 @@ def fit(self, X, y):
"""
logger.info('Looping through the regularization path')
glms, scores = list(), list()
-self.ynull_ = np.mean(y)
+if sample_weight is None:
+    sample_weight = np.ones_like(y)
+else:
+    sample_weight /= np.mean(sample_weight)
+self.ynull_ = np.sum(sample_weight * y) / np.sum(sample_weight)

if not type(int):
raise ValueError('cv must be int. We do not support scikit-learn '
@@ -1129,15 +1144,15 @@ def fit(self, X, y):
else:
glm.beta0_, glm.beta_ = glms[-1].beta0_, glms[-1].beta_

-glm.fit(X[train], y[train])
-scores_fold.append(glm.score(X[val], y[val]))
+glm.fit(X[train], y[train], sample_weight[train])
+scores_fold.append(glm.score(X[val], y[val], sample_weight[val]))
scores.append(np.mean(scores_fold))

if idx == 0:
glm.beta0_, glm.beta_ = self.beta0_, self.beta_
else:
glm.beta0_, glm.beta_ = glms[-1].beta0_, glms[-1].beta_
-glm.fit(X, y)
+glm.fit(X, y, sample_weight)
glms.append(glm)
# Update the estimated variables
if self.score_metric == 'deviance':
@@ -1187,7 +1202,7 @@ def predict_proba(self, X):
"""
return self.glm_.predict_proba(X)

def fit_predict(self, X, y):
def fit_predict(self, X, y, sample_weight = None):
"""Fit the model and predict on the same data.
Parameters
@@ -1202,10 +1217,10 @@ def fit_predict(self, X, y):
The predicted targets of shape based on the model with optimal
reg_lambda (n_samples,)
"""
-self.fit(X, y)
+self.fit(X, y, sample_weight)
return self.glm_.predict(X)

-def score(self, X, y):
+def score(self, X, y, sample_weight=None):
"""Score the model.
Parameters
@@ -1222,4 +1237,4 @@ def score(self, X, y):
score: float
The score metric for the optimal reg_lambda
"""
-return self.glm_.score(X, y)
+return self.glm_.score(X, y, sample_weight)
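
In the cross-validation loop, the weight vector is sliced with the same train/val indices as X and y, so every fold is fit and scored under its own weights, and the final refit uses the full vector. A usage sketch (toy data invented here):

    import numpy as np
    from pyglmnet import GLMCV

    rng = np.random.RandomState(0)
    X = rng.normal(size=(200, 10))
    y = (rng.uniform(size=200) < 0.5).astype(float)
    w = np.where(y == 1, 2.0, 1.0)   # e.g. upweight the positive class

    glmcv = GLMCV(distr='binomial', cv=3, score_metric='deviance')
    glmcv.fit(X, y, sample_weight=w)
    print(glmcv.score(X, y, sample_weight=w))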
10 changes: 6 additions & 4 deletions tests/test_pyglmnet.py
@@ -44,6 +44,7 @@ def test_gradients():
n_samples, n_features = 1000, 100
X = np.random.normal(0.0, 1.0, [n_samples, n_features])
X = scaler.fit_transform(X)
+w = np.ones(n_samples)

density = 0.1
beta_ = np.zeros(n_features + 1)
@@ -57,9 +58,9 @@ def test_gradients():
y = simulate_glm(glm.distr, beta_[0], beta_[1:], X)

func = partial(_L2loss, distr, glm.alpha,
-glm.Tau, reg_lambda, X, y, glm.eta, glm.group)
+glm.Tau, reg_lambda, X, y, w, glm.eta, glm.group)
grad = partial(_grad_L2loss, distr, glm.alpha, glm.Tau,
-reg_lambda, X, y,
+reg_lambda, X, y, w,
glm.eta)
approx_grad = approx_fprime(beta_, func, 1.5e-8)
analytical_grad = grad(beta_)
@@ -200,6 +201,7 @@ def test_glmnet():
X_train = np.random.normal(0.0, 1.0, [n_samples, n_features])
y_train = simulate_glm(distr, beta0, beta, X_train,
sample=False)
+w_train = np.ones_like(y_train)

alpha = 0.
reg_lambda = 0.
@@ -212,7 +214,7 @@ def callback(beta):

loss_trace.append(
_loss(distr, alpha, Tau, reg_lambda,
-X_train, y_train, eta, group, beta))
+X_train, y_train, w_train, eta, group, beta))

glm = GLM(distr, learning_rate=learning_rate,
reg_lambda=reg_lambda, tol=1e-3, max_iter=5000,
@@ -227,7 +229,7 @@ def callback(beta):

# verify loss at convergence = loss when beta=beta_
l_true = _loss(distr, 0., np.eye(beta.shape[0]), 0.,
-X_train, y_train, 2.0, None,
+X_train, y_train, w_train, 2.0, None,
np.concatenate(([beta0], beta)))
assert_allclose(loss_trace[-1], l_true, rtol=1e-4, atol=1e-5)
# beta=beta_ when reg_lambda = 0.
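
A follow-up property test worth considering (a suggestion, not part of this PR): with integer weights and no regularization, a weighted fit should approximately match an unweighted fit on correspondingly repeated rows, since both objectives are the same weighted mean.

    import numpy as np
    from pyglmnet import GLM

    rng = np.random.RandomState(1)
    X = rng.normal(size=(50, 3))
    y = rng.normal(size=50)
    reps = rng.randint(1, 4, size=50)

    # weighted fit vs. fit on rows repeated according to the integer weights
    glm_w = GLM(distr='gaussian', reg_lambda=0.0, tol=1e-6)
    glm_w.fit(X, y, sample_weight=reps.astype(float))
    glm_r = GLM(distr='gaussian', reg_lambda=0.0, tol=1e-6)
    glm_r.fit(np.repeat(X, reps, axis=0), np.repeat(y, reps))

    # agreement only up to optimizer tolerance, hence the loose atol
    assert np.allclose(glm_w.beta_, glm_r.beta_, atol=1e-2)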