diff --git a/doc/api.rst b/doc/api.rst index fd0f5047..9d5b1f0b 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -6,14 +6,32 @@ API Documentation .. currentmodule:: pyglmnet -Classes -======= +GLM Classes +=========== -.. autoclass:: GLM - :members: +.. currentmodule:: pyglmnet + +.. autosummary:: + :toctree: generated/ + + GLM + GLMCV + +Distribution Classes +==================== + +.. currentmodule:: pyglmnet.distributions + +.. autosummary:: + :toctree: generated/ -.. autoclass:: GLMCV - :members: + BaseDistribution + Poisson + PoissonSoftplus + NegBinomialSoftplus + Binomial + Probit + GammaSoftplus Datasets diff --git a/examples/plot_custom_distributions.py b/examples/plot_custom_distributions.py new file mode 100644 index 00000000..a13d38f6 --- /dev/null +++ b/examples/plot_custom_distributions.py @@ -0,0 +1,76 @@ +""" +====================== +Rolling out custom GLM +====================== + +This is an example demonstrating rolling out your custom +GLM class using Pyglmnet. +""" +######################################################## + +# Author: Pavan Ramkumar +# License: MIT + +from sklearn.model_selection import train_test_split +from pyglmnet import GLMCV, datasets + +######################################################## +# Download and preprocess data files + +X, y = datasets.fetch_community_crime_data() +n_samples, n_features = X.shape + +######################################################## +# Split the data into training and test sets + +X_train, X_test, y_train, y_test = \ + train_test_split(X, y, test_size=0.33, random_state=0) + +######################################################## +# Now we define our own distribution class. This must +# inherit from BaseDistribution. The BaseDistribution +# class requires defining the following methods: +# - mu: inverse link function +# - grad_mu: gradient of the inverse link function +# - log_likelihood: the log likelihood function +# - grad_log_likelihood: the gradient of the log +# likelihood. +# All distributions in pyglmnet inherit from BaseDistribution +# +# This is really powerful. For instance, we can start from +# the existing Binomial distribution and override mu and grad_mu +# if we want to use the cloglog link function. + +import numpy as np # noqa: E402 +from pyglmnet.distributions import Binomial # noqa: E402 + + +class CustomBinomial(Binomial): + """Custom binomial distribution.""" + + def mu(self, z): + """clogclog inverse link""" + mu = 1 - np.exp(-np.exp(z)) + return mu + + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = np.exp(1 - np.exp(z)) + return grad_mu + + +distr = CustomBinomial() + +######################################################## +# Now we pass it to the GLMCV class just as before. 
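For reference, the cloglog inverse link used by CustomBinomial above is mu(z) = 1 - exp(-exp(z)), whose analytic derivative is exp(z) * exp(-exp(z)) = exp(z - exp(z)). A minimal sketch, assuming only NumPy and SciPy (the helper names here are illustrative, not part of pyglmnet), that checks such a grad_mu against a finite-difference estimate:

import numpy as np
from scipy.optimize import approx_fprime


def cloglog_mu(z):
    """Inverse cloglog link: mu(z) = 1 - exp(-exp(z))."""
    return 1 - np.exp(-np.exp(z))


def cloglog_grad_mu(z):
    """Analytic derivative: d mu / d z = exp(z - exp(z))."""
    return np.exp(z - np.exp(z))


def check_cloglog_grad():
    """Compare the analytic gradient with a forward-difference estimate."""
    for z0 in (-2.0, 0.0, 1.5):
        numeric = approx_fprime(np.array([z0]),
                                lambda z: cloglog_mu(z)[0], 1e-8)[0]
        analytic = cloglog_grad_mu(np.array([z0]))[0]
        assert np.allclose(numeric, analytic, rtol=1e-4)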
+ +# use the default value for reg_lambda +glm = GLMCV(distr=distr, alpha=0.05, score_metric='pseudo_R2', cv=3, + tol=1e-4) + +# fit model +glm.fit(X_train, y_train) + +# score the test set prediction +y_test_hat = glm.predict_proba(X_test) +print("test set pseudo $R^2$ = %f" % glm.score(X_test, y_test)) diff --git a/pyglmnet/__init__.py b/pyglmnet/__init__.py index 89ae6e23..028a9030 100644 --- a/pyglmnet/__init__.py +++ b/pyglmnet/__init__.py @@ -19,7 +19,7 @@ __version__ = '1.2.dev0' -from .pyglmnet import GLM, GLMCV, _grad_L2loss, _L2loss, simulate_glm, _gradhess_logloss_1d, _loss, ALLOWED_DISTRS +from .pyglmnet import GLM, GLMCV, _grad_L2loss, _L2loss, simulate_glm, _loss, ALLOWED_DISTRS from .utils import softmax, label_binarizer, set_log_level from .datasets import fetch_tikhonov_data, fetch_rgc_spike_trains, fetch_community_crime_data, fetch_group_lasso_data from . import externals diff --git a/pyglmnet/distributions.py b/pyglmnet/distributions.py new file mode 100644 index 00000000..ffdc7279 --- /dev/null +++ b/pyglmnet/distributions.py @@ -0,0 +1,442 @@ +"""Exponential family distributions for GLM estimators.""" + +from abc import ABC, abstractmethod +import numpy as np +from scipy.special import expit, log1p, loggamma +from scipy.stats import norm + +from .externals.sklearn.utils import check_random_state + + +def softplus(z): + """Numerically stable version of log(1 + exp(z)).""" + # see stabilizing softplus: http://sachinashanbhag.blogspot.com/2014/05/numerically-approximation-of-log-1-expy.html # noqa + mu = z.copy() + mu[z > 35] = z[z > 35] + mu[z < -10] = np.exp(z[z < -10]) + mu[(z >= -10) & (z <= 35)] = log1p(np.exp(z[(z >= -10) & (z <= 35)])) + return mu + + +class BaseDistribution(ABC): + """Base class for distributions.""" + + def __init__(self): + """init.""" + pass + + @abstractmethod + def mu(self, z): + """Inverse link function.""" + pass + + @abstractmethod + def grad_mu(self, z): + """Gradient of inverse link.""" + pass + + @abstractmethod + def log_likelihood(self, y, y_hat, z=None): + """Log L2-penalized likelihood.""" + pass + + def grad_log_likelihood(self): + """Gradient of L2-penalized log likelihood.""" + msg = (f"""Gradients of log likelihood are not specified for {self.__class__} distribution""") # noqa + raise NotImplementedError(msg) + + def gradhess_log_likelihood_1d(self): + """One-dimensional Gradient and Hessian of log likelihood.""" + msg = (f"""Gradients and Hessians of 1-d log likelihood are not specified for {self.__class__} distribution""") # noqa + raise NotImplementedError(msg) + + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + msg = (f"""Simulation of model is not implemented for {self.__class__} distribution""") # noqa + raise NotImplementedError(msg) + + def _z(self, beta0, beta, X): + """Compute z to be passed through non-linearity.""" + return beta0 + np.dot(X, beta) + + +class SoftplusLink: + + def mu(self, z): + """Inverse link function.""" + mu = softplus(z) + return mu + + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = expit(z) + return grad_mu + + +class Gaussian(BaseDistribution): + """Class for Gaussian distribution.""" + + def mu(self, z): + """Inverse link function.""" + mu = z + return mu + + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = np.ones_like(z) + return grad_mu + + def log_likelihood(self, y, y_hat): + """Log L2-penalized likelihood.""" + log_likelihood = -0.5 * np.sum((y - y_hat) ** 2) + return log_likelihood 
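Stepping back to the softplus helper defined at the top of this new distributions module: the piecewise form matters numerically, because the naive log(1 + exp(z)) overflows for large z and rounds 1 + exp(z) to exactly 1.0 for very negative z. A minimal sketch, assuming only NumPy, of the same strategy:

import numpy as np

z = np.array([-50.0, 0.0, 800.0])

# naive formula: exp(800) overflows to inf, and 1 + exp(-50) rounds to 1.0
naive = np.log(1.0 + np.exp(z))           # -> [0.0, 0.693..., inf]

# piecewise strategy used by softplus() above
stable = z.copy()
stable[z > 35] = z[z > 35]                # log(1 + exp(z)) ~= z for large z
stable[z < -10] = np.exp(z[z < -10])      # log(1 + exp(z)) ~= exp(z) for very negative z
mid = (z >= -10) & (z <= 35)
stable[mid] = np.log1p(np.exp(z[mid]))    # -> [1.9e-22, 0.693..., 800.0]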
+ + def grad_log_likelihood(self, X, y, beta0, beta): + """Gradient of L2-penalized log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + grad_mu = self.grad_mu(z) + grad_beta0 = np.sum((mu - y) * grad_mu) + grad_beta = np.dot((mu - y).T, X * grad_mu[:, None]).T + return grad_beta0, grad_beta + + def gradhess_log_likelihood_1d(self, xk, y, z): + """One-dimensional Gradient and Hessian of log likelihood.""" + gk = np.sum((z - y) * xk) + hk = np.sum(xk * xk) + return gk, hk + + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + mu = self.mu(self._z(beta0, beta, X)) + if not sample: + return mu + else: + _random_state = check_random_state(random_state) + return _random_state.normal(mu) + + +class Poisson(BaseDistribution): + """Class for Poisson distribution.""" + + def __init__(self): + """init.""" + self.eta = None + + def mu(self, z): + """Inverse link function.""" + mu = z.copy() + mu0 = (1 - self.eta) * np.exp(self.eta) + mu[z > self.eta] = z[z > self.eta] * np.exp(self.eta) + mu0 + mu[z <= self.eta] = np.exp(z[z <= self.eta]) + return mu + + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = z.copy() + grad_mu[z > self.eta] = \ + np.ones_like(z)[z > self.eta] * np.exp(self.eta) + grad_mu[z <= self.eta] = np.exp(z[z <= self.eta]) + return grad_mu + + def log_likelihood(self, y, y_hat): + """Log L2-penalized likelihood.""" + eps = np.spacing(1) + log_likelihood = np.sum(y * np.log(y_hat + eps) - y_hat) + return log_likelihood + + def grad_log_likelihood(self, X, y, beta0, beta): + """Gradient of L2-penalized log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + grad_mu = self.grad_mu(z) + grad_beta0 = np.sum(grad_mu) - np.sum(y * grad_mu / mu) + grad_beta = ((np.dot(grad_mu.T, X) - + np.dot((y * grad_mu / mu).T, X)).T) + return grad_beta0, grad_beta + + def gradhess_log_likelihood_1d(self, xk, y, z): + """One-dimensional Gradient and Hessian of log likelihood.""" + mu = self.mu(z) + gk = np.sum((mu[z <= self.eta] - y[z <= self.eta]) * + xk[z <= self.eta]) + \ + np.exp(self.eta) * \ + np.sum((1 - y[z > self.eta] / mu[z > self.eta]) * + xk[z > self.eta]) + hk = np.sum(mu[z <= self.eta] * xk[z <= self.eta] ** 2) + \ + np.exp(self.eta) ** 2 * \ + np.sum(y[z > self.eta] / (mu[z > self.eta] ** 2) * + (xk[z > self.eta] ** 2)) + return gk, hk + + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + mu = self.mu(self._z(beta0, beta, X)) + if not sample: + return mu + else: + _random_state = check_random_state(random_state) + return _random_state.poisson(mu) + + +class PoissonSoftplus(SoftplusLink, Poisson): + """Class for Poisson distribution with softplus inverse link.""" + + def gradhess_log_likelihood_1d(self, xk, y, z): + """One-dimensional Gradient and Hessian of log likelihood.""" + mu = self.mu(z) + s = expit(z) + gk = np.sum(s * xk) - np.sum(y * s / mu * xk) + + grad_s = s * (1 - s) + grad_s_by_mu = grad_s / mu - s / (mu ** 2) + hk = np.sum(grad_s * xk ** 2) - np.sum(y * grad_s_by_mu * xk ** 2) + return gk, hk + + +class NegBinomialSoftplus(SoftplusLink, BaseDistribution): + """Class for Negative binomial distribution with softplus inverse link.""" + + def __init__(self): + """init.""" + self.theta = None + + def log_likelihood(self, y, y_hat): + """Log L2-penalized likelihood.""" + theta = self.theta + log_likelihood = \ + np.sum(loggamma(y + theta) - loggamma(theta) - loggamma(y + 1) + + theta * np.log(theta) + y * 
np.log(y_hat) - (theta + y) * + np.log(y_hat + theta)) + return log_likelihood + + def grad_log_likelihood(self, X, y, beta0, beta): + """Gradient of L2-penalized log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + grad_mu = self.grad_mu(z) + theta = self.theta + partial_beta_0 = grad_mu * ((theta + y) / (mu + theta) - y / mu) + grad_beta0 = np.sum(partial_beta_0) + grad_beta = np.dot(partial_beta_0.T, X) + return grad_beta0, grad_beta + + def gradhess_log_likelihood_1d(self, xk, y, z): + """One-dimensional Gradient and Hessian of log likelihood.""" + mu = self.mu(z) + grad_mu = expit(z) + hess_mu = np.exp(-z) * expit(z) ** 2 + theta = self.theta + gradient_beta_j = -grad_mu * (y / mu - (y + theta) / (mu + theta)) + partial_beta_0_1 = hess_mu * (y / mu - (y + theta) / (mu + theta)) + partial_beta_0_2 = grad_mu ** 2 * \ + ((y + theta) / (mu + theta) ** 2 - y / mu ** 2) + partial_beta_0 = -(partial_beta_0_1 + partial_beta_0_2) + gk = np.dot(gradient_beta_j.T, xk) + hk = np.dot(partial_beta_0.T, xk ** 2) + return gk, hk + + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + mu = self.mu(self._z(beta0, beta, X)) + if not sample: + return mu + else: + _random_state = check_random_state(random_state) + p = self.theta / (self.theta + mu) # Probability of success + return _random_state.negative_binomial(self.theta, p) + + +class Binomial(BaseDistribution): + """Class for binomial distribution.""" + + def mu(self, z): + """Inverse link function.""" + mu = expit(z) + return mu + + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = expit(z) * (1 - expit(z)) + return grad_mu + + def log_likelihood(self, y, y_hat, z=None): + """Log L2-penalized likelihood.""" + # prevents underflow + if z is not None: + log_likelihood = np.sum(y * z - np.log(1 + np.exp(z))) + # for scoring + else: + log_likelihood = \ + np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) + return log_likelihood + + def grad_log_likelihood(self, X, y, beta0, beta): + """Gradient of L2-penalized log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + grad_beta0 = np.sum(mu - y) + grad_beta = np.dot((mu - y).T, X).T + return grad_beta0, grad_beta + + def gradhess_log_likelihood_1d(self, xk, y, z): + """One-dimensional Gradient and Hessian of log likelihood.""" + mu = self.mu(z) + gk = np.sum((mu - y) * xk) + hk = np.sum(mu * (1.0 - mu) * xk * xk) + return gk, hk + + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + mu = self.mu(self._z(beta0, beta, X)) + if not sample: + return mu + else: + _random_state = check_random_state(random_state) + return _random_state.binomial(1, mu) + + +def _probit_g1(z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = np.log(-pdfz[z < -thresh] / z[z < -thresh]) + res[np.abs(z) <= thresh] = np.log(cdfz[np.abs(z) <= thresh]) + res[z > thresh] = -pdfz[z > thresh] / z[z > thresh] + return res + + +def _probit_g2(z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = pdfz[z < -thresh] / z[z < -thresh] + res[np.abs(z) <= thresh] = np.log(1 - cdfz[np.abs(z) <= thresh]) + res[z > thresh] = np.log(pdfz[z > thresh] / z[z > thresh]) + return res + + +def _probit_g3(z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = -z[z < -thresh] + res[np.abs(z) <= thresh] = \ + pdfz[np.abs(z) <= thresh] / cdfz[np.abs(z) <= thresh] + res[z > thresh] = pdfz[z > thresh] + return res + + +def 
_probit_g4(z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = pdfz[z < -thresh] + res[np.abs(z) <= thresh] = \ + pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh]) + res[z > thresh] = z[z > thresh] + return res + + +def _probit_g5(z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = 0 * z[z < -thresh] + res[np.abs(z) <= thresh] = \ + z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \ + cdfz[np.abs(z) <= thresh] + (pdfz[np.abs(z) <= thresh] / + cdfz[np.abs(z) <= thresh]) ** 2 + res[z > thresh] = z[z > thresh] * pdfz[z > thresh] + pdfz[z > thresh] ** 2 + return res + + +def _probit_g6(z, pdfz, cdfz, thresh=5): + res = np.zeros_like(z) + res[z < -thresh] = \ + pdfz[z < -thresh] ** 2 - z[z < -thresh] * pdfz[z < -thresh] + res[np.abs(z) <= thresh] = \ + (pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh])) ** 2 - \ + z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \ + (1 - cdfz[np.abs(z) <= thresh]) + res[z > thresh] = 0 * z[z > thresh] + return res + + +class Probit(BaseDistribution): + """Class for probit distribution.""" + + def mu(self, z): + """Inverse link function.""" + mu = norm.cdf(z) + return mu + + def grad_mu(self, z): + """Gradient of inverse link.""" + grad_mu = norm.pdf(z) + return grad_mu + + def log_likelihood(self, y, y_hat, z=None): + """Log L2-penalized likelihood.""" + if z is not None: + pdfz, cdfz = norm.pdf(z), norm.cdf(z) + log_likelihood = \ + np.sum(y * _probit_g1(z, pdfz, cdfz) + + (1 - y) * _probit_g2(z, pdfz, cdfz)) + else: + log_likelihood = \ + np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) + return log_likelihood + + def grad_log_likelihood(self, X, y, beta0, beta): + """Gradient of L2-penalized log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + grad_beta0 = np.sum(mu - y) + grad_beta = np.dot((mu - y).T, X).T + return grad_beta0, grad_beta + + def gradhess_log_likelihood_1d(self, xk, y, z): + """One-dimensional Gradient and Hessian of log likelihood.""" + pdfz = norm.pdf(z) + cdfz = norm.cdf(z) + gk = -np.sum((y * _probit_g3(z, pdfz, cdfz) - + (1 - y) * _probit_g4(z, pdfz, cdfz)) * xk) + hk = np.sum((y * _probit_g5(z, pdfz, cdfz) + + (1 - y) * _probit_g6(z, pdfz, cdfz)) * (xk * xk)) + return gk, hk + + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + mu = self.mu(self._z(beta0, beta, X)) + if not sample: + return mu + else: + _random_state = check_random_state(random_state) + return _random_state.binomial(1, mu) + + +class GammaSoftplus(SoftplusLink, BaseDistribution): + """Class for Gamma distribution with softplus inverse link.""" + + def log_likelihood(self, y, y_hat, z=None): + """Log L2-penalized likelihood.""" + # see + # https://www.statistics.ma.tum.de/fileadmin/w00bdb/www/czado/lec8.pdf + nu = 1. # shape parameter, exponential for now + log_likelihood = np.sum(nu * (-y / y_hat - np.log(y_hat))) + return log_likelihood + + def grad_log_likelihood(self, X, y, beta0, beta): + """Gradient of L2-penalized log likelihood.""" + z = self._z(beta0, beta, X) + mu = self.mu(z) + grad_mu = self.grad_mu(z) + nu = 1. 
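        # Derivation: for the exponential special case nu = 1, the log
        # likelihood above is sum(-y / mu - log(mu)), so d logL / d mu =
        # y / mu ** 2 - 1 / mu, and the chain rule multiplies by
        # grad_mu = d mu / d z. The minus signs below return the gradient
        # of the *negative* log likelihood, which is what _grad_L2loss
        # expects from every grad_log_likelihood implementation.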
+ grad_logl = (y / mu ** 2 - 1 / mu) * grad_mu + grad_beta0 = -nu * np.sum(grad_logl) + grad_beta = -nu * np.dot(grad_logl.T, X).T + return grad_beta0, grad_beta + + def gradhess_log_likelihood_1d(self, xk, y, z): + """One-dimensional Gradient and Hessian of log likelihood.""" + raise NotImplementedError('cdfast is not implemented for Gamma ' + 'distribution') + + def simulate(self, beta0, beta, X, random_state, sample): + """Simulate targets y given model beta0, beta, X.""" + mu = self.mu(self._z(beta0, beta, X)) + if not sample: + return mu + else: + return np.exp(mu) diff --git a/pyglmnet/externals/sklearn/utils/validation.py b/pyglmnet/externals/sklearn/utils/validation.py index 7cda34e4..3e09c504 100644 --- a/pyglmnet/externals/sklearn/utils/validation.py +++ b/pyglmnet/externals/sklearn/utils/validation.py @@ -119,7 +119,7 @@ def check_consistent_length(*arrays): uniques = np.unique(lengths) if len(uniques) > 1: raise ValueError("Found input variables with inconsistent numbers of" - " samples: %r" % [int(l) for l in lengths]) + " samples: %r" % [int(ll) for ll in lengths]) def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, diff --git a/pyglmnet/metrics.py b/pyglmnet/metrics.py index 27bd1080..d97754bc 100644 --- a/pyglmnet/metrics.py +++ b/pyglmnet/metrics.py @@ -1,7 +1,7 @@ -""" A set of scoring functions. """ +"""A set of scoring functions.""" import numpy as np -from .pyglmnet import _logL +from .distributions import Poisson, PoissonSoftplus, NegBinomialSoftplus def deviance(y, yhat, distr, theta): @@ -23,12 +23,13 @@ def deviance(y, yhat, distr, theta): score : float Deviance of the predicted labels. """ - if distr in ['softplus', 'poisson', 'neg-binomial']: - LS = _logL(distr, y, y, theta=theta) + if isinstance(distr, (Poisson, PoissonSoftplus, NegBinomialSoftplus)): + LS = distr.log_likelihood(y, y) else: LS = 0 - L1 = _logL(distr, y, yhat, theta=theta) + # L1 = _logL(distr, y, yhat, theta=theta) + L1 = distr.log_likelihood(y, yhat) score = -2 * (L1 - LS) return score @@ -55,15 +56,15 @@ def pseudo_R2(X, y, yhat, ynull_, distr, theta): score : float Pseudo-R2 score. 
""" - if distr in ['softplus', 'poisson', 'neg-binomial']: - LS = _logL(distr, y, y, theta=theta) + if isinstance(distr, (Poisson, PoissonSoftplus, NegBinomialSoftplus)): + LS = distr.log_likelihood(y, y) else: LS = 0 - L0 = _logL(distr, y, ynull_, theta=theta) - L1 = _logL(distr, y, yhat, theta=theta) + L0 = distr.log_likelihood(y, ynull_) + L1 = distr.log_likelihood(y, yhat) - if distr in ['softplus', 'poisson', 'neg-binomial']: + if isinstance(distr, (Poisson, PoissonSoftplus, NegBinomialSoftplus)): score = (1 - (LS - L1) / (LS - L0)) else: score = (1 - L1 / L0) diff --git a/pyglmnet/pyglmnet.py b/pyglmnet/pyglmnet.py index 9e00a9c1..b3fe8b40 100644 --- a/pyglmnet/pyglmnet.py +++ b/pyglmnet/pyglmnet.py @@ -4,161 +4,21 @@ from copy import deepcopy import numpy as np -from scipy.special import expit, loggamma -from scipy.stats import norm from .utils import logger, set_log_level, _check_params, \ - _verbose_iterable, _tqdm_log, softplus + _verbose_iterable, _tqdm_log from .base import BaseEstimator, is_classifier, check_version from .externals.sklearn.utils import check_random_state, check_array, check_X_y from .externals.sklearn.utils.validation import check_is_fitted +from .distributions import BaseDistribution, Gaussian, Poisson, \ + PoissonSoftplus, NegBinomialSoftplus, Binomial, Probit, GammaSoftplus + ALLOWED_DISTRS = ['gaussian', 'binomial', 'softplus', 'poisson', 'probit', 'gamma', 'neg-binomial'] -def _probit_g1(z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = np.log(-pdfz[z < -thresh] / z[z < -thresh]) - res[np.abs(z) <= thresh] = np.log(cdfz[np.abs(z) <= thresh]) - res[z > thresh] = -pdfz[z > thresh] / z[z > thresh] - return res - - -def _probit_g2(z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = pdfz[z < -thresh] / z[z < -thresh] - res[np.abs(z) <= thresh] = np.log(1 - cdfz[np.abs(z) <= thresh]) - res[z > thresh] = np.log(pdfz[z > thresh] / z[z > thresh]) - return res - - -def _probit_g3(z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = -z[z < -thresh] - res[np.abs(z) <= thresh] = \ - pdfz[np.abs(z) <= thresh] / cdfz[np.abs(z) <= thresh] - res[z > thresh] = pdfz[z > thresh] - return res - - -def _probit_g4(z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = pdfz[z < -thresh] - res[np.abs(z) <= thresh] = \ - pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh]) - res[z > thresh] = z[z > thresh] - return res - - -def _probit_g5(z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = 0 * z[z < -thresh] - res[np.abs(z) <= thresh] = \ - z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \ - cdfz[np.abs(z) <= thresh] + (pdfz[np.abs(z) <= thresh] / - cdfz[np.abs(z) <= thresh]) ** 2 - res[z > thresh] = z[z > thresh] * pdfz[z > thresh] + pdfz[z > thresh] ** 2 - return res - - -def _probit_g6(z, pdfz, cdfz, thresh=5): - res = np.zeros_like(z) - res[z < -thresh] = \ - pdfz[z < -thresh] ** 2 - z[z < -thresh] * pdfz[z < -thresh] - res[np.abs(z) <= thresh] = \ - (pdfz[np.abs(z) <= thresh] / (1 - cdfz[np.abs(z) <= thresh])) ** 2 - \ - z[np.abs(z) <= thresh] * pdfz[np.abs(z) <= thresh] / \ - (1 - cdfz[np.abs(z) <= thresh]) - res[z > thresh] = 0 * z[z > thresh] - return res - - -def _z(beta0, beta, X, fit_intercept): - """Compute z to be passed through non-linearity.""" - if fit_intercept: - z = beta0 + np.dot(X, beta) - else: - z = np.dot(X, np.r_[beta0, beta]) - return z - - -def _lmb(distr, beta0, beta, X, eta, fit_intercept=True): - """Conditional intensity function.""" - z = 
_z(beta0, beta, X, fit_intercept) - return _mu(distr, z, eta, fit_intercept) - - -def _mu(distr, z, eta, fit_intercept): - """The non-linearity (inverse link).""" - if distr in ['softplus', 'gamma', 'neg-binomial']: - mu = softplus(z) - elif distr == 'poisson': - mu = z.copy() - beta0 = (1 - eta) * np.exp(eta) if fit_intercept else 0. - mu[z > eta] = z[z > eta] * np.exp(eta) + beta0 - mu[z <= eta] = np.exp(z[z <= eta]) - elif distr == 'gaussian': - mu = z - elif distr == 'binomial': - mu = expit(z) - elif distr == 'probit': - mu = norm.cdf(z) - return mu - - -def _grad_mu(distr, z, eta): - """Derivative of the non-linearity.""" - if distr in ['softplus', 'gamma', 'neg-binomial']: - grad_mu = expit(z) - elif distr == 'poisson': - grad_mu = z.copy() - grad_mu[z > eta] = np.ones_like(z)[z > eta] * np.exp(eta) - grad_mu[z <= eta] = np.exp(z[z <= eta]) - elif distr == 'gaussian': - grad_mu = np.ones_like(z) - elif distr == 'binomial': - grad_mu = expit(z) * (1 - expit(z)) - elif distr in 'probit': - grad_mu = norm.pdf(z) - return grad_mu - - -def _logL(distr, y, y_hat, z=None, theta=1.0): - """The log likelihood.""" - if distr in ['softplus', 'poisson']: - eps = np.spacing(1) - logL = np.sum(y * np.log(y_hat + eps) - y_hat) - elif distr == 'gaussian': - logL = -0.5 * np.sum((y - y_hat)**2) - elif distr == 'binomial': - - # prevents underflow - if z is not None: - logL = np.sum(y * z - np.log(1 + np.exp(z))) - # for scoring - else: - logL = np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) - elif distr == 'probit': - if z is not None: - pdfz, cdfz = norm.pdf(z), norm.cdf(z) - logL = np.sum(y * _probit_g1(z, pdfz, cdfz) + - (1 - y) * _probit_g2(z, pdfz, cdfz)) - else: - logL = np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) - elif distr == 'gamma': - # see - # https://www.statistics.ma.tum.de/fileadmin/w00bdb/www/czado/lec8.pdf - nu = 1. # shape parameter, exponential for now - logL = np.sum(nu * (-y / y_hat - np.log(y_hat))) - elif distr == 'neg-binomial': - logL = np.sum(loggamma(y + theta) - loggamma(theta) - loggamma(y + 1) + - theta * np.log(theta) + y * np.log(y_hat) - (theta + y) * - np.log(y_hat + theta)) - return logL - - def _penalty(alpha, beta, Tau, group): """The penalty.""" # Combine L1 and L2 penalty terms @@ -205,9 +65,15 @@ def _loss(distr, alpha, Tau, reg_lambda, X, y, eta, theta, group, beta, fit_intercept=True): """Define the objective function for elastic net.""" n_samples, n_features = X.shape - z = _z(beta[0], beta[1:], X, fit_intercept) - y_hat = _mu(distr, z, eta, fit_intercept) - L = 1. / n_samples * _logL(distr, y, y_hat, z, theta) + if fit_intercept: + z = distr._z(beta[0], beta[1:], X) + else: + z = distr._z(0., beta, X) + y_hat = distr.mu(z) + if isinstance(distr, (Binomial, Probit)): + L = 1. / n_samples * distr.log_likelihood(y, y_hat, z) + else: + L = 1. / n_samples * distr.log_likelihood(y, y_hat) if fit_intercept: P = _penalty(alpha, beta[1:], Tau, group) else: @@ -220,9 +86,15 @@ def _L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, theta, group, beta, fit_intercept=True): """Define the objective function for elastic net.""" n_samples, n_features = X.shape - z = _z(beta[0], beta[1:], X, fit_intercept) - y_hat = _mu(distr, z, eta, fit_intercept) - L = 1. / n_samples * _logL(distr, y, y_hat, z, theta) + if fit_intercept: + z = distr._z(beta[0], beta[1:], X) + else: + z = distr._z(0., beta, X) + y_hat = distr.mu(z) + if isinstance(distr, (Binomial, Probit)): + L = 1. / n_samples * distr.log_likelihood(y, y_hat, z) + else: + L = 1. 
/ n_samples * distr.log_likelihood(y, y_hat) if fit_intercept: P = 0.5 * (1 - alpha) * _L2penalty(beta[1:], Tau) else: @@ -244,51 +116,16 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, theta, beta, Tau = np.eye(beta.shape[0]) InvCov = np.dot(Tau.T, Tau) - z = _z(beta[0], beta[1:], X, fit_intercept) - mu = _mu(distr, z, eta, fit_intercept) - grad_mu = _grad_mu(distr, z, eta) - - grad_beta0 = 0. - if distr in ['poisson', 'softplus']: - if fit_intercept: - grad_beta0 = np.sum(grad_mu) - np.sum(y * grad_mu / mu) - grad_beta = ((np.dot(grad_mu.T, X) - - np.dot((y * grad_mu / mu).T, X)).T) - - elif distr == 'gaussian': - if fit_intercept: - grad_beta0 = np.sum((mu - y) * grad_mu) - grad_beta = np.dot((mu - y).T, X * grad_mu[:, None]).T - - elif distr == 'binomial': - if fit_intercept: - grad_beta0 = np.sum(mu - y) - grad_beta = np.dot((mu - y).T, X).T - - elif distr == 'probit': - grad_logl = (y * _probit_g3(z, grad_mu, mu) - - (1 - y) * _probit_g4(z, grad_mu, mu)) - if fit_intercept: - grad_beta0 = -np.sum(grad_logl) - grad_beta = -np.dot(grad_logl.T, X).T - - elif distr == 'gamma': - nu = 1. - grad_logl = (y / mu ** 2 - 1 / mu) * grad_mu - if fit_intercept: - grad_beta0 = -nu * np.sum(grad_logl) - grad_beta = -nu * np.dot(grad_logl.T, X).T - - elif distr == 'neg-binomial': - partial_beta_0 = grad_mu * ((theta + y) / (mu + theta) - y / mu) - - if fit_intercept: - grad_beta0 = np.sum(partial_beta_0) - - grad_beta = np.dot(partial_beta_0.T, X) + if fit_intercept: + beta0_, beta_ = beta[0], beta[1:] + else: + beta0_, beta_ = 0., beta + grad_beta0, grad_beta = distr.grad_log_likelihood(X, y, beta0_, beta_) + grad_beta0 = grad_beta0 if fit_intercept else 0. grad_beta0 *= 1. / n_samples grad_beta *= 1. / n_samples + if fit_intercept: grad_beta += reg_lambda * (1 - alpha) * np.dot(InvCov, beta[1:]) g = np.zeros((n_features + 1, )) @@ -301,105 +138,6 @@ def _grad_L2loss(distr, alpha, Tau, reg_lambda, X, y, eta, theta, beta, return g -def _gradhess_logloss_1d(distr, xk, y, z, eta, theta, fit_intercept=True): - """ - Compute gradient (1st derivative) - and Hessian (2nd derivative) - of log likelihood for a single coordinate. 
- - Parameters - ---------- - xk: float - (n_samples,) - y: float - (n_samples,) - z: float - (n_samples,) - - Returns - ------- - gk: gradient, float: - (n_features + 1,) - hk: float: - (n_features + 1,) - """ - n_samples = xk.shape[0] - - if distr == 'softplus': - mu = _mu(distr, z, eta, fit_intercept) - s = expit(z) - gk = np.sum(s * xk) - np.sum(y * s / mu * xk) - - grad_s = s * (1 - s) - grad_s_by_mu = grad_s / mu - s / (mu ** 2) - hk = np.sum(grad_s * xk ** 2) - np.sum(y * grad_s_by_mu * xk ** 2) - - elif distr == 'poisson': - mu = _mu(distr, z, eta, fit_intercept) - s = expit(z) - gk = np.sum((mu[z <= eta] - y[z <= eta]) * - xk[z <= eta]) + \ - np.exp(eta) * \ - np.sum((1 - y[z > eta] / mu[z > eta]) * - xk[z > eta]) - hk = np.sum(mu[z <= eta] * xk[z <= eta] ** 2) + \ - np.exp(eta) ** 2 * \ - np.sum(y[z > eta] / (mu[z > eta] ** 2) * - (xk[z > eta] ** 2)) - - elif distr == 'gaussian': - gk = np.sum((z - y) * xk) - hk = np.sum(xk * xk) - - elif distr == 'binomial': - mu = _mu(distr, z, eta, fit_intercept) - gk = np.sum((mu - y) * xk) - hk = np.sum(mu * (1.0 - mu) * xk * xk) - - elif distr == 'probit': - pdfz = norm.pdf(z) - cdfz = norm.cdf(z) - gk = -np.sum((y * _probit_g3(z, pdfz, cdfz) - - (1 - y) * _probit_g4(z, pdfz, cdfz)) * xk) - hk = np.sum((y * _probit_g5(z, pdfz, cdfz) + - (1 - y) * _probit_g6(z, pdfz, cdfz)) * (xk * xk)) - - elif distr == 'neg-binomial': - mu = _mu(distr, z, eta, fit_intercept) - grad_mu = _grad_mu(distr, z, eta) - hess_mu = np.exp(-z) * expit(z)**2 - - gradient_beta_j = -grad_mu * (y / mu - (y + theta) / (mu + theta)) - partial_beta_0_1 = hess_mu * (y / mu - (y + theta) / (mu + theta)) - partial_beta_0_2 = grad_mu**2 * \ - ((y + theta) / (mu + theta)**2 - y / mu**2) - partial_beta_0 = -(partial_beta_0_1 + partial_beta_0_2) - gk = np.dot(gradient_beta_j.T, xk) - hk = np.dot(partial_beta_0.T, xk**2) - - elif distr == 'gamma': - raise NotImplementedError('cdfast is not implemented for Gamma ' - 'distribution') - elif distr == 'neg-binomial': - mu = _mu(distr, z, eta, fit_intercept) - grad_mu = _grad_mu(distr, z, eta) - hess_mu = np.exp(-z) * expit(z) ** 2 - - gradient_beta_j = -grad_mu * (y / mu - (y + theta) / (mu + theta)) - partial_beta_0_1 = hess_mu * (y / mu - (y + theta) / (mu + theta)) - partial_beta_0_2 = grad_mu ** 2 * \ - ((y + theta) / (mu + theta) ** 2 - y / mu ** 2) - partial_beta_0 = -(partial_beta_0_1 + partial_beta_0_2) - gk = np.dot(gradient_beta_j.T, xk) - hk = np.dot(partial_beta_0.T, xk ** 2) - - elif distr == 'gamma': - raise NotImplementedError('cdfast is not implemented for Gamma ' - 'distribution') - - return 1. / n_samples * gk, 1. / n_samples * hk - - def simulate_glm(distr, beta0, beta, X, eta=2.0, random_state=None, sample=False, theta=1.0, fit_intercept=True): """Simulate target data under a generative model. @@ -427,9 +165,15 @@ def simulate_glm(distr, beta0, beta, X, eta=2.0, random_state=None, y: array simulated target data of shape (n_samples,) """ - if distr not in ALLOWED_DISTRS: - raise ValueError("'distr' must be in %s, got %s" - % (repr(ALLOWED_DISTRS), distr)) + err_msg = ('distr must be one of %s or a subclass of BaseDistribution. 
' + 'Got %s' % (', '.join(ALLOWED_DISTRS), distr)) + if isinstance(distr, str) and distr not in ALLOWED_DISTRS: + raise ValueError(err_msg) + if not isinstance(distr, str) and not isinstance(distr, BaseDistribution): + raise TypeError(err_msg) + + glm = GLM(distr=distr) + glm._set_distr() if not isinstance(beta0, float): raise ValueError("'beta0' must be float, got %s" % type(beta0)) @@ -437,30 +181,10 @@ def simulate_glm(distr, beta0, beta, X, eta=2.0, random_state=None, if beta.ndim != 1: raise ValueError("'beta' must be 1D, got %dD" % beta.ndim) - if not sample: - return _lmb(distr, beta0, beta, X, eta, fit_intercept=fit_intercept) - - _random_state = check_random_state(random_state) - if distr == 'softplus' or distr == 'poisson': - y = _random_state.poisson( - _lmb(distr, beta0, beta, X, eta, - fit_intercept=fit_intercept)) - if distr == 'gaussian': - y = _random_state.normal( - _lmb(distr, beta0, beta, X, eta, - fit_intercept=fit_intercept)) - if distr == 'binomial' or distr == 'probit': - y = _random_state.binomial( - 1, - _lmb(distr, beta0, beta, X, eta, - fit_intercept=fit_intercept)) - if distr == 'gamma': - mu = _lmb(distr, beta0, beta, X, eta, fit_intercept=fit_intercept) - y = np.exp(mu) - if distr == 'neg-binomial': - mu = _lmb(distr, beta0, beta, X, eta, fit_intercept=fit_intercept) - p = theta / (theta + mu) # Probability of success - y = _random_state.negative_binomial(theta, p) + if not fit_intercept: + beta0 = 0. + + y = glm.distr_.simulate(beta0, beta, X, random_state, sample) return y @@ -495,8 +219,8 @@ class GLM(BaseEstimator): Parameters ---------- - distr: str - distribution family can be one of the following + distr: str or object subclassed from BaseDistribution + if str, distribution family can be one of the following 'gaussian' | 'binomial' | 'poisson' | 'softplus' | 'probit' | 'gamma' | 'neg-binomial' default: 'poisson'. @@ -604,7 +328,6 @@ def __init__(self, distr='poisson', alpha=0.5, _check_params(distr=distr, max_iter=max_iter, fit_intercept=fit_intercept) - self.distr = distr self.alpha = alpha self.reg_lambda = reg_lambda @@ -623,9 +346,27 @@ def __init__(self, distr='poisson', alpha=0.5, self.theta = theta set_log_level(verbose) + def _set_distr(self): + distr_lookup = { + 'gaussian': Gaussian(), + 'poisson': Poisson(), + 'softplus': PoissonSoftplus(), + 'neg-binomial': NegBinomialSoftplus(), + 'binomial': Binomial(), + 'probit': Probit(), + 'gamma': GammaSoftplus() + } + self.distr_ = distr_lookup[self.distr] + if isinstance(self.distr_, Poisson): + self.distr_.eta = self.eta + if isinstance(self.distr_, NegBinomialSoftplus): + self.distr_.theta = self.theta + def _set_cv(cv, estimator=None, X=None, y=None): - """Set the default CV depending on whether clf - is classifier/regressor.""" + """Set the default CV. + + Depends on whether clf is classifier/regressor. 
+ """ # Detect whether classification or regression if estimator in ['classifier', 'regressor']: est_is_classifier = estimator == 'classifier' @@ -757,7 +498,10 @@ def _cdfast(self, X, y, ActiveSet, beta, rl, fit_intercept=True): """ n_samples, n_features = X.shape reg_scale = rl * (1 - self.alpha) - z = _z(beta[0], beta[1:], X, fit_intercept) + if fit_intercept: + z = self.distr_._z(beta[0], beta[1:], X) + else: + z = self.distr_._z(0., beta, X) for k in range(0, n_features + int(fit_intercept)): # Only update parameters in active set if ActiveSet[k] != 0: @@ -770,8 +514,11 @@ def _cdfast(self, X, y, ActiveSet, beta, rl, fit_intercept=True): xk = X[:, k] # Calculate grad and hess of log likelihood term - gk, hk = _gradhess_logloss_1d(self.distr, xk, y, z, self.eta, - self.theta, fit_intercept) + # gk, hk = _gradhess_logloss_1d(self.distr, xk, y, z, self.eta, + # self.theta, fit_intercept) + gk, hk = self.distr_.gradhess_log_likelihood_1d(xk, y, z) + gk = 1. / n_samples * gk + hk = 1. / n_samples * hk # Add grad and hess of regularization term if self.Tau is None: @@ -795,6 +542,7 @@ def _cdfast(self, X, y, ActiveSet, beta, rl, fit_intercept=True): # Update parameters, z beta[k], z = beta[k] - update, z - update * xk + return beta def fit(self, X, y): @@ -814,7 +562,7 @@ def fit(self, X, y): The fitted model. """ X, y = check_X_y(X, y, accept_sparse=False) - + self._set_distr() self.beta0_ = None self.beta_ = None self.ynull_ = None @@ -887,7 +635,7 @@ def fit(self, X, y): self.n_iter_ += 1 beta_old = beta.copy() if self.solver == 'batch-gradient': - grad = _grad_L2loss(self.distr, + grad = _grad_L2loss(self.distr_, alpha, self.Tau, reg_lambda, X, y, self.eta, self.theta, beta, self.fit_intercept) @@ -998,10 +746,10 @@ def predict(self, X): raise ValueError('Input data should be of type ndarray (got %s).' % type(X)) - yhat = _lmb(self.distr, self.beta0_, self.beta_, X, self.eta, - fit_intercept=True) + z = self.distr_._z(self.beta0_, self.beta_, X) + yhat = self.distr_.mu(z) - if self.distr == 'binomial': + if isinstance(self.distr_, Binomial): yhat = (yhat > 0.5).astype(int) yhat = np.asarray(yhat) return yhat @@ -1020,7 +768,7 @@ def _predict_proba(self, X): The predicted targets of shape (n_samples,). """ - if self.distr not in ['binomial', 'probit']: + if not isinstance(self.distr_, (Binomial, Probit)): raise ValueError('This is only applicable for \ the binomial distribution.') @@ -1028,8 +776,8 @@ def _predict_proba(self, X): raise ValueError('Input data should be of type ndarray (got %s).' 
% type(X)) - yhat = _lmb(self.distr, self.beta0_, self.beta_, X, self.eta, - fit_intercept=True) + z = self.distr_._z(self.beta0_, self.beta_, X) + yhat = self.distr_.mu(z) yhat = np.asarray(yhat) return yhat @@ -1055,7 +803,7 @@ def predict_proba(self, X): X = check_array(X, accept_sparse=False) check_is_fitted(self, 'is_fitted_') - if self.distr in ['binomial', 'probit']: + if isinstance(self.distr_, (Binomial, Probit)): return self._predict_proba(X) else: warnings.warn('This is only applicable for \ @@ -1110,14 +858,14 @@ def score(self, X, y): # For f1 as well if self.score_metric in ['accuracy']: - if self.distr not in ['binomial', 'multinomial']: + if not isinstance(self.distr_, (Binomial, Probit)): raise ValueError(self.score_metric + ' is only defined for binomial ' + 'or multinomial distributions') y = np.asarray(y).ravel() - if self.distr in ['binomial', 'probit'] and \ + if isinstance(self.distr_, (Binomial, Probit)) and \ self.score_metric != 'accuracy': yhat = self.predict_proba(X) else: @@ -1125,10 +873,10 @@ def score(self, X, y): # Check whether we have a list of estimators or a single estimator if self.score_metric == 'deviance': - return metrics.deviance(y, yhat, self.distr, self.theta) + return metrics.deviance(y, yhat, self.distr_, self.theta) elif self.score_metric == 'pseudo_R2': return metrics.pseudo_R2(X, y, yhat, self.ynull_, - self.distr, self.theta) + self.distr_, self.theta) if self.score_metric == 'accuracy': return metrics.accuracy(y, yhat) diff --git a/pyglmnet/utils.py b/pyglmnet/utils.py index ce8bc864..6f9046c0 100644 --- a/pyglmnet/utils.py +++ b/pyglmnet/utils.py @@ -1,11 +1,8 @@ -""" -A few miscellaneous helper functions for pyglmnet.py -""" +"""A few miscellaneous helper functions for pyglmnet.""" import numpy as np from copy import copy import logging -from scipy.special import log1p logger = logging.getLogger('pyglmnet') @@ -21,16 +18,6 @@ has_tqdm = False -def softplus(z): - """Numerically stable version of log(1 + exp(z)).""" - # see stabilizing softplus: http://sachinashanbhag.blogspot.com/2014/05/numerically-approximation-of-log-1-expy.html # noqa - mu = z.copy() - mu[z > 35] = z[z > 35] - mu[z < -10] = np.exp(z[z < -10]) - mu[(z >= -10) & (z <= 35)] = log1p(np.exp(z[(z >= -10) & (z <= 35)])) - return mu - - def softmax(w): """Softmax function of given array of number w. @@ -113,9 +100,10 @@ def tikhonov_from_prior(prior_cov, n_samples, threshold=0.0001): def _check_params(distr, max_iter, fit_intercept): from .pyglmnet import ALLOWED_DISTRS + err_msg = ('distr must be one of %s or a subclass of BaseDistribution. 
' + 'Got %s' % (', '.join(ALLOWED_DISTRS), distr)) if distr not in ALLOWED_DISTRS: - raise ValueError('distr must be one of %s, Got ' - '%s' % (', '.join(ALLOWED_DISTRS), distr)) + raise ValueError(err_msg) if not isinstance(max_iter, int): raise ValueError('max_iter must be of type int') diff --git a/tests/test_distributions.py b/tests/test_distributions.py new file mode 100644 index 00000000..ef4fe3c4 --- /dev/null +++ b/tests/test_distributions.py @@ -0,0 +1,71 @@ +"""Tests for distributions.""" + +from functools import partial + +import pytest + +import numpy as np +from numpy.testing import assert_allclose + +import scipy.sparse as sps +from scipy.optimize import approx_fprime +from sklearn.preprocessing import StandardScaler + +from pyglmnet import ALLOWED_DISTRS, simulate_glm, GLM, _grad_L2loss, _L2loss +from pyglmnet.distributions import BaseDistribution + + +def test_base_distribution(): + """Test the base distribution.""" + class TestDistr(BaseDistribution): + def mu(): + pass + + def grad_mu(): + pass + + class TestDistr2(TestDistr): + def log_likelihood(): + pass + + with pytest.raises(TypeError, match='abstract methods log_likelihood'): + distr = TestDistr() + + msg = 'Gradients of log likelihood are not specified' + with pytest.raises(NotImplementedError, match=msg): + distr = TestDistr2() + distr.grad_log_likelihood() + + msg = 'distr must be one of' + with pytest.raises(ValueError, match=msg): + GLM(distr='blah') + + +@pytest.mark.parametrize("distr", ALLOWED_DISTRS) +def test_gradients(distr): + """Test gradient accuracy.""" + # data + scaler = StandardScaler() + n_samples, n_features = 1000, 100 + X = np.random.normal(0.0, 1.0, [n_samples, n_features]) + X = scaler.fit_transform(X) + + density = 0.1 + beta_ = np.zeros(n_features + 1) + beta_[0] = np.random.rand() + beta_[1:] = sps.rand(n_features, 1, density=density).toarray()[:, 0] + + reg_lambda = 0.1 + + glm = GLM(distr=distr, reg_lambda=reg_lambda) + glm._set_distr() + y = simulate_glm(glm.distr, beta_[0], beta_[1:], X) + + func = partial(_L2loss, glm.distr_, glm.alpha, + glm.Tau, reg_lambda, X, y, glm.eta, glm.theta, glm.group) + grad = partial(_grad_L2loss, glm.distr_, glm.alpha, glm.Tau, + reg_lambda, X, y, + glm.eta, glm.theta) + approx_grad = approx_fprime(beta_, func, 1.5e-8) + analytical_grad = grad(beta_) + assert_allclose(approx_grad, analytical_grad, rtol=1e-5, atol=1e-3) diff --git a/tests/test_pyglmnet.py b/tests/test_pyglmnet.py index b9401a63..7f10e689 100644 --- a/tests/test_pyglmnet.py +++ b/tests/test_pyglmnet.py @@ -3,14 +3,12 @@ import subprocess import os.path as op -from functools import partial import pytest import numpy as np from numpy.testing import assert_allclose, assert_array_equal from pytest import raises import scipy.sparse as sps -from scipy.optimize import approx_fprime from sklearn.datasets import make_regression from sklearn.preprocessing import StandardScaler @@ -18,8 +16,8 @@ from sklearn.linear_model import ElasticNet from sklearn.utils.estimator_checks import check_estimator -from pyglmnet import (GLM, GLMCV, _grad_L2loss, _L2loss, simulate_glm, - _gradhess_logloss_1d, _loss, datasets, ALLOWED_DISTRS) +from pyglmnet import (GLM, GLMCV, simulate_glm, + _loss, datasets, ALLOWED_DISTRS) matplotlib.use('agg') @@ -29,35 +27,6 @@ def test_glm_estimator(): check_estimator(GLM) -@pytest.mark.parametrize("distr", ALLOWED_DISTRS) -def test_gradients(distr): - """Test gradient accuracy.""" - # data - scaler = StandardScaler() - n_samples, n_features = 1000, 100 - X = np.random.normal(0.0, 
1.0, [n_samples, n_features]) - X = scaler.fit_transform(X) - - density = 0.1 - beta_ = np.zeros(n_features + 1) - beta_[0] = np.random.rand() - beta_[1:] = sps.rand(n_features, 1, density=density).toarray()[:, 0] - - reg_lambda = 0.1 - - glm = GLM(distr=distr, reg_lambda=reg_lambda) - y = simulate_glm(glm.distr, beta_[0], beta_[1:], X) - - func = partial(_L2loss, distr, glm.alpha, - glm.Tau, reg_lambda, X, y, glm.eta, glm.theta, glm.group) - grad = partial(_grad_L2loss, distr, glm.alpha, glm.Tau, - reg_lambda, X, y, - glm.eta, glm.theta) - approx_grad = approx_fprime(beta_, func, 1.5e-8) - analytical_grad = grad(beta_) - assert_allclose(approx_grad, analytical_grad, rtol=1e-5, atol=1e-3) - - def test_tikhonov(): """Tikhonov regularization test.""" n_samples, n_features = 100, 10 @@ -203,10 +172,13 @@ def test_glmnet(distr, reg_lambda, fit_intercept, solver): group = None Tau = None + glm_ = GLM(distr=distr) + glm_._set_distr() + def callback(beta): Tau = None loss_trace.append( - _loss(distr, alpha, Tau, reg_lambda, + _loss(glm_.distr_, alpha, Tau, reg_lambda, X_train, y_train, eta, theta, group, beta, fit_intercept=fit_intercept)) @@ -225,7 +197,7 @@ def callback(beta): # true loss and beta should be recovered when reg_lambda == 0 if reg_lambda == 0.: # verify loss at convergence = loss when beta=beta_ - l_true = _loss(distr, alpha, Tau, reg_lambda, + l_true = _loss(glm_.distr_, alpha, Tau, reg_lambda, X_train, y_train, eta, theta, group, np.concatenate(([beta0], beta))) assert_allclose(loss_trace[-1], l_true, rtol=1e-4, atol=1e-5) @@ -316,7 +288,7 @@ def test_compare_sklearn(solver): def rmse(a, b): return np.sqrt(np.mean((a - b) ** 2)) - X, Y, coef_ = make_regression( + X, y, coef_ = make_regression( n_samples=1000, n_features=500, noise=0.1, n_informative=10, coef=True, random_state=42) @@ -325,18 +297,18 @@ def rmse(a, b): l1_ratio = 0.5 clf = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, tol=1e-5) - clf.fit(X, Y) + clf.fit(X, y) glm = GLM(distr='gaussian', alpha=l1_ratio, reg_lambda=alpha, solver=solver, tol=1e-6, max_iter=500) - glm.fit(X, Y) + glm.fit(X, y) y_sk = clf.predict(X) y_pg = glm.predict(X) - assert abs(rmse(Y, y_sk) - rmse(Y, y_pg)) < 0.5 + assert abs(rmse(y, y_sk) - rmse(y, y_pg)) < 0.5 glm = GLM(distr='gaussian', alpha=l1_ratio, reg_lambda=alpha, solver=solver, tol=1e-6, max_iter=5, fit_intercept=False) - glm.fit(X, Y) + glm.fit(X, y) assert glm.beta0_ == 0. glm.predict(X) @@ -356,6 +328,7 @@ def test_cdfast(distr): return glm = GLM(distr, solver='cdfast') + glm._set_distr() np.random.seed(glm.random_state) @@ -374,7 +347,9 @@ def test_cdfast(distr): z = beta_[0] + np.dot(X, beta_[1:]) k = 1 xk = X[:, k - 1] - gk, hk = _gradhess_logloss_1d(glm.distr, xk, y, z, glm.eta, glm.theta) + gk, hk = glm.distr_.gradhess_log_likelihood_1d(xk, y, z) + gk = 1. / n_samples * gk + hk = 1. 
/ n_samples * hk # test grad and hess if distr != 'multinomial': @@ -438,7 +413,6 @@ def test_random_state_consistency(): @pytest.mark.parametrize("distr", ALLOWED_DISTRS) def test_simulate_glm(distr): """Test that every generative model can be simulated from.""" - random_state = 1 state = np.random.RandomState(random_state) n_samples, n_features = 10, 3 @@ -458,13 +432,12 @@ def test_simulate_glm(distr): # If the distribution name is garbage it will fail distr = 'multivariate_gaussian_poisson' - with pytest.raises(ValueError, match="'distr' must be in"): + with pytest.raises(ValueError): simulate_glm(distr, 1.0, 1.0, np.array([[1.0]])) def test_api_input(): """Test that the input value of y can be of different types.""" - random_state = 1 state = np.random.RandomState(random_state) n_samples, n_features = 100, 5
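Taken together, the distribution objects introduced in this patch can also be exercised on their own, outside of GLM. A minimal sketch, assuming the pyglmnet.distributions layout above, that mirrors the spirit of the new gradient test but checks a distribution's grad_log_likelihood directly against a finite-difference estimate of its (negative) log likelihood:

import numpy as np
from scipy.optimize import approx_fprime
from pyglmnet.distributions import Gaussian

rng = np.random.RandomState(42)
n_samples, n_features = 500, 10
X = rng.normal(size=(n_samples, n_features))
beta_true = rng.normal(scale=0.5, size=n_features)

distr = Gaussian()
y = distr.simulate(0.3, beta_true, X, random_state=42, sample=True)


def neg_log_likelihood(params):
    """Negative log likelihood as a function of [beta0, beta]."""
    z = distr._z(params[0], params[1:], X)
    return -distr.log_likelihood(y, distr.mu(z))


beta = np.zeros(n_features + 1)
# grad_log_likelihood returns the gradient of the *negative* log likelihood,
# which is the quantity _grad_L2loss builds the loss gradient from
grad0, grad = distr.grad_log_likelihood(X, y, beta[0], beta[1:])
numeric = approx_fprime(beta, neg_log_likelihood, 1.5e-8)
assert np.allclose(numeric, np.r_[grad0, grad], rtol=1e-4, atol=1e-3)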