Skip to content

Commit 0fcc7d3

Browse files
tartopohmmdhaber
andauthored
ENH: stats: improved MLE for the lognormal distribution (scipy#16839)
* ENH: stats: improved MLE for the lognormal distribution When the location is known, the MLEs for shape and scale have analytical expressions (two-parameter lognormal); this case was already handled. When the location is unknown, the three-variable numerical optimization can be cast to a one-variable optimization. Co-authored-by: Matt Haberland <[email protected]>
1 parent 69e0c47 commit 0fcc7d3

File tree

3 files changed

+120
-51
lines changed

3 files changed

+120
-51
lines changed

benchmarks/benchmarks/stats.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ def time_binned_statistic_dd_reuse_bin(self, statistic):
500500
class ContinuousFitAnalyticalMLEOverride(Benchmark):
501501
# list of distributions to time
502502
dists = ["pareto", "laplace", "rayleigh", "invgauss", "gumbel_r",
503-
"gumbel_l", "powerlaw"]
503+
"gumbel_l", "powerlaw", "lognorm"]
504504
# add custom values for rvs and fit, if desired, for any distribution:
505505
# key should match name in dists and value should be list of loc, scale,
506506
# and shapes
@@ -550,7 +550,9 @@ def setup(self, dist_name, case, loc_fixed, scale_fixed,
550550
self.fixed = dict(zip(compress(self.fnames, relevant_parameters),
551551
compress(fixed_vales, relevant_parameters)))
552552
self.param_values = param_values
553-
self.data = self.distn.rvs(*param_values, size=1000,
553+
# shapes need to come before loc and scale
554+
self.data = self.distn.rvs(*param_values[2:], *param_values[:2],
555+
size=1000,
554556
random_state=np.random.default_rng(4653465))
555557

556558
def time_fit(self, dist_name, case, loc_fixed, scale_fixed,

scipy/stats/_continuous_distns.py

+88-49
Original file line numberDiff line numberDiff line change
@@ -5870,68 +5870,107 @@ def _entropy(self, s):
58705870
this function uses explicit formulas for the maximum likelihood
58715871
estimation of the log-normal shape and scale parameters, so the
58725872
`optimizer`, `loc` and `scale` keyword arguments are ignored.
5873+
If the location is free, a likelihood maximum is found by
5874+
setting its partial derivative wrt to location to 0, and
5875+
solving by substituting the analytical expressions of shape
5876+
and scale (or provided parameters).
5877+
See, e.g., equation 3.1 in
5878+
A. Clifford Cohen & Betty Jones Whitten (1980)
5879+
Estimation in the Three-Parameter Lognormal Distribution,
5880+
Journal of the American Statistical Association, 75:370, 399-404
5881+
https://doi.org/10.2307/2287466
58735882
\n\n""")
58745883
def fit(self, data, *args, **kwds):
5875-
floc = kwds.get('floc', None)
5876-
5877-
if floc is None:
5878-
# loc is not fixed. Use the default fit method.
5884+
if kwds.pop('superfit', False):
58795885
return super().fit(data, *args, **kwds)
58805886

5881-
f0 = (kwds.get('f0', None) or kwds.get('fs', None) or
5882-
kwds.get('fix_s', None))
5883-
fscale = kwds.get('fscale', None)
5884-
5885-
if len(args) > 1:
5886-
raise TypeError("Too many input arguments.")
5887-
for name in ['f0', 'fs', 'fix_s', 'floc', 'fscale', 'loc', 'scale',
5888-
'optimizer', 'method']:
5889-
kwds.pop(name, None)
5890-
if kwds:
5891-
raise TypeError("Unknown arguments: %s." % kwds)
5887+
parameters = _check_fit_input_parameters(self, data, args, kwds)
5888+
data, fshape, floc, fscale = parameters
5889+
data_min = np.min(data)
5890+
5891+
def get_shape_scale(loc):
5892+
# Calculate maximum likelihood scale and shape with analytical
5893+
# formulas unless provided by the user
5894+
if fshape is None or fscale is None:
5895+
lndata = np.log(data - loc)
5896+
scale = fscale or np.exp(lndata.mean())
5897+
shape = fshape or np.sqrt(np.mean((lndata - np.log(scale))**2))
5898+
return shape, scale
5899+
5900+
def dL_dLoc(loc):
5901+
# Derivative of (positive) LL w.r.t. loc
5902+
shape, scale = get_shape_scale(loc)
5903+
shifted = data - loc
5904+
return np.sum((1 + np.log(shifted/scale)/shape**2)/shifted)
5905+
5906+
def ll(loc):
5907+
# (Positive) log-likelihood
5908+
shape, scale = get_shape_scale(loc)
5909+
return -self.nnlf((shape, loc, scale), data)
58925910

5893-
# Special case: loc is fixed. Use the maximum likelihood formulas
5894-
# instead of the numerical solver.
5911+
if floc is None:
5912+
# The location must be less than the minimum of the data.
5913+
# Back off a bit to avoid numerical issues.
5914+
spacing = np.spacing(data_min)
5915+
rbrack = data_min - spacing
5916+
5917+
# Find the right end of the bracket by successive doubling of the
5918+
# distance to data_min. We're interested in a maximum LL, so the
5919+
# slope dL_dLoc_rbrack should be negative at the right end.
5920+
# optimization for later: share shape, scale
5921+
dL_dLoc_rbrack = dL_dLoc(rbrack)
5922+
ll_rbrack = ll(rbrack)
5923+
delta = 2 * spacing # 2 * (data_min - rbrack)
5924+
while dL_dLoc_rbrack >= -1e-6:
5925+
rbrack = data_min - delta
5926+
dL_dLoc_rbrack = dL_dLoc(rbrack)
5927+
delta *= 2
58955928

5896-
if f0 is not None and fscale is not None:
5897-
# This check is for consistency with `rv_continuous.fit`.
5898-
raise ValueError("All parameters fixed. There is nothing to "
5899-
"optimize.")
5929+
if not np.isfinite(rbrack) or not np.isfinite(dL_dLoc_rbrack):
5930+
# If we never find a negative slope, either we missed it or the
5931+
# slope is always positive. It's usually the latter,
5932+
# which means
5933+
# loc = data_min - spacing
5934+
# But sometimes when shape and/or scale are fixed there are
5935+
# other issues, so be cautious.
5936+
return super().fit(data, *args, **kwds)
59005937

5901-
data = np.asarray(data)
5938+
# Now find the left end of the bracket. Guess is `rbrack-1`
5939+
# unless that is too small of a difference to resolve. Double
5940+
# the size of the interval until the left end is found.
5941+
lbrack = np.minimum(np.nextafter(rbrack, -np.inf), rbrack-1)
5942+
dL_dLoc_lbrack = dL_dLoc(lbrack)
5943+
delta = 2 * (rbrack - lbrack)
5944+
while (np.isfinite(lbrack) and np.isfinite(dL_dLoc_lbrack)
5945+
and np.sign(dL_dLoc_lbrack) == np.sign(dL_dLoc_rbrack)):
5946+
lbrack = rbrack - delta
5947+
dL_dLoc_lbrack = dL_dLoc(lbrack)
5948+
delta *= 2
59025949

5903-
if not np.isfinite(data).all():
5904-
raise ValueError("The data contains non-finite values.")
5950+
# I don't recall observing this, but just in case...
5951+
if not np.isfinite(lbrack) or not np.isfinite(dL_dLoc_lbrack):
5952+
return super().fit(data, *args, **kwds)
59055953

5906-
floc = float(floc)
5907-
if floc != 0:
5908-
# Shifting the data by floc. Don't do the subtraction in-place,
5909-
# because `data` might be a view of the input array.
5910-
data = data - floc
5911-
if np.any(data <= 0):
5912-
raise FitDataError("lognorm", lower=floc, upper=np.inf)
5913-
lndata = np.log(data)
5954+
# If we have a valid bracket, find the root
5955+
res = root_scalar(dL_dLoc, bracket=(lbrack, rbrack))
5956+
if not res.converged:
5957+
return super().fit(data, *args, **kwds)
59145958

5915-
# Three cases to handle:
5916-
# * shape and scale both free
5917-
# * shape fixed, scale free
5918-
# * shape free, scale fixed
5959+
# If the slope was positive near the minimum of the data,
5960+
# the maximum LL could be there instead of at the root. Compare
5961+
# the LL of the two points to decide.
5962+
ll_root = ll(res.root)
5963+
loc = res.root if ll_root > ll_rbrack else data_min-spacing
59195964

5920-
if fscale is None:
5921-
# scale is free.
5922-
scale = np.exp(lndata.mean())
5923-
if f0 is None:
5924-
# shape is free.
5925-
shape = lndata.std()
5926-
else:
5927-
# shape is fixed.
5928-
shape = float(f0)
59295965
else:
5930-
# scale is fixed, shape is free
5931-
scale = float(fscale)
5932-
shape = np.sqrt(((lndata - np.log(scale))**2).mean())
5966+
if floc >= data_min:
5967+
raise FitDataError("lognorm", lower=0., upper=np.inf)
5968+
loc = floc
59335969

5934-
return shape, floc, scale
5970+
shape, scale = get_shape_scale(loc)
5971+
if not (self._argcheck(shape) and scale > 0):
5972+
return super().fit(data, *args, **kwds)
5973+
return shape, loc, scale
59355974

59365975

59375976
lognorm = lognorm_gen(a=0.0, name='lognorm')

scipy/stats/tests/test_distributions.py

+28
Original file line numberDiff line numberDiff line change
@@ -3561,6 +3561,34 @@ def test_logcdf(self):
35613561
assert_allclose(stats.lognorm.logsf(x2-mu, s=sigma),
35623562
stats.norm.logsf(np.log(x2-mu)/sigma))
35633563

3564+
@pytest.fixture(scope='function')
3565+
def rng(self):
3566+
return np.random.default_rng(1234)
3567+
3568+
@pytest.mark.parametrize("rvs_shape", [.1, 2])
3569+
@pytest.mark.parametrize("rvs_loc", [-2, 0, 2])
3570+
@pytest.mark.parametrize("rvs_scale", [.2, 1, 5])
3571+
@pytest.mark.parametrize('fix_shape, fix_loc, fix_scale',
3572+
[e for e in product((False, True), repeat=3)
3573+
if False in e])
3574+
@np.errstate(invalid="ignore")
3575+
def test_fit_MLE_comp_optimzer(self, rvs_shape, rvs_loc, rvs_scale,
3576+
fix_shape, fix_loc, fix_scale, rng):
3577+
data = stats.lognorm.rvs(size=100, s=rvs_shape, scale=rvs_scale,
3578+
loc=rvs_loc, random_state=rng)
3579+
args = [data, (stats.lognorm._fitstart(data), )]
3580+
func = stats.lognorm._reduce_func(args, {})[1]
3581+
3582+
kwds = {}
3583+
if fix_shape:
3584+
kwds['f0'] = rvs_shape
3585+
if fix_loc:
3586+
kwds['floc'] = rvs_loc
3587+
if fix_scale:
3588+
kwds['fscale'] = rvs_scale
3589+
3590+
_assert_less_or_close_loglike(stats.lognorm, data, func, **kwds)
3591+
35643592

35653593
class TestBeta:
35663594
def test_logpdf(self):

0 commit comments

Comments
 (0)