|
8 | 8 | """
|
9 | 9 |
|
10 | 10 | import warnings
|
| 11 | +from typing import Tuple |
11 | 12 |
|
12 | 13 | import numpy as np
|
13 | 14 | import pandas as pd
|
14 | 15 | import scipy as sp
|
15 | 16 | from scipy.optimize import minimize
|
16 | 17 | from scipy.stats import chi2, pearsonr
|
17 | 18 | from sklearn.base import BaseEstimator, TransformerMixin
|
| 19 | +from sklearn.linear_model import LinearRegression |
18 | 20 | from sklearn.utils import check_array
|
19 | 21 | from sklearn.utils.extmath import randomized_svd
|
20 | 22 | from sklearn.utils.validation import check_is_fitted
|
21 | 23 |
|
22 | 24 | from .rotator import OBLIQUE_ROTATIONS, POSSIBLE_ROTATIONS, Rotator
|
23 |
| -from .utils import corr, impute_values, partial_correlations, smc |
| 25 | +from .utils import corr, covariance_to_correlation, impute_values, partial_correlations, smc |
24 | 26 |
|
25 | 27 | POSSIBLE_SVDS = ['randomized', 'lapack']
|
26 | 28 |
|
@@ -114,6 +116,70 @@ def calculate_bartlett_sphericity(x):
|
114 | 116 | return statistic, p_value
|
115 | 117 |
|
116 | 118 |
|
def _cng_window_slope(values: np.ndarray, start: int, size: int) -> float:
    """Return the least-squares slope of ``values[start:start + size]`` vs. its indices."""
    idx = np.arange(start, start + size)
    return LinearRegression().fit(idx[:, np.newaxis], values[idx]).coef_[0]


def calculate_cng_indices(
    data: np.ndarray, model: str = "components"
) -> Tuple[int, pd.DataFrame]:
    """Calculate the Cattell-Nelson-Gorsuch (CNG) indices, which are used to
    determine the appropriate number of factors for a factor analysis.

    Port of the nCng function from the nFactors package:
    https://rdrr.io/cran/nFactors/man/nCng.html

    For every position ``i`` in the descending eigenvalue sequence, a straight
    line is fitted to eigenvalues ``i .. i+2`` and another to eigenvalues
    ``i+3 .. i+5``. The CNG index at ``i`` is the second slope minus the first,
    and the suggested number of factors is where that difference is largest.

    Parameters
    ----------
    data : array-like
        The array of samples x observable for which to calculate CNG indices.
        Both ``numpy`` arrays and ``pandas`` DataFrames are accepted.
    model : str
        "components" or "factors". Any value other than "factors" is treated
        as "components".

    Returns
    -------
    num_factors : int
        The number of components/factors to retain
    details : pd.DataFrame
        The eigenvalues (column ``"data"``) and CNG indices (column ``"cng"``)
        of the dataset, one row per fitted window position.

    Raises
    ------
    ValueError
        If the dataset has fewer than 6 variables; two non-overlapping
        three-point fits need at least six eigenvalues.
    """
    # The annotation promises an ndarray, so do not assume a ``.values``
    # attribute; unwrap DataFrames and pass everything else through as-is.
    raw = data.values if isinstance(data, pd.DataFrame) else np.asarray(data)
    matrix = corr(raw)

    # Validate before doing any decomposition work.
    num_variables = len(matrix)
    if num_variables < 6:
        raise ValueError("The number of variables must be at least 6")

    if model == "factors":
        # Subtract the inverted diagonal of the pseudo-inverse from the matrix
        # (nFactors ``corFA(..., method="ginv")``). Not done in place, so the
        # array returned by ``corr`` is never mutated.
        matrix = matrix - np.linalg.pinv(np.diag(np.diag(np.linalg.pinv(matrix))))
        # TODO: Should this line be here?
        matrix = covariance_to_correlation(matrix)

    # The matrix is symmetric, so use the symmetric solver: it always returns
    # real eigenvalues in ascending order. Reverse to get descending order.
    values = np.linalg.eigvalsh(matrix)[::-1]

    fit_size = 3
    # The last window pair must end on the final eigenvalue, which gives
    # ``n - 2 * fit_size + 1`` start positions (n - 5, as in nFactors).
    num_windows = num_variables - 2 * fit_size + 1
    cng = np.array(
        [
            _cng_window_slope(values, start + fit_size, fit_size)
            - _cng_window_slope(values, start, fit_size)
            for start in range(num_windows)
        ]
    )

    # ``start`` is 0-based, so adding ``fit_size`` mirrors nFactors'
    # 1-based ``which(max) + 2``.
    num_factors = int(np.nanargmax(cng)) + fit_size

    details = pd.DataFrame(
        {
            "data": values[: len(cng)],
            "cng": cng,
        }
    )

    return num_factors, details
| 181 | + |
| 182 | + |
117 | 183 | class FactorAnalyzer(BaseEstimator, TransformerMixin):
|
118 | 184 | """
|
119 | 185 | A FactorAnalyzer class, which -
|
|
0 commit comments