From 9143d0e157877cb3577a570ed51111879256a1c6 Mon Sep 17 00:00:00 2001 From: Sami Jawhar Date: Mon, 14 Mar 2022 20:35:44 -0700 Subject: [PATCH 1/2] Add calculate_cng_indices --- factor_analyzer/factor_analyzer.py | 68 +++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/factor_analyzer/factor_analyzer.py b/factor_analyzer/factor_analyzer.py index dcf6088..d20cfc0 100644 --- a/factor_analyzer/factor_analyzer.py +++ b/factor_analyzer/factor_analyzer.py @@ -8,6 +8,7 @@ """ import warnings +from typing import Tuple import numpy as np import pandas as pd @@ -15,12 +16,13 @@ from scipy.optimize import minimize from scipy.stats import chi2, pearsonr from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.linear_models import LinearRegression from sklearn.utils import check_array from sklearn.utils.extmath import randomized_svd from sklearn.utils.validation import check_is_fitted from .rotator import OBLIQUE_ROTATIONS, POSSIBLE_ROTATIONS, Rotator -from .utils import corr, impute_values, partial_correlations, smc +from .utils import corr, covariance_to_correlation, impute_values, partial_correlations, smc POSSIBLE_SVDS = ['randomized', 'lapack'] @@ -114,6 +116,70 @@ def calculate_bartlett_sphericity(x): return statistic, p_value +def calculate_cng_indices( + data: np.ndarray, model: str = "components" +) -> Tuple[int, pd.DataFrame]: + """Calculate the Cattel-Nelson-Gorsuch indices, which are used to determine + the appropriate number of factors for a factor analysis. + + Direct port of nCng function from nFactors package: + https://rdrr.io/cran/nFactors/man/nCng.html + + Parameters + ---------- + data : array-like + The array of samples x observable for which to calculate CNG indices + model : str + "components" or "factors" + + Returns + ------- + num_factors : int + The number of components/factors to retain + details : pd.DataFrame + The eigenvalues and CNG indices of the dataset + """ + data = corr(data) + if model == "factors": + data -= np.linalg.pinv(np.diag(np.diag(np.linalg.pinv(data)))) + # TODO: Should this line be here? + data = covariance_to_correlation(data) + + values = np.sort(np.linalg.eigvals(data))[::-1] + + num_variables = len(data) + if num_variables < 6: + raise ValueError("The number of variables must be at least 6") + + fit_size = 3 + cng = np.diff( + [ + [ + LinearRegression() + .fit(idx_values[:, np.newaxis], values[idx_values]) + .coef_ + for idx_values in [ + np.arange(idx_fit, idx_fit + fit_size), + np.arange(idx_fit + fit_size, idx_fit + 2 * fit_size), + ] + ] + for idx_fit in range(num_variables - 2 * fit_size) + ], + axis=1, + ).squeeze(axis=(1, 2)) + + num_factors = np.nanargmax(cng) + fit_size + + details = pd.DataFrame( + { + "data": values[: len(cng)], + "cng": cng, + } + ) + + return num_factors, details + + class FactorAnalyzer(BaseEstimator, TransformerMixin): """ A FactorAnalyzer class, which - From c1e57d2f8a2e41e458e3977cccc53f58e76cf27c Mon Sep 17 00:00:00 2001 From: Sami Jawhar Date: Fri, 18 Mar 2022 20:40:51 -0700 Subject: [PATCH 2/2] Address PR comments --- factor_analyzer/factor_analyzer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/factor_analyzer/factor_analyzer.py b/factor_analyzer/factor_analyzer.py index d20cfc0..be128cb 100644 --- a/factor_analyzer/factor_analyzer.py +++ b/factor_analyzer/factor_analyzer.py @@ -142,8 +142,8 @@ def calculate_cng_indices( data = corr(data) if model == "factors": data -= np.linalg.pinv(np.diag(np.diag(np.linalg.pinv(data)))) - # TODO: Should this line be here? - data = covariance_to_correlation(data) + elif model != "components": + raise ValueError(f"model must be one of 'factors' or 'components'. Received '{model}'") values = np.sort(np.linalg.eigvals(data))[::-1]