PlnPCA

Bases: _model

PlnPCA object where the covariance has low rank.

Examples

>>> from pyPLNmodels import PlnPCA, get_real_count_data, get_simulation_parameters, sample_pln
>>> endog, labels = get_real_count_data(return_labels = True)
>>> data = {"endog": endog}
>>> pca = PlnPCA.from_formula("endog ~ 1", data = data, rank = 5)
>>> pca.fit()
>>> print(pca)
>>> pca.viz(colors = labels)

>>> plnparam = get_simulation_parameters(n_samples =100, dim = 60, nb_cov = 2, rank = 8)
>>> endog = sample_pln(plnparam)
>>> data = {"endog": endog, "cov": plnparam.exog, "offsets": plnparam.offsets}
>>> plnpca = PlnPCA.from_formula("endog ~ 0 + cov", data = data, rank = 5)
>>> plnpca.fit()
>>> print(plnpca)

See also

pyPLNmodels.PlnPCA.from_formula()

Examples

>>> from pyPLNmodels import PlnPCA, get_real_count_data
>>> endog= get_real_count_data()
>>> pca = PlnPCA(endog, add_const = True)
>>> pca.fit()
>>> print(pca)

property batch_size: int: The batch size of the model. Should not be greater than the number of samples.

property coef

Property representing the coefficients.

Returns:: The coefficients or None.
Return type:: torch.Tensor or None

property components: Tensor

Property representing the components.

Returns:: The components.
Return type:: torch.Tensor

compute_elbo() → Tensor

Compute the Evidence Lower BOund (ELBO) that will be maximized by pytorch.

Returns:: The computed ELBO.
Return type:: torch.Tensor

Examples

>>> from pyPLNmodels import PlnPCA, get_real_count_data
>>> endog = get_real_count_data(return_labels = False)
>>> pca = PlnPCA(endog,add_const = True)
>>> pca.fit()
>>> elbo = pca.compute_elbo()
>>> print("elbo", elbo)
>>> print("loglike/n", pca.loglike/pca.n_samples)

property covariance: Tensor

Property representing the covariance of the latent variables.

Returns:: The covariance tensor or None if components are not present.
Return type:: Optional[torch.Tensor]

property covariance_a_posteriori: Tensor | None

Property representing the covariance a posteriori of the latent variables.

Returns:: The covariance tensor or None if components are not present.
Return type:: Optional[torch.Tensor]

property dict_data

Property representing the data dictionary.

Returns:: The dictionary of data.
Return type:: dict

property dim: int

The second dimension of the endog.

Returns:: The second dimension of the endog.
Return type:: int

display_covariance(ax=None, savefig=False, name_file='')

Display the covariance matrix.

Parameters:

ax (matplotlib.axes.Axes, optional) – The axes to plot on. If None, a new figure will be created. Defaults to None.
savefig (bool, optional) – Whether to save the figure. Defaults to False.
name_file (str, optional) – The name of the file to save. Defaults to “”.

property endog

Property representing the endog.

Returns:: The endog or None.
Return type:: torch.Tensor or None

property exog: Tensor

Property representing the exog.

Returns:: The exog tensor.
Return type:: torch.Tensor

fit(nb_max_iteration: int = 50000, *, lr: float = 0.01, tol: float = 0.001, do_smart_init: bool = True, verbose: bool = False, batch_size=None)

Fit the model. The lower tol, the more accurate the model.

Parameters:

nb_max_iteration (int, optional) – The maximum number of iterations. Defaults to 50000.
lr (float, optional(keyword-only)) – The learning rate. Defaults to 0.01.
tol (float, optional(keyword-only)) – The tolerance for convergence. Defaults to 1e-3.
do_smart_init (bool, optional(keyword-only)) – Whether to perform smart initialization. Defaults to True.
verbose (bool, optional(keyword-only)) – Whether to print training progress. Defaults to False.
batch_size (int, optional(keyword-only)) – The batch size when optimizing the elbo. If None, batch gradient descent will be performed (i.e. batch_size = n_samples).

Raises:

ValueError – If the batch_size is greater than the number of samples, or is not an int.

Examples

>>> from pyPLNmodels import PlnPCA, get_real_count_data
>>> endog = get_real_count_data()
>>> plnpca = PlnPCA(endog,add_const = True, rank = 6)
>>> plnpca.fit()
>>> print(plnpca)

property fitted: bool

Whether the model is fitted.

Returns:: True if the model is fitted, False otherwise.
Return type:: bool

classmethod from_formula(formula: str, data: Dict[str, Tensor | ndarray | DataFrame], *, rank: int = 5, offsets_formula: str = 'logsum', dict_initialization: Dict[str, Tensor] | None = None)

Create a model instance from a formula and data.

Parameters:

formula (str) – The formula.
data (dict) – The data dictionary. Each value can be either a torch.Tensor, a np.ndarray or pd.DataFrame
offsets_formula (str, optional(keyword-only)) – The formula for offsets. Defaults to “logsum”.
dict_initialization (dict, optional(keyword-only)) – The initialization dictionary. Defaults to None.
take_log_offsets (bool, optional(keyword-only)) – Whether to take the log of offsets. Defaults to False.
rank (int, optional(keyword-only)) – The rank of the approximation, by default 5.

Return type:

PlnPCA

Examples

>>> from pyPLNmodels import PlnPCA, get_real_count_data
>>> endog = get_real_count_data()
>>> data = {"endog": endog}
>>> pca = PlnPCA.from_formula("endog ~ 1", data = data, rank = 5)

property latent_mean: Tensor

Property representing the latent mean.

Returns:: The latent mean or None if it has not yet been initialized.
Return type:: torch.Tensor or None

Examples

>>> from pyPLNmodels import PlnPCA, get_real_count_data
>>> endog = get_real_count_data()
>>> data = {"endog": endog}
>>> plnpca = PlnPCA.from_formula("endog ~ 1", data = data)
>>> plnpca.fit()
>>> print(plnpca.latent_mean.shape)

property latent_parameters

Property representing the latent parameters.

Returns:: The dictionary of latent parameters.
Return type:: dict

property latent_sqrt_var

Property representing the square root of the latent variance.

Returns:: The square root of the latent variance or None.
Return type:: torch.Tensor or None

property latent_variables: Tensor

Property representing the latent variables.

Returns:: The latent variables of size (n_samples, dim).
Return type:: torch.Tensor

Examples

>>> from pyPLNmodels import PlnPCA, get_real_count_data
>>> endog = get_real_count_data(return_labels=False)
>>> pca = PlnPCA(endog,add_const = True)
>>> pca.fit()
>>> print(pca.latent_variables.shape)

property latent_variance: Tensor

Property representing the latent variance.

Returns:: The latent variance tensor.
Return type:: torch.Tensor

property loglike

Property representing the log-likelihood.

Returns:: The log-likelihood.
Return type:: float

property model_parameters: Dict[str, Tensor]

Property representing the model parameters.

Returns:: The dictionary of model parameters.
Return type:: dict

property n_samples: int

The number of samples, i.e. the first dimension of the endog.

Returns:: The number of samples.
Return type:: int

property nb_batches

property nb_cov: int

The number of exog.

Returns:: The number of exog.
Return type:: int

property nb_iteration_done: int

The number of iterations done.

Returns:: The number of iterations done.
Return type:: int

property number_of_parameters: int

Property representing the number of parameters.

Returns:: The number of parameters.
Return type:: int

property offsets

Property representing the offsets.

Returns:: The offsets or None.
Return type:: torch.Tensor or None

property optim_parameters

Property representing the optimization parameters.

Returns:: The dictionary of optimization parameters.
Return type:: dict

property ortho_components: Orthogonal components of the model.

pca_projected_latent_variables(n_components: int | None = None)

Perform PCA on the latent variables and project them onto a lower-dimensional space.

Parameters:: n_components (int, optional) – The number of components to keep. If None, all components are kept. Defaults to None.
Returns:: The projected latent variables.
Return type:: numpy.ndarray
Raises:: ValueError – If the number of components asked is greater than the number of dimensions.

Examples

>>> from pyPLNmodels import PlnPCA, get_real_count_data
>>> endog = get_real_count_data()
>>> data = {"endog": endog}
>>> plnpca = PlnPCA.from_formula("endog ~ 1", data = data)
>>> plnpca.fit()
>>> pca_proj = plnpca.pca_projected_latent_variables()
>>> print(pca_proj.shape)

plot_expected_vs_true(ax=None, colors=None)

Plot the predicted value of the endog against the endog.

Parameters:

ax (Optional[matplotlib.axes.Axes], optional) – The matplotlib axis to use. If None, the current axis is used, by default None.
colors (Optional[Any], optional) – The colors to use for plotting, by default None.

Returns:

matplotlib.axes.Axes – The matplotlib axis.

Examples

>>> import matplotlib.pyplot as plt
>>> from pyPLNmodels import PlnPCA, get_real_count_data
>>> endog, labels = get_real_count_data(return_labels = True)
>>> plnpca = PlnPCA(endog,add_const = True)
>>> plnpca.fit()
>>> plnpca.plot_expected_vs_true()
>>> plt.show()
>>> plnpca.plot_expected_vs_true(colors = labels)
>>> plt.show()

plot_pca_correlation_graph(variables_names: List[str], indices_of_variables=None)

Visualizes variables using PCA and plots a correlation graph.

Parameters:

variables_names (List[str]) – A list of variable names to visualize.
indices_of_variables (Optional[List[int]], optional) – A list of indices corresponding to the variables. If None, indices are determined based on column_endog, by default None

Raises:

ValueError – If indices_of_variables is None and column_endog is not set.
ValueError – If the length of indices_of_variables is different from the length of variables_names.

Return type:

None

Examples

>>> from pyPLNmodels import PlnPCA, get_real_count_data
>>> endog = get_real_count_data()
>>> data = {"endog": endog}
>>> plnpca = PlnPCA.from_formula("endog ~ 1", data = data)
>>> plnpca.fit()
>>> plnpca.plot_pca_correlation_graph(["a","b"], indices_of_variables = [4,8])

predict(exog: Tensor | ndarray | DataFrame | None = None)

Method for making predictions.

Parameters:

exog (Union[torch.Tensor, np.ndarray, pd.DataFrame], optional) – The exog, by default None.

Returns:

The predicted values or None.

Return type:

torch.Tensor or None

Raises:

AttributeError – If there are no exog in the model but some are provided.
RuntimeError – If the shape of the exog is incorrect.

Notes

If exog is not provided and there are no exog in the model, None is returned.
If there are exog in the model, then the mean exog @ coef is returned.
If exog is provided, it should have the shape (_, nb_cov), where nb_cov is the number of exog.
The predicted values are obtained by multiplying the exog by the coefficients.

property projected_latent_variables: Tensor

Property representing the projected latent variables.

Returns:: The projected latent variables.
Return type:: torch.Tensor

qq_plots()

property rank: int

Property representing the rank.

Returns:: The rank.
Return type:: int

save(path: str | None = None)

Save the model parameters to disk.

Parameters:: path (str, optional) – The path of the directory to save the parameters, by default “./”.

scatter_pca_matrix(n_components=None, color=None)

Generates a scatter matrix plot based on Principal Component Analysis (PCA).

Parameters:

n_components (int, optional) – The number of components to consider for plotting. If not specified, the maximum number of components will be used. Defaults to None.
color (str, np.ndarray) – An array with one label for each sample in the endog property of the object. Defaults to None.

Raises:

ValueError – If the number of components requested is greater than the number of variables in the dataset.

Examples

>>> from pyPLNmodels import PlnPCA, get_real_count_data
>>> endog = get_real_count_data()
>>> data = {"endog": endog}
>>> plnpca = PlnPCA.from_formula("endog ~ 1", data = data)
>>> plnpca.fit()
>>> plnpca.scatter_pca_matrix(n_components = 5)

show(axes=None)

Show 3 plots. The first one is the covariance of the model. The second one is the stopping criterion with the runtime on the x-axis. The third one is the elbo.

Parameters:: axes (numpy.ndarray, optional) – The axes to plot on. If None, a new figure will be created. Defaults to None.

sigma()

Method returning the covariance matrix.

Returns:: The covariance matrix or None.
Return type:: torch.Tensor or None

sk_PCA(n_components=None)

Perform the scikit-learn PCA on the latent variables.

Parameters:: n_components (int, optional) – The number of components to keep. If None, all components are kept. Defaults to None.
Returns:: sklearn.decomposition.PCA object with all the features from sklearn.
Return type:: sklearn.decomposition.PCA
Raises:: ValueError – If the number of components asked is greater than the number of dimensions.

transform(project: bool = False) → Tensor

Method for transforming the endog. Can be seen as a normalization of the endog.

Parameters:: project (bool, optional) – Whether to project the latent variables, by default False.

Returns:: The transformed endog (latent variables of the model).
Return type:: torch.Tensor

Examples

>>> from pyPLNmodels import PlnPCA, get_real_count_data
>>> endog= get_real_count_data()
>>> pca = PlnPCA(endog, add_const = True)
>>> pca.fit()
>>> transformed_endog_low_dim = pca.transform(project = True)
>>> transformed_endog_high_dim = pca.transform(project = False)
>>> print(transformed_endog_low_dim.shape)
>>> print(transformed_endog_high_dim.shape)

viz(ax: Axes | None = None, colors=None, show_cov: bool = False)

Visualize the latent variables with a classic PCA.

Parameters:

ax (Optional[matplotlib.axes.Axes], optional(keyword-only)) – The matplotlib axis to use. If None, the current axis is used, by default None.
colors (Optional[np.ndarray], optional(keyword-only)) – The colors to use for plotting, by default None.
show_cov (bool, Optional(keyword-only)) – If True, will display ellipses with right covariances. Default is False.

Raises:

RuntimeError – If the rank is less than 2.

Returns:

The matplotlib axis.

Return type:

Any

Examples

>>> import matplotlib.pyplot as plt
>>> from pyPLNmodels import PlnPCA, get_real_count_data
>>> endog, labels = get_real_count_data(return_labels = True)
>>> plnpca = PlnPCA(endog,add_const = True)
>>> plnpca.fit()
>>> plnpca.viz()
>>> plt.show()
>>> plnpca.viz(colors = labels)
>>> plt.show()
>>> plnpca.viz(show_cov = True)
>>> plt.show()