# Source code for backbone_learn.heuristic_solvers.lasso_regression

# Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas
# Licensed under the MIT License.

import numpy as np
from sklearn.linear_model import LassoCV

from .heauristic_solver_base import HeuristicSolverBase


class LassoRegression(HeuristicSolverBase):
    """
    Implements Lasso regression for feature selection using cross-validation.

    This class uses Lasso (Least Absolute Shrinkage and Selection Operator)
    regression, which is a type of linear regression that uses shrinkage.
    Shrinkage is where data values are shrunk towards a central point, like
    the mean. The lasso procedure encourages simple, sparse models (i.e.
    models with fewer parameters).

    Attributes:
        _model (LassoCV): The LassoCV regression model.
        _mse_score (float): Cross-validated mean squared error at the alpha
            selected by the last call to ``fit``; ``None`` before fitting.
    """

    def __init__(self, **kwargs):
        """
        Initializes the solver with a default-configured LassoCV model.

        Args:
            **kwargs: Accepted but not used by this class. Cross-validation
                and optimisation settings are passed to ``fit`` instead.
        """
        self._model = LassoCV()
        self._mse_score = None

    @property
    def mse_score(self) -> float:
        """
        Returns the mean squared error score of the trained model.

        Returns:
            float: The cross-validated mean squared error recorded by the
            last call to ``fit``, or ``None`` if ``fit`` has not been called.
        """
        return self._mse_score

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        alphas=None,
        max_iter=1000,
        tol=0.0001,
        selection="cyclic",
        cv_folds=5,
        random_state=0,
    ) -> None:
        """
        Fits a sparse regression model to the data using LassoCV.

        Args:
            X (np.ndarray): The input feature matrix.
            y (np.ndarray): The target variable.
            alphas (array-like, optional): List of alphas where to compute the
                models. If None alphas are set automatically.
            max_iter (int): The maximum number of iterations.
            tol (float): The tolerance for the optimization.
            selection (str): If set to 'random', a random coefficient is
                updated every iteration.
            cv_folds (int): Value forwarded to LassoCV's ``cv`` parameter
                (the number of cross-validation folds).
            random_state (int): Value forwarded to LassoCV's ``random_state``
                parameter.
        """
        self._model.set_params(
            cv=cv_folds,
            random_state=random_state,
            alphas=alphas,
            max_iter=max_iter,
            tol=tol,
            selection=selection,
        )
        self._model.fit(X, y)  # Fit the _model on the dataset

        # mse_path_ stores the held-out MSE per candidate alpha and per fold.
        # Averaging over the fold axis (last) and taking the minimum gives the
        # cross-validated MSE at the alpha LassoCV selected. Using axis=-1
        # also copes with the array being squeezed to 1-D for a single alpha.
        fold_mse = np.asarray(self._model.mse_path_)
        self._mse_score = float(np.min(np.mean(fold_mse, axis=-1)))

    def get_relevant_variables(self, threshold: float) -> np.ndarray:
        """
        Identifies features whose coefficient magnitude exceeds a threshold.

        Args:
            threshold (float): The threshold for determining feature
                relevance. It is compared against the absolute value of each
                coefficient, using a strict ``>``.

        Returns:
            np.ndarray: Indices of features whose absolute coefficients are
            above the threshold.
        """
        significant_indices = np.where(np.abs(self._model.coef_) > threshold)[0]
        return significant_indices

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predicts the target values for the given data using the trained Lasso model.

        Args:
            X (np.ndarray): The input feature matrix.

        Returns:
            np.ndarray: The predicted target values.
        """
        return self._model.predict(X)

    def keep_top_features(self, n_non_zeros: int) -> None:
        """
        Retain only the top 'n_non_zeros' features in the Lasso model.

        Coefficients are ranked by absolute value and every coefficient
        strictly smaller than the n-th largest magnitude is set to zero in
        place. Coefficients tied with the n-th largest magnitude are kept.

        Args:
            n_non_zeros (int): Number of features to retain. ``0`` zeroes
                every coefficient; a value greater than or equal to the
                number of coefficients leaves the model unchanged.

        Raises:
            ValueError: If ``n_non_zeros`` is negative.
        """
        if n_non_zeros < 0:
            raise ValueError("n_non_zeros must be non-negative")

        coefficients = self._model.coef_
        # Get the absolute values of the coefficients
        coef_magnitude = np.abs(coefficients)
        n_coefficients = len(coef_magnitude)

        if n_non_zeros >= n_coefficients:
            # Nothing to prune: every coefficient is within the top n.
            return

        if n_non_zeros == 0:
            # Handled explicitly because sorted[-0] is sorted[0] (the smallest
            # magnitude), which would leave every coefficient untouched.
            coefficients[...] = 0
            return

        # The n-th largest magnitude sits at this index in ascending order.
        kth = n_coefficients - n_non_zeros
        threshold = np.partition(coef_magnitude, kth)[kth]

        # Zero out coefficients that are below the threshold
        coefficients[coef_magnitude < threshold] = 0