""" ====================================================== Effect of transforming the targets in regression model ====================================================== In this example, we give an overview of :class:`~sklearn.compose.TransformedTargetRegressor`. We use two examples to illustrate the benefit of transforming the targets before learning a linear regression model. The first example uses synthetic data while the second example is based on the Ames housing data set. """ # Author: Guillaume Lemaitre # License: BSD 3 clause print(__doc__) # %% # Synthetic example ################### # # A synthetic random regression dataset is generated. The targets ``y`` are # modified by: # # 1. translating all targets such that all entries are # non-negative (by adding the absolute value of the lowest ``y``) and # 2. applying an exponential function to obtain non-linear # targets which cannot be fitted using a simple linear model. # # Therefore, a logarithmic (`np.log1p`) and an exponential function # (`np.expm1`) will be used to transform the targets before training a linear # regression model and using it for prediction. import numpy as np from sklearn.datasets import make_regression X, y = make_regression(n_samples=10_000, noise=100, random_state=0) y = np.expm1((y + abs(y.min())) / 200) y_trans = np.log1p(y) # %% # Below we plot the probability density functions of the target # before and after applying the logarithmic functions. import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split f, (ax0, ax1) = plt.subplots(1, 2) ax0.hist(y, bins=100, density=True) ax0.set_xlim([0, 2000]) ax0.set_ylabel("Probability") ax0.set_xlabel("Target") ax0.set_title("Target distribution") ax1.hist(y_trans, bins=100, density=True) ax1.set_ylabel("Probability") ax1.set_xlabel("Target") ax1.set_title("Transformed target distribution") f.suptitle("Synthetic data", y=1.05) plt.tight_layout() X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) # %% # At first, a linear model will be applied on the original targets. Due to the # non-linearity, the model trained will not be precise during # prediction. Subsequently, a logarithmic function is used to linearize the # targets, allowing better prediction even with a similar linear model as # reported by the median absolute error (MedAE). 
from sklearn.metrics import median_absolute_error, r2_score


def compute_score(y_true, y_pred):
    return {
        "R2": f"{r2_score(y_true, y_pred):.3f}",
        "MedAE": f"{median_absolute_error(y_true, y_pred):.3f}",
    }


# %%
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import PredictionErrorDisplay

f, (ax0, ax1) = plt.subplots(1, 2, sharey=True)

ridge_cv = RidgeCV().fit(X_train, y_train)
y_pred_ridge = ridge_cv.predict(X_test)

ridge_cv_with_trans_target = TransformedTargetRegressor(
    regressor=RidgeCV(), func=np.log1p, inverse_func=np.expm1
).fit(X_train, y_train)
y_pred_ridge_with_trans_target = ridge_cv_with_trans_target.predict(X_test)

PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred_ridge,
    kind="actual_vs_predicted",
    ax=ax0,
    scatter_kwargs={"alpha": 0.5},
)
PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred_ridge_with_trans_target,
    kind="actual_vs_predicted",
    ax=ax1,
    scatter_kwargs={"alpha": 0.5},
)

# Add the score in the legend of each axis
for ax, y_pred in zip([ax0, ax1], [y_pred_ridge, y_pred_ridge_with_trans_target]):
    for name, score in compute_score(y_test, y_pred).items():
        ax.plot([], [], " ", label=f"{name}={score}")
    ax.legend(loc="upper left")

ax0.set_title("Ridge regression \n without target transformation")
ax1.set_title("Ridge regression \n with target transformation")
f.suptitle("Synthetic data", y=1.05)
plt.tight_layout()

# %%
# Real-world data set
# ###################
#
# In a similar manner, the Ames housing data set is used to show the impact
# of transforming the targets before learning a model. In this example, the
# target to be predicted is the selling price of each house.
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import quantile_transform

ames = fetch_openml(name="house_prices", as_frame=True)
# Keep only numeric columns
X = ames.data.select_dtypes(np.number)
# Remove columns with NaN or Inf values
X = X.drop(columns=["LotFrontage", "GarageYrBlt", "MasVnrArea"])
# Let the price be in k$
y = ames.target / 1000
y_trans = quantile_transform(
    y.to_frame(), n_quantiles=900, output_distribution="normal", copy=True
).squeeze()

# %%
# A :class:`~sklearn.preprocessing.QuantileTransformer` is used to normalize
# the target distribution before applying a
# :class:`~sklearn.linear_model.RidgeCV` model.
f, (ax0, ax1) = plt.subplots(1, 2)

ax0.hist(y, bins=100, density=True)
ax0.set_ylabel("Probability")
ax0.set_xlabel("Target")
ax0.set_title("Target distribution")

ax1.hist(y_trans, bins=100, density=True)
ax1.set_ylabel("Probability")
ax1.set_xlabel("Target")
ax1.set_title("Transformed target distribution")

f.suptitle("Ames housing data: selling price", y=1.05)
plt.tight_layout()

# %%
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# %%
# The effect of the transformer is weaker than on the synthetic data.
# However, the transformation results in an increase in :math:`R^2` and a
# large decrease in the MedAE. The residual plot (predicted target - true
# target vs. predicted target) without target transformation takes on a
# curved, 'reverse smile' shape due to residual values that vary depending
# on the value of the predicted target. With target transformation, the
# shape is more linear, indicating a better model fit.
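# %%
# To build intuition for what :class:`~sklearn.compose.TransformedTargetRegressor`
# does internally, the estimator used below is roughly equivalent to the
# following manual steps (an illustrative sketch, not part of the original
# example; the variable names ``qt``, ``manual_model`` and ``y_pred_manual``
# are arbitrary): fit the transformer on the training targets, train the
# regressor on the transformed targets, then inverse-transform the
# predictions.
from sklearn.preprocessing import QuantileTransformer

qt = QuantileTransformer(n_quantiles=900, output_distribution="normal")
y_train_trans = qt.fit_transform(y_train.to_frame()).squeeze()
manual_model = RidgeCV().fit(X_train, y_train_trans)
y_pred_manual = qt.inverse_transform(
    manual_model.predict(X_test).reshape(-1, 1)
).squeeze()
print(compute_score(y_test, y_pred_manual))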
from sklearn.preprocessing import QuantileTransformer

f, (ax0, ax1) = plt.subplots(2, 2, sharey="row", figsize=(6.5, 8))

ridge_cv = RidgeCV().fit(X_train, y_train)
y_pred_ridge = ridge_cv.predict(X_test)

ridge_cv_with_trans_target = TransformedTargetRegressor(
    regressor=RidgeCV(),
    transformer=QuantileTransformer(n_quantiles=900, output_distribution="normal"),
).fit(X_train, y_train)
y_pred_ridge_with_trans_target = ridge_cv_with_trans_target.predict(X_test)

# plot the actual vs predicted values
PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred_ridge,
    kind="actual_vs_predicted",
    ax=ax0[0],
    scatter_kwargs={"alpha": 0.5},
)
PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred_ridge_with_trans_target,
    kind="actual_vs_predicted",
    ax=ax0[1],
    scatter_kwargs={"alpha": 0.5},
)

# Add the score in the legend of each axis
for ax, y_pred in zip(
    [ax0[0], ax0[1]], [y_pred_ridge, y_pred_ridge_with_trans_target]
):
    for name, score in compute_score(y_test, y_pred).items():
        ax.plot([], [], " ", label=f"{name}={score}")
    ax.legend(loc="upper left")

ax0[0].set_title("Ridge regression \n without target transformation")
ax0[1].set_title("Ridge regression \n with target transformation")

# plot the residuals vs the predicted values
PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred_ridge,
    kind="residual_vs_predicted",
    ax=ax1[0],
    scatter_kwargs={"alpha": 0.5},
)
PredictionErrorDisplay.from_predictions(
    y_test,
    y_pred_ridge_with_trans_target,
    kind="residual_vs_predicted",
    ax=ax1[1],
    scatter_kwargs={"alpha": 0.5},
)
ax1[0].set_title("Ridge regression \n without target transformation")
ax1[1].set_title("Ridge regression \n with target transformation")

f.suptitle("Ames housing data: selling price", y=1.05)
plt.tight_layout()
plt.show()
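# %%
# As a final note (an illustrative addition to the original example): after
# fitting, the wrapped estimator and the fitted transformer are exposed as
# the ``regressor_`` and ``transformer_`` attributes, e.g. to inspect the
# regularization strength selected by :class:`~sklearn.linear_model.RidgeCV`.
print(f"Selected alpha: {ridge_cv_with_trans_target.regressor_.alpha_:.2f}")
print(ridge_cv_with_trans_target.transformer_)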