sklearn/examples/model_selection/plot_cv_predict.py

"""
====================================
Plotting Cross-Validated Predictions
====================================

This example shows how to use
:func:`~sklearn.model_selection.cross_val_predict` together with
:class:`~sklearn.metrics.PredictionErrorDisplay` to visualize prediction
errors.
"""

# %%
# We will load the diabetes dataset and create an instance of a linear
# regression model.
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

X, y = load_diabetes(return_X_y=True)
lr = LinearRegression()

# %%
# :func:`~sklearn.model_selection.cross_val_predict` returns an array of the
# same size of `y` where each entry is a prediction obtained by cross
# validation.
from sklearn.model_selection import cross_val_predict

y_pred = cross_val_predict(lr, X, y, cv=10)

# %%
# Since `cv=10`, it means that we trained 10 models and each model was
# used to predict on one of the 10 folds. We can now use the
# :class:`~sklearn.metrics.PredictionErrorDisplay` to visualize the
# prediction errors.
#
# On the left axis, we plot the observed values :math:`y` vs. the predicted
# values :math:`\hat{y}` given by the models. On the right axis, we plot the
# residuals (i.e. the difference between the observed values and the predicted
# values) vs. the predicted values.
import matplotlib.pyplot as plt

from sklearn.metrics import PredictionErrorDisplay

fig, axs = plt.subplots(ncols=2, figsize=(8, 4))
PredictionErrorDisplay.from_predictions(
    y,
    y_pred=y_pred,
    kind="actual_vs_predicted",
    subsample=100,
    ax=axs[0],
    random_state=0,
)
axs[0].set_title("Actual vs. Predicted values")
PredictionErrorDisplay.from_predictions(
    y,
    y_pred=y_pred,
    kind="residual_vs_predicted",
    subsample=100,
    ax=axs[1],
    random_state=0,
)
axs[1].set_title("Residuals vs. Predicted Values")
fig.suptitle("Plotting cross-validated predictions")
plt.tight_layout()
plt.show()

# %%
# It is important to note that we used
# :func:`~sklearn.model_selection.cross_val_predict` for visualization
# purpose only in this example.
#
# It would be problematic to
# quantitatively assess the model performance by computing a single
# performance metric from the concatenated predictions returned by
# :func:`~sklearn.model_selection.cross_val_predict`
# when the different CV folds vary by size and distributions.
#
# It is recommended to compute per-fold performance metrics using:
# :func:`~sklearn.model_selection.cross_val_score` or
# :func:`~sklearn.model_selection.cross_validate` instead.
first commit 2024-08-05 09:32:03 +02:00			`"""`
			`====================================`
			`Plotting Cross-Validated Predictions`
			`====================================`

			`This example shows how to use`
			:func:`~sklearn.model_selection.cross_val_predict` together with
			:class:`~sklearn.metrics.PredictionErrorDisplay` to visualize prediction
			`errors.`
			`"""`

			`# %%`
			`# We will load the diabetes dataset and create an instance of a linear`
			`# regression model.`
			`from sklearn.datasets import load_diabetes`
			`from sklearn.linear_model import LinearRegression`

			`X, y = load_diabetes(return_X_y=True)`
			`lr = LinearRegression()`

			`# %%`
			# :func:`~sklearn.model_selection.cross_val_predict` returns an array of the
			# same size of `y` where each entry is a prediction obtained by cross
			`# validation.`
			`from sklearn.model_selection import cross_val_predict`

			`y_pred = cross_val_predict(lr, X, y, cv=10)`

			`# %%`
			# Since `cv=10`, it means that we trained 10 models and each model was
			`# used to predict on one of the 10 folds. We can now use the`
			# :class:`~sklearn.metrics.PredictionErrorDisplay` to visualize the
			`# prediction errors.`
			`#`
			# On the left axis, we plot the observed values :math:`y` vs. the predicted
			# values :math:`\hat{y}` given by the models. On the right axis, we plot the
			`# residuals (i.e. the difference between the observed values and the predicted`
			`# values) vs. the predicted values.`
			`import matplotlib.pyplot as plt`

			`from sklearn.metrics import PredictionErrorDisplay`

			`fig, axs = plt.subplots(ncols=2, figsize=(8, 4))`
			`PredictionErrorDisplay.from_predictions(`
			`y,`
			`y_pred=y_pred,`
			`kind="actual_vs_predicted",`
			`subsample=100,`
			`ax=axs[0],`
			`random_state=0,`
			`)`
			`axs[0].set_title("Actual vs. Predicted values")`
			`PredictionErrorDisplay.from_predictions(`
			`y,`
			`y_pred=y_pred,`
			`kind="residual_vs_predicted",`
			`subsample=100,`
			`ax=axs[1],`
			`random_state=0,`
			`)`
			`axs[1].set_title("Residuals vs. Predicted Values")`
			`fig.suptitle("Plotting cross-validated predictions")`
			`plt.tight_layout()`
			`plt.show()`

			`# %%`
			`# It is important to note that we used`
			# :func:`~sklearn.model_selection.cross_val_predict` for visualization
			`# purpose only in this example.`
			`#`
			`# It would be problematic to`
			`# quantitatively assess the model performance by computing a single`
			`# performance metric from the concatenated predictions returned by`
			# :func:`~sklearn.model_selection.cross_val_predict`
			`# when the different CV folds vary by size and distributions.`
			`#`
			`# It is recommended to compute per-fold performance metrics using:`
			# :func:`~sklearn.model_selection.cross_val_score` or
			# :func:`~sklearn.model_selection.cross_validate` instead.