"""
====================================
Plotting Cross-Validated Predictions
====================================

This example shows how to use
:func:`~sklearn.model_selection.cross_val_predict` together with
:class:`~sklearn.metrics.PredictionErrorDisplay` to visualize prediction
errors.

"""

# %%
# We will load the diabetes dataset and create an instance of a linear
# regression model.
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

X, y = load_diabetes(return_X_y=True)
lr = LinearRegression()

# %%
# :func:`~sklearn.model_selection.cross_val_predict` returns an array of the
# same size as `y` where each entry is a prediction obtained by cross
# validation.
from sklearn.model_selection import cross_val_predict

y_pred = cross_val_predict(lr, X, y, cv=10)

# %%
# Since `cv=10`, it means that we trained 10 models and each model was
# used to predict on one of the 10 folds. We can now use the
# :class:`~sklearn.metrics.PredictionErrorDisplay` to visualize the
# prediction errors.
#
# On the left axis, we plot the observed values :math:`y` vs. the predicted
# values :math:`\hat{y}` given by the models. On the right axis, we plot the
# residuals (i.e. the difference between the observed values and the predicted
# values) vs. the predicted values.
import matplotlib.pyplot as plt

from sklearn.metrics import PredictionErrorDisplay

fig, axs = plt.subplots(ncols=2, figsize=(8, 4))
PredictionErrorDisplay.from_predictions(
    y,
    y_pred=y_pred,
    kind="actual_vs_predicted",
    subsample=100,
    ax=axs[0],
    random_state=0,
)
axs[0].set_title("Actual vs. Predicted values")
PredictionErrorDisplay.from_predictions(
    y,
    y_pred=y_pred,
    kind="residual_vs_predicted",
    subsample=100,
    ax=axs[1],
    random_state=0,
)
axs[1].set_title("Residuals vs. Predicted Values")
fig.suptitle("Plotting cross-validated predictions")
plt.tight_layout()
plt.show()

# %%
# It is important to note that we used
# :func:`~sklearn.model_selection.cross_val_predict` for visualization
# purposes only in this example.
#
# It would be problematic to
# quantitatively assess the model performance by computing a single
# performance metric from the concatenated predictions returned by
# :func:`~sklearn.model_selection.cross_val_predict`
# when the different CV folds vary by size and distributions.
#
# It is recommended to compute per-fold performance metrics using:
# :func:`~sklearn.model_selection.cross_val_score` or
# :func:`~sklearn.model_selection.cross_validate` instead.