# ruff: noqa
"""
=======================================
Release Highlights for scikit-learn 1.5
=======================================

.. currentmodule:: sklearn

We are pleased to announce the release of scikit-learn 1.5! Many bug fixes
and improvements were added, as well as some key new features. Below we
detail the highlights of this release. **For an exhaustive list of
all the changes**, please refer to the :ref:`release notes <release_notes_1_5>`.

To install the latest version (with pip)::

    pip install --upgrade scikit-learn

or with conda::

    conda install -c conda-forge scikit-learn
"""
# %%
# FixedThresholdClassifier: Setting the decision threshold of a binary classifier
# -------------------------------------------------------------------------------
# All binary classifiers of scikit-learn use a fixed decision threshold of 0.5
# to convert probability estimates (i.e. output of `predict_proba`) into class
# predictions. However, 0.5 is almost never the desired threshold for a given
# problem. :class:`~model_selection.FixedThresholdClassifier` allows wrapping any
# binary classifier and setting a custom decision threshold.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
X, y = make_classification(n_samples=10_000, weights=[0.9, 0.1], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
classifier_05 = LogisticRegression(C=1e6, random_state=0).fit(X_train, y_train)
_ = ConfusionMatrixDisplay.from_estimator(classifier_05, X_test, y_test)
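
# %%
# As a quick illustration of that 0.5 cut-off (the variable names below are
# illustrative, not part of the release), `predict` is equivalent to
# thresholding the positive-class probabilities, up to ties at exactly 0.5:
import numpy as np

proba_positive = classifier_05.predict_proba(X_test)[:, 1]
manual_predictions = (proba_positive > 0.5).astype(int)
print((manual_predictions == classifier_05.predict(X_test)).all())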
# %%
# Lowering the threshold, i.e. allowing more samples to be classified as the positive
# class, increases the number of true positives at the cost of more false positives
# (as is well known from the concavity of the ROC curve).
from sklearn.model_selection import FixedThresholdClassifier
classifier_01 = FixedThresholdClassifier(classifier_05, threshold=0.1)
classifier_01.fit(X_train, y_train)
_ = ConfusionMatrixDisplay.from_estimator(classifier_01, X_test, y_test)
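
# %%
# Comparing the raw counts, on top of the plots above, confirms the trade-off:
# more true positives at the price of more false positives.
from sklearn.metrics import confusion_matrix

print("threshold=0.5:\n", confusion_matrix(y_test, classifier_05.predict(X_test)))
print("threshold=0.1:\n", confusion_matrix(y_test, classifier_01.predict(X_test)))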
# %%
# TunedThresholdClassifierCV: Tuning the decision threshold of a binary classifier
# --------------------------------------------------------------------------------
# The decision threshold of a binary classifier can be tuned to optimize a
# given metric, using :class:`~model_selection.TunedThresholdClassifierCV`.
#
# It is particularly useful to find the best decision threshold when the model
# is meant to be deployed in a specific application context where we can assign
# different gains or costs for true positives, true negatives, false positives,
# and false negatives.
#
# Let's illustrate this by considering an arbitrary case where:
#
# - each true positive gains 1 unit of profit, e.g. euro, year of life in good
#   health, etc.;
# - true negatives gain or cost nothing;
# - each false negative costs 2;
# - each false positive costs 0.1.
#
# Our metric quantifies the average profit per sample, which is defined by the
# following Python function:
from sklearn.metrics import confusion_matrix


def custom_score(y_observed, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_observed, y_pred, normalize="all").ravel()
    return tp - 2 * fn - 0.1 * fp


print("Untuned decision threshold: 0.5")
print(f"Custom score: {custom_score(y_test, classifier_05.predict(X_test)):.2f}")
# %%
# It is interesting to observe that the average gain per prediction is
# negative, which means that this decision system is making a loss on average.
#
# Tuning the threshold to optimize this custom metric gives a smaller threshold
# that allows more samples to be classified as the positive class. As a result,
# the average gain per prediction improves.
from sklearn.model_selection import TunedThresholdClassifierCV
from sklearn.metrics import make_scorer
custom_scorer = make_scorer(
    custom_score, response_method="predict", greater_is_better=True
)
tuned_classifier = TunedThresholdClassifierCV(
    classifier_05, cv=5, scoring=custom_scorer
).fit(X, y)
print(f"Tuned decision threshold: {tuned_classifier.best_threshold_:.3f}")
print(f"Custom score: {custom_score(y_test, tuned_classifier.predict(X_test)):.2f}")
# %%
# We observe that tuning the decision threshold can turn a machine
# learning-based system that makes a loss on average into a beneficial one.
#
# In practice, defining a meaningful application-specific metric might involve
# making those costs for bad predictions and gains for good predictions depend
# on auxiliary metadata specific to each individual data point, such as the
# amount of a transaction in a fraud detection system.
#
# To achieve this, :class:`~model_selection.TunedThresholdClassifierCV`
# leverages metadata routing support (:ref:`Metadata Routing User
# Guide <metadata_routing>`), making it possible to optimize complex business
# metrics as detailed in :ref:`Post-tuning the decision threshold for
# cost-sensitive learning
# <sphx_glr_auto_examples_model_selection_plot_cost_sensitive_learning.py>`.
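#
# As a minimal sketch of this mechanism (the `amount` metadata, the
# `business_gain` helper and its gain values below are illustrative
# assumptions, not part of scikit-learn), a per-sample transaction amount can
# be routed to the scorer like this:
import numpy as np
import sklearn

sklearn.set_config(enable_metadata_routing=True)


def business_gain(y_observed, y_pred, amount):
    # Hypothetical business rule: recover 2% of the transaction amount for
    # each caught fraud (true positive) and lose the full amount for each
    # missed fraud (false negative).
    caught = (y_observed == 1) & (y_pred == 1)
    missed = (y_observed == 1) & (y_pred == 0)
    return 0.02 * amount[caught].sum() - amount[missed].sum()


# Simulated transaction amounts, one per sample.
amount = np.abs(np.random.RandomState(0).normal(loc=100, scale=30, size=len(y)))
business_scorer = make_scorer(business_gain).set_score_request(amount=True)
tuned_business = TunedThresholdClassifierCV(
    classifier_05, cv=5, scoring=business_scorer
).fit(X, y, amount=amount)
print(f"Business-tuned decision threshold: {tuned_business.best_threshold_:.3f}")
sklearn.set_config(enable_metadata_routing=False)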
# %%
# Performance improvements in PCA
# -------------------------------
# :class:`~decomposition.PCA` has a new solver, `"covariance_eigh"`, which is
# up to an order of magnitude faster and more memory efficient than the other
# solvers for datasets with many data points and few features.
from sklearn.datasets import make_low_rank_matrix
from sklearn.decomposition import PCA
X = make_low_rank_matrix(
    n_samples=10_000, n_features=100, tail_strength=0.1, random_state=0
)
pca = PCA(n_components=10, svd_solver="covariance_eigh").fit(X)
print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2f}")
# %%
# The new solver also accepts sparse input data:
from scipy.sparse import random
X = random(10_000, 100, format="csr", random_state=0)
pca = PCA(n_components=10, svd_solver="covariance_eigh").fit(X)
print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2f}")
# %%
# The `"full"` solver has also been improved to use less memory and allows
# faster transformation. The default `svd_solver="auto"`` option takes
# advantage of the new solver and is now able to select an appropriate solver
# for sparse datasets.
#
# Similarly to most other PCA solvers, the new `"covariance_eigh"` solver can leverage
# GPU computation if the input data is passed as a PyTorch or CuPy array by
# enabling the experimental support for :ref:`Array API <array_api>`.
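#
# A minimal sketch of that combination (assuming PyTorch and a CUDA device are
# available; not executed here)::
#
#     import torch
#     from sklearn import config_context
#
#     X_gpu = torch.asarray(X.toarray(), device="cuda")
#     with config_context(array_api_dispatch=True):
#         pca_gpu = PCA(n_components=10, svd_solver="covariance_eigh").fit(X_gpu)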
# %%
# ColumnTransformer is subscriptable
# ----------------------------------
# The transformers of a :class:`~compose.ColumnTransformer` can now be directly
# accessed using indexing by name.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
X = np.array([[0, 1, 2], [3, 4, 5]])
column_transformer = ColumnTransformer(
    [("std_scaler", StandardScaler(), [0]), ("one_hot", OneHotEncoder(), [1, 2])]
)
column_transformer.fit(X)
print(column_transformer["std_scaler"])
print(column_transformer["one_hot"])
# %%
# Custom imputation strategies for the SimpleImputer
# --------------------------------------------------
# :class:`~impute.SimpleImputer` now supports custom strategies for imputation,
# using a callable that computes a scalar value from the non-missing values of
# a column vector.
from sklearn.impute import SimpleImputer
X = np.array(
    [
        [-1.1, 1.1, 1.1],
        [3.9, -1.2, np.nan],
        [np.nan, 1.3, np.nan],
        [-0.1, -1.4, -1.4],
        [-4.9, 1.5, -1.5],
        [np.nan, 1.6, 1.6],
    ]
)


def smallest_abs(arr):
    """Return the smallest absolute value of a 1D array."""
    return np.min(np.abs(arr))

imputer = SimpleImputer(strategy=smallest_abs)
imputer.fit_transform(X)
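
# %%
# The scalar computed by the callable for each column is stored in the fitted
# `statistics_` attribute:
print(imputer.statistics_)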
# %%
# Pairwise distances with non-numeric arrays
# ------------------------------------------
# :func:`~metrics.pairwise_distances` can now compute distances between
# non-numeric arrays using a callable metric.
from sklearn.metrics import pairwise_distances
X = ["cat", "dog"]
Y = ["cat", "fox"]


def levenshtein_distance(x, y):
    """Return the Levenshtein distance between two strings."""
    if x == "" or y == "":
        return max(len(x), len(y))
    if x[0] == y[0]:
        return levenshtein_distance(x[1:], y[1:])
    return 1 + min(
        levenshtein_distance(x[1:], y),
        levenshtein_distance(x, y[1:]),
        levenshtein_distance(x[1:], y[1:]),
    )


pairwise_distances(X, Y, metric=levenshtein_distance)