sklearn/examples/ensemble/plot_voting_probas.py

"""
===========================================================
Plot class probabilities calculated by the VotingClassifier
===========================================================

.. currentmodule:: sklearn

Plot the class probabilities of the first sample in a toy dataset predicted by
three different classifiers and averaged by the
:class:`~ensemble.VotingClassifier`.

First, three example classifiers are initialized
(:class:`~linear_model.LogisticRegression`, :class:`~naive_bayes.GaussianNB`,
and :class:`~ensemble.RandomForestClassifier`) and used to initialize a
soft-voting :class:`~ensemble.VotingClassifier` with weights `[1, 1, 5]`, which
means that the predicted probabilities of the
:class:`~ensemble.RandomForestClassifier` count 5 times as much as those of the
other classifiers when the averaged probability is calculated.

To visualize the probability weighting, we fit each classifier on the training
set and plot the predicted class probabilities for the first sample in this
example dataset.

"""

import matplotlib.pyplot as plt
import numpy as np

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

clf1 = LogisticRegression(max_iter=1000, random_state=123)
clf2 = RandomForestClassifier(n_estimators=100, random_state=123)
clf3 = GaussianNB()
X = np.array([[-1.0, -1.0], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
y = np.array([1, 1, 2, 2])

# ``weights`` is matched to ``estimators`` by position, so the estimator order
# below (LogisticRegression, GaussianNB, RandomForestClassifier) is what gives
# the random forest the weight of 5 announced in the x-tick labels.
eclf = VotingClassifier(
    estimators=[("lr", clf1), ("gnb", clf3), ("rf", clf2)],
    voting="soft",
    weights=[1, 1, 5],
)

# predict class probabilities for all classifiers; the order must match both
# the estimator order above and the x-tick labels below, with the
# VotingClassifier last
probas = [c.fit(X, y).predict_proba(X) for c in (clf1, clf3, clf2, eclf)]

# get class probabilities for the first sample in the dataset
class1_1 = [pr[0, 0] for pr in probas]
class2_1 = [pr[0, 1] for pr in probas]


# plotting

N = len(probas)  # number of groups (three classifiers + the ensemble)
ind = np.arange(N)  # group positions
width = 0.35  # bar width

fig, ax = plt.subplots()

# bars for classifier 1-3 (the ensemble slot is zero-height here so that it can
# be drawn separately in a different color)
p1 = ax.bar(ind, class1_1[:-1] + [0], width, color="green", edgecolor="k")
p2 = ax.bar(
    ind + width,
    class2_1[:-1] + [0],
    width,
    color="lightgreen",
    edgecolor="k",
)

# bars for VotingClassifier (only the last slot has a non-zero height)
p3 = ax.bar(ind, [0, 0, 0, class1_1[-1]], width, color="blue", edgecolor="k")
p4 = ax.bar(
    ind + width, [0, 0, 0, class2_1[-1]], width, color="steelblue", edgecolor="k"
)

# plot annotations
# dashed line separating the individual classifiers from the ensemble
ax.axvline(2.8, color="k", linestyle="dashed")
ax.set_xticks(ind + width)
ax.set_xticklabels(
    [
        "LogisticRegression\nweight 1",
        "GaussianNB\nweight 1",
        "RandomForestClassifier\nweight 5",
        "VotingClassifier\n(average probabilities)",
    ],
    rotation=40,
    ha="right",
)
ax.set_ylim([0, 1])
ax.set_title("Class probabilities for sample 1 by different classifiers")
ax.legend([p1[0], p2[0]], ["class 1", "class 2"], loc="upper left")
plt.tight_layout()
plt.show()
first commit 2024-08-05 09:32:03 +02:00			`"""`
			`===========================================================`
			`Plot class probabilities calculated by the VotingClassifier`
			`===========================================================`

			`.. currentmodule:: sklearn`

			`Plot the class probabilities of the first sample in a toy dataset predicted by`
			`three different classifiers and averaged by the`
			:class:`~ensemble.VotingClassifier`.

			`First, three exemplary classifiers are initialized`
			(:class:`~linear_model.LogisticRegression`, :class:`~naive_bayes.GaussianNB`,
			and :class:`~ensemble.RandomForestClassifier`) and used to initialize a
			soft-voting :class:`~ensemble.VotingClassifier` with weights `[1, 1, 5]`, which
			`means that the predicted probabilities of the`
			:class:`~ensemble.RandomForestClassifier` count 5 times as much as the weights
			`of the other classifiers when the averaged probability is calculated.`

			`To visualize the probability weighting, we fit each classifier on the training`
			`set and plot the predicted class probabilities for the first sample in this`
			`example dataset.`

			`"""`

			`import matplotlib.pyplot as plt`
			`import numpy as np`

			`from sklearn.ensemble import RandomForestClassifier, VotingClassifier`
			`from sklearn.linear_model import LogisticRegression`
			`from sklearn.naive_bayes import GaussianNB`

			`clf1 = LogisticRegression(max_iter=1000, random_state=123)`
			`clf2 = RandomForestClassifier(n_estimators=100, random_state=123)`
			`clf3 = GaussianNB()`
			`X = np.array([[-1.0, -1.0], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])`
			`y = np.array([1, 1, 2, 2])`

			`eclf = VotingClassifier(`
			`estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],`
			`voting="soft",`
			`weights=[1, 1, 5],`
			`)`

			`# predict class probabilities for all classifiers`
			`probas = [c.fit(X, y).predict_proba(X) for c in (clf1, clf2, clf3, eclf)]`

			`# get class probabilities for the first sample in the dataset`
			`class1_1 = [pr[0, 0] for pr in probas]`
			`class2_1 = [pr[0, 1] for pr in probas]`


			`# plotting`

			`N = 4 # number of groups`
			`ind = np.arange(N) # group positions`
			`width = 0.35 # bar width`

			`fig, ax = plt.subplots()`

			`# bars for classifier 1-3`
			`p1 = ax.bar(ind, np.hstack(([class1_1[:-1], [0]])), width, color="green", edgecolor="k")`
			`p2 = ax.bar(`
			`ind + width,`
			`np.hstack(([class2_1[:-1], [0]])),`
			`width,`
			`color="lightgreen",`
			`edgecolor="k",`
			`)`

			`# bars for VotingClassifier`
			`p3 = ax.bar(ind, [0, 0, 0, class1_1[-1]], width, color="blue", edgecolor="k")`
			`p4 = ax.bar(`
			`ind + width, [0, 0, 0, class2_1[-1]], width, color="steelblue", edgecolor="k"`
			`)`

			`# plot annotations`
			`plt.axvline(2.8, color="k", linestyle="dashed")`
			`ax.set_xticks(ind + width)`
			`ax.set_xticklabels(`
			`[`
			`"LogisticRegression\nweight 1",`
			`"GaussianNB\nweight 1",`
			`"RandomForestClassifier\nweight 5",`
			`"VotingClassifier\n(average probabilities)",`
			`],`
			`rotation=40,`
			`ha="right",`
			`)`
			`plt.ylim([0, 1])`
			`plt.title("Class probabilities for sample 1 by different classifiers")`
			`plt.legend([p1[0], p2[0]], ["class 1", "class 2"], loc="upper left")`
			`plt.tight_layout()`
			`plt.show()`