110 lines
3.8 KiB
Python
110 lines
3.8 KiB
Python
|
"""
|
||
|
=================================
|
||
|
Compare BIRCH and MiniBatchKMeans
|
||
|
=================================
|
||
|
|
||
|
This example compares the timing of BIRCH (with and without the global
|
||
|
clustering step) and MiniBatchKMeans on a synthetic dataset having
|
||
|
25,000 samples and 2 features generated using make_blobs.
|
||
|
|
||
|
Both ``MiniBatchKMeans`` and ``BIRCH`` are very scalable algorithms and could
|
||
|
run efficiently on hundreds of thousands or even millions of datapoints. We
|
||
|
chose to limit the dataset size of this example in the interest of keeping
|
||
|
our Continuous Integration resource usage reasonable but the interested
|
||
|
reader might enjoy editing this script to rerun it with a larger value for
|
||
|
`n_samples`.
|
||
|
|
||
|
If ``n_clusters`` is set to None, the data is reduced from 25,000
|
||
|
samples to a set of 158 clusters. This can be viewed as a preprocessing
|
||
|
step before the final (global) clustering step that further reduces these
|
||
|
158 clusters to 100 clusters.
|
||
|
|
||
|
"""
|
||
|
|
||
|
# Authors: Manoj Kumar <manojkumarsivaraj334@gmail.com
|
||
|
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
||
|
# License: BSD 3 clause
|
||
|
|
||
|
from itertools import cycle
|
||
|
from time import time
|
||
|
|
||
|
import matplotlib.colors as colors
|
||
|
import matplotlib.pyplot as plt
|
||
|
import numpy as np
|
||
|
from joblib import cpu_count
|
||
|
|
||
|
from sklearn.cluster import Birch, MiniBatchKMeans
|
||
|
from sklearn.datasets import make_blobs
|
||
|
|
||
|
# Generate centers for the blobs so that it forms a 10 X 10 grid.
|
||
|
xx = np.linspace(-22, 22, 10)
|
||
|
yy = np.linspace(-22, 22, 10)
|
||
|
xx, yy = np.meshgrid(xx, yy)
|
||
|
n_centers = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis]))
|
||
|
|
||
|
# Generate blobs to do a comparison between MiniBatchKMeans and BIRCH.
|
||
|
X, y = make_blobs(n_samples=25000, centers=n_centers, random_state=0)
|
||
|
|
||
|
# Use all colors that matplotlib provides by default.
|
||
|
colors_ = cycle(colors.cnames.keys())
|
||
|
|
||
|
fig = plt.figure(figsize=(12, 4))
|
||
|
fig.subplots_adjust(left=0.04, right=0.98, bottom=0.1, top=0.9)
|
||
|
|
||
|
# Compute clustering with BIRCH with and without the final clustering step
|
||
|
# and plot.
|
||
|
birch_models = [
|
||
|
Birch(threshold=1.7, n_clusters=None),
|
||
|
Birch(threshold=1.7, n_clusters=100),
|
||
|
]
|
||
|
final_step = ["without global clustering", "with global clustering"]
|
||
|
|
||
|
for ind, (birch_model, info) in enumerate(zip(birch_models, final_step)):
|
||
|
t = time()
|
||
|
birch_model.fit(X)
|
||
|
print("BIRCH %s as the final step took %0.2f seconds" % (info, (time() - t)))
|
||
|
|
||
|
# Plot result
|
||
|
labels = birch_model.labels_
|
||
|
centroids = birch_model.subcluster_centers_
|
||
|
n_clusters = np.unique(labels).size
|
||
|
print("n_clusters : %d" % n_clusters)
|
||
|
|
||
|
ax = fig.add_subplot(1, 3, ind + 1)
|
||
|
for this_centroid, k, col in zip(centroids, range(n_clusters), colors_):
|
||
|
mask = labels == k
|
||
|
ax.scatter(X[mask, 0], X[mask, 1], c="w", edgecolor=col, marker=".", alpha=0.5)
|
||
|
if birch_model.n_clusters is None:
|
||
|
ax.scatter(this_centroid[0], this_centroid[1], marker="+", c="k", s=25)
|
||
|
ax.set_ylim([-25, 25])
|
||
|
ax.set_xlim([-25, 25])
|
||
|
ax.set_autoscaley_on(False)
|
||
|
ax.set_title("BIRCH %s" % info)
|
||
|
|
||
|
# Compute clustering with MiniBatchKMeans.
|
||
|
mbk = MiniBatchKMeans(
|
||
|
init="k-means++",
|
||
|
n_clusters=100,
|
||
|
batch_size=256 * cpu_count(),
|
||
|
n_init=10,
|
||
|
max_no_improvement=10,
|
||
|
verbose=0,
|
||
|
random_state=0,
|
||
|
)
|
||
|
t0 = time()
|
||
|
mbk.fit(X)
|
||
|
t_mini_batch = time() - t0
|
||
|
print("Time taken to run MiniBatchKMeans %0.2f seconds" % t_mini_batch)
|
||
|
mbk_means_labels_unique = np.unique(mbk.labels_)
|
||
|
|
||
|
ax = fig.add_subplot(1, 3, 3)
|
||
|
for this_centroid, k, col in zip(mbk.cluster_centers_, range(n_clusters), colors_):
|
||
|
mask = mbk.labels_ == k
|
||
|
ax.scatter(X[mask, 0], X[mask, 1], marker=".", c="w", edgecolor=col, alpha=0.5)
|
||
|
ax.scatter(this_centroid[0], this_centroid[1], marker="+", c="k", s=25)
|
||
|
ax.set_xlim([-25, 25])
|
||
|
ax.set_ylim([-25, 25])
|
||
|
ax.set_title("MiniBatchKMeans")
|
||
|
ax.set_autoscaley_on(False)
|
||
|
plt.show()
|