sklearn/examples/miscellaneous/plot_johnson_lindenstrauss_...

r"""
=====================================================================
The Johnson-Lindenstrauss bound for embedding with random projections
=====================================================================


The `Johnson-Lindenstrauss lemma`_ states that any high dimensional
dataset can be randomly projected into a lower dimensional Euclidean
space while controlling the distortion in the pairwise distances.

.. _`Johnson-Lindenstrauss lemma`: https://en.wikipedia.org/wiki/\
    Johnson%E2%80%93Lindenstrauss_lemma

"""

import sys
from time import time

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized, load_digits
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.random_projection import (
    SparseRandomProjection,
    johnson_lindenstrauss_min_dim,
)

# %%
# Theoretical bounds
# ==================
# The distortion introduced by a random projection `p` is asserted by
# the fact that `p` is defining an eps-embedding with good probability
# as defined by:
#
# .. math::
#    (1 - eps) \|u - v\|^2 < \|p(u) - p(v)\|^2 < (1 + eps) \|u - v\|^2
#
# Where `u` and `v` are any rows taken from a dataset of shape `(n_samples,
# n_features)` and `p` is a projection by a random Gaussian `N(0, 1)` matrix
# of shape `(n_components, n_features)` (or a sparse Achlioptas matrix).
#
# The minimum number of components to guarantees the eps-embedding is
# given by:
#
# .. math::
#    n\_components \geq 4 log(n\_samples) / (eps^2 / 2 - eps^3 / 3)
#
#
# The first plot shows that with an increasing number of samples ``n_samples``,
# the minimal number of dimensions ``n_components`` increased logarithmically
# in order to guarantee an ``eps``-embedding.

# range of admissible distortions
eps_range = np.linspace(0.1, 0.99, 5)
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

# range of number of samples (observation) to embed
n_samples_range = np.logspace(1, 9, 9)

plt.figure()
for eps, color in zip(eps_range, colors):
    min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
    plt.loglog(n_samples_range, min_n_components, color=color)

plt.legend([f"eps = {eps:0.1f}" for eps in eps_range], loc="lower right")
plt.xlabel("Number of observations to eps-embed")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
plt.show()


# %%
# The second plot shows that an increase of the admissible
# distortion ``eps`` allows to reduce drastically the minimal number of
# dimensions ``n_components`` for a given number of samples ``n_samples``

# range of admissible distortions
eps_range = np.linspace(0.01, 0.99, 100)

# range of number of samples (observation) to embed
n_samples_range = np.logspace(2, 6, 5)
colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))

plt.figure()
for n_samples, color in zip(n_samples_range, colors):
    min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
    plt.semilogy(eps_range, min_n_components, color=color)

plt.legend([f"n_samples = {n}" for n in n_samples_range], loc="upper right")
plt.xlabel("Distortion eps")
plt.ylabel("Minimum number of dimensions")
plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
plt.show()

# %%
# Empirical validation
# ====================
#
# We validate the above bounds on the 20 newsgroups text document
# (TF-IDF word frequencies) dataset or on the digits dataset:
#
# - for the 20 newsgroups dataset some 300 documents with 100k
#   features in total are projected using a sparse random matrix to smaller
#   euclidean spaces with various values for the target number of dimensions
#   ``n_components``.
#
# - for the digits dataset, some 8x8 gray level pixels data for 300
#   handwritten digits pictures are randomly projected to spaces for various
#   larger number of dimensions ``n_components``.
#
# The default dataset is the 20 newsgroups dataset. To run the example on the
# digits dataset, pass the ``--use-digits-dataset`` command line argument to
# this script.

if "--use-digits-dataset" in sys.argv:
    data = load_digits().data[:300]
else:
    data = fetch_20newsgroups_vectorized().data[:300]

# %%
# For each value of ``n_components``, we plot:
#
# - 2D distribution of sample pairs with pairwise distances in original
#   and projected spaces as x- and y-axis respectively.
#
# - 1D histogram of the ratio of those distances (projected / original).

n_samples, n_features = data.shape
print(
    f"Embedding {n_samples} samples with dim {n_features} using various "
    "random projections"
)

n_components_range = np.array([300, 1_000, 10_000])
dists = euclidean_distances(data, squared=True).ravel()

# select only non-identical samples pairs
nonzero = dists != 0
dists = dists[nonzero]

for n_components in n_components_range:
    t0 = time()
    rp = SparseRandomProjection(n_components=n_components)
    projected_data = rp.fit_transform(data)
    print(
        f"Projected {n_samples} samples from {n_features} to {n_components} in "
        f"{time() - t0:0.3f}s"
    )
    if hasattr(rp, "components_"):
        n_bytes = rp.components_.data.nbytes
        n_bytes += rp.components_.indices.nbytes
        print(f"Random matrix with size: {n_bytes / 1e6:0.3f} MB")

    projected_dists = euclidean_distances(projected_data, squared=True).ravel()[nonzero]

    plt.figure()
    min_dist = min(projected_dists.min(), dists.min())
    max_dist = max(projected_dists.max(), dists.max())
    plt.hexbin(
        dists,
        projected_dists,
        gridsize=100,
        cmap=plt.cm.PuBu,
        extent=[min_dist, max_dist, min_dist, max_dist],
    )
    plt.xlabel("Pairwise squared distances in original space")
    plt.ylabel("Pairwise squared distances in projected space")
    plt.title("Pairwise distances distribution for n_components=%d" % n_components)
    cb = plt.colorbar()
    cb.set_label("Sample pairs counts")

    rates = projected_dists / dists
    print(f"Mean distances rate: {np.mean(rates):.2f} ({np.std(rates):.2f})")

    plt.figure()
    plt.hist(rates, bins=50, range=(0.0, 2.0), edgecolor="k", density=True)
    plt.xlabel("Squared distances rate: projected / original")
    plt.ylabel("Distribution of samples pairs")
    plt.title("Histogram of pairwise distance rates for n_components=%d" % n_components)

    # TODO: compute the expected value of eps and add them to the previous plot
    # as vertical lines / region

plt.show()


# %%
# We can see that for low values of ``n_components`` the distribution is wide
# with many distorted pairs and a skewed distribution (due to the hard
# limit of zero ratio on the left as distances are always positives)
# while for larger values of `n_components` the distortion is controlled
# and the distances are well preserved by the random projection.
#
# Remarks
# =======
#
# According to the JL lemma, projecting 300 samples without too much distortion
# will require at least several thousands dimensions, irrespective of the
# number of features of the original dataset.
#
# Hence using random projections on the digits dataset which only has 64
# features in the input space does not make sense: it does not allow
# for dimensionality reduction in this case.
#
# On the twenty newsgroups on the other hand the dimensionality can be
# decreased from 56,436 down to 10,000 while reasonably preserving
# pairwise distances.
first commit 2024-08-05 09:32:03 +02:00			`r"""`
			`=====================================================================`
			`The Johnson-Lindenstrauss bound for embedding with random projections`
			`=====================================================================`


			The `Johnson-Lindenstrauss lemma`_ states that any high dimensional
			`dataset can be randomly projected into a lower dimensional Euclidean`
			`space while controlling the distortion in the pairwise distances.`

			.. _`Johnson-Lindenstrauss lemma`: https://en.wikipedia.org/wiki/\
			`Johnson%E2%80%93Lindenstrauss_lemma`

			`"""`

			`import sys`
			`from time import time`

			`import matplotlib.pyplot as plt`
			`import numpy as np`

			`from sklearn.datasets import fetch_20newsgroups_vectorized, load_digits`
			`from sklearn.metrics.pairwise import euclidean_distances`
			`from sklearn.random_projection import (`
			`SparseRandomProjection,`
			`johnson_lindenstrauss_min_dim,`
			`)`

			`# %%`
			`# Theoretical bounds`
			`# ==================`
			# The distortion introduced by a random projection `p` is asserted by
			# the fact that `p` is defining an eps-embedding with good probability
			`# as defined by:`
			`#`
			`# .. math::`
			`# (1 - eps) \\|u - v\\|^2 < \\|p(u) - p(v)\\|^2 < (1 + eps) \\|u - v\\|^2`
			`#`
			# Where `u` and `v` are any rows taken from a dataset of shape `(n_samples,
			# n_features)` and `p` is a projection by a random Gaussian `N(0, 1)` matrix
			# of shape `(n_components, n_features)` (or a sparse Achlioptas matrix).
			`#`
			`# The minimum number of components to guarantees the eps-embedding is`
			`# given by:`
			`#`
			`# .. math::`
			`# n\_components \geq 4 log(n\_samples) / (eps^2 / 2 - eps^3 / 3)`
			`#`
			`#`
			# The first plot shows that with an increasing number of samples ``n_samples``,
			# the minimal number of dimensions ``n_components`` increased logarithmically
			# in order to guarantee an ``eps``-embedding.

			`# range of admissible distortions`
			`eps_range = np.linspace(0.1, 0.99, 5)`
			`colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))`

			`# range of number of samples (observation) to embed`
			`n_samples_range = np.logspace(1, 9, 9)`

			`plt.figure()`
			`for eps, color in zip(eps_range, colors):`
			`min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)`
			`plt.loglog(n_samples_range, min_n_components, color=color)`

			`plt.legend([f"eps = {eps:0.1f}" for eps in eps_range], loc="lower right")`
			`plt.xlabel("Number of observations to eps-embed")`
			`plt.ylabel("Minimum number of dimensions")`
			`plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")`
			`plt.show()`


			`# %%`
			`# The second plot shows that an increase of the admissible`
			# distortion ``eps`` allows to reduce drastically the minimal number of
			# dimensions ``n_components`` for a given number of samples ``n_samples``

			`# range of admissible distortions`
			`eps_range = np.linspace(0.01, 0.99, 100)`

			`# range of number of samples (observation) to embed`
			`n_samples_range = np.logspace(2, 6, 5)`
			`colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))`

			`plt.figure()`
			`for n_samples, color in zip(n_samples_range, colors):`
			`min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)`
			`plt.semilogy(eps_range, min_n_components, color=color)`

			`plt.legend([f"n_samples = {n}" for n in n_samples_range], loc="upper right")`
			`plt.xlabel("Distortion eps")`
			`plt.ylabel("Minimum number of dimensions")`
			`plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")`
			`plt.show()`

			`# %%`
			`# Empirical validation`
			`# ====================`
			`#`
			`# We validate the above bounds on the 20 newsgroups text document`
			`# (TF-IDF word frequencies) dataset or on the digits dataset:`
			`#`
			`# - for the 20 newsgroups dataset some 300 documents with 100k`
			`# features in total are projected using a sparse random matrix to smaller`
			`# euclidean spaces with various values for the target number of dimensions`
			# ``n_components``.
			`#`
			`# - for the digits dataset, some 8x8 gray level pixels data for 300`
			`# handwritten digits pictures are randomly projected to spaces for various`
			# larger number of dimensions ``n_components``.
			`#`
			`# The default dataset is the 20 newsgroups dataset. To run the example on the`
			# digits dataset, pass the ``--use-digits-dataset`` command line argument to
			`# this script.`

			`if "--use-digits-dataset" in sys.argv:`
			`data = load_digits().data[:300]`
			`else:`
			`data = fetch_20newsgroups_vectorized().data[:300]`

			`# %%`
			# For each value of ``n_components``, we plot:
			`#`
			`# - 2D distribution of sample pairs with pairwise distances in original`
			`# and projected spaces as x- and y-axis respectively.`
			`#`
			`# - 1D histogram of the ratio of those distances (projected / original).`

			`n_samples, n_features = data.shape`
			`print(`
			`f"Embedding {n_samples} samples with dim {n_features} using various "`
			`"random projections"`
			`)`

			`n_components_range = np.array([300, 1_000, 10_000])`
			`dists = euclidean_distances(data, squared=True).ravel()`

			`# select only non-identical samples pairs`
			`nonzero = dists != 0`
			`dists = dists[nonzero]`

			`for n_components in n_components_range:`
			`t0 = time()`
			`rp = SparseRandomProjection(n_components=n_components)`
			`projected_data = rp.fit_transform(data)`
			`print(`
			`f"Projected {n_samples} samples from {n_features} to {n_components} in "`
			`f"{time() - t0:0.3f}s"`
			`)`
			`if hasattr(rp, "components_"):`
			`n_bytes = rp.components_.data.nbytes`
			`n_bytes += rp.components_.indices.nbytes`
			`print(f"Random matrix with size: {n_bytes / 1e6:0.3f} MB")`

			`projected_dists = euclidean_distances(projected_data, squared=True).ravel()[nonzero]`

			`plt.figure()`
			`min_dist = min(projected_dists.min(), dists.min())`
			`max_dist = max(projected_dists.max(), dists.max())`
			`plt.hexbin(`
			`dists,`
			`projected_dists,`
			`gridsize=100,`
			`cmap=plt.cm.PuBu,`
			`extent=[min_dist, max_dist, min_dist, max_dist],`
			`)`
			`plt.xlabel("Pairwise squared distances in original space")`
			`plt.ylabel("Pairwise squared distances in projected space")`
			`plt.title("Pairwise distances distribution for n_components=%d" % n_components)`
			`cb = plt.colorbar()`
			`cb.set_label("Sample pairs counts")`

			`rates = projected_dists / dists`
			`print(f"Mean distances rate: {np.mean(rates):.2f} ({np.std(rates):.2f})")`

			`plt.figure()`
			`plt.hist(rates, bins=50, range=(0.0, 2.0), edgecolor="k", density=True)`
			`plt.xlabel("Squared distances rate: projected / original")`
			`plt.ylabel("Distribution of samples pairs")`
			`plt.title("Histogram of pairwise distance rates for n_components=%d" % n_components)`

			`# TODO: compute the expected value of eps and add them to the previous plot`
			`# as vertical lines / region`

			`plt.show()`


			`# %%`
			# We can see that for low values of ``n_components`` the distribution is wide
			`# with many distorted pairs and a skewed distribution (due to the hard`
			`# limit of zero ratio on the left as distances are always positives)`
			# while for larger values of `n_components` the distortion is controlled
			`# and the distances are well preserved by the random projection.`
			`#`
			`# Remarks`
			`# =======`
			`#`
			`# According to the JL lemma, projecting 300 samples without too much distortion`
			`# will require at least several thousands dimensions, irrespective of the`
			`# number of features of the original dataset.`
			`#`
			`# Hence using random projections on the digits dataset which only has 64`
			`# features in the input space does not make sense: it does not allow`
			`# for dimensionality reduction in this case.`
			`#`
			`# On the twenty newsgroups on the other hand the dimensionality can be`
			`# decreased from 56,436 down to 10,000 while reasonably preserving`
			`# pairwise distances.`