"""
|
|
===========================================
|
|
FeatureHasher and DictVectorizer Comparison
|
|
===========================================
|
|
|
|
In this example we illustrate text vectorization, which is the process of
|
|
representing non-numerical input data (such as dictionaries or text documents)
|
|
as vectors of real numbers.
|
|
|
|
We first compare :func:`~sklearn.feature_extraction.FeatureHasher` and
|
|
:func:`~sklearn.feature_extraction.DictVectorizer` by using both methods to
|
|
vectorize text documents that are preprocessed (tokenized) with the help of a
|
|
custom Python function.
|
|
|
|
Later we introduce and analyze the text-specific vectorizers
|
|
:func:`~sklearn.feature_extraction.text.HashingVectorizer`,
|
|
:func:`~sklearn.feature_extraction.text.CountVectorizer` and
|
|
:func:`~sklearn.feature_extraction.text.TfidfVectorizer` that handle both the
|
|
tokenization and the assembling of the feature matrix within a single class.
|
|
|
|
The objective of the example is to demonstrate the usage of text vectorization
|
|
API and to compare their processing time. See the example scripts
|
|
:ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`
|
|
and :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py` for actual
|
|
learning on text documents.
|
|
|
|
"""
|
|
|
|
# Author: Lars Buitinck
|
|
# Olivier Grisel <olivier.grisel@ensta.org>
|
|
# Arturo Amor <david-arturo.amor-quiroz@inria.fr>
|
|
# License: BSD 3 clause
|
|
|
|

# %%
# Load Data
# ---------
#
# We load data from :ref:`20newsgroups_dataset`, which comprises around 18000
# newsgroups posts on 20 topics split into two subsets: one for training and
# one for testing. For the sake of simplicity and to reduce the computational
# cost, we select a subset of 7 topics and use the training set only.

from sklearn.datasets import fetch_20newsgroups

categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.sys.ibm.pc.hardware",
    "misc.forsale",
    "rec.autos",
    "sci.space",
    "talk.religion.misc",
]

print("Loading 20 newsgroups training data")
raw_data, _ = fetch_20newsgroups(
    subset="train", categories=categories, return_X_y=True
)
data_size_mb = sum(len(s.encode("utf-8")) for s in raw_data) / 1e6
print(f"{len(raw_data)} documents - {data_size_mb:.3f}MB")

# %%
# Define preprocessing functions
# ------------------------------
#
# A token may be a word, part of a word or anything comprised between spaces
# or symbols in a string. Here we define a function that extracts the tokens
# using a simple regular expression (regex) that matches Unicode word
# characters. This includes most characters that can be part of a word in any
# language, as well as numbers and the underscore:

import re


def tokenize(doc):
    """Extract tokens from doc.

    This uses a simple regex that matches word characters to break strings
    into tokens. For a more principled approach, see CountVectorizer or
    TfidfVectorizer.
    """
    return (tok.lower() for tok in re.findall(r"\w+", doc))


list(tokenize("This is a simple example, isn't it?"))

# %%
# We define an additional function that counts the (frequency of) occurrence
# of each token in a given document. It returns a frequency dictionary to be
# used by the vectorizers.

from collections import defaultdict


def token_freqs(doc):
    """Extract a dict mapping tokens from doc to their occurrences."""
    freq = defaultdict(int)
    for tok in tokenize(doc):
        freq[tok] += 1
    return freq


token_freqs("That is one example, but this is another one")

# %%
# Observe in particular that the repeated token `"is"` is counted twice.
#
# Breaking a text document into word tokens, while potentially losing the
# order information between the words in a sentence, is often called a `Bag
# of Words representation
# <https://en.wikipedia.org/wiki/Bag-of-words_model>`_.
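
# %%
# As a quick check that the order information is indeed discarded, two
# documents containing the same words in a different order map to the same
# bag of words:

token_freqs("one example is this") == token_freqs("this is one example")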

# %%
# DictVectorizer
# --------------
#
# First we benchmark the :class:`~sklearn.feature_extraction.DictVectorizer`,
# then we compare it to :class:`~sklearn.feature_extraction.FeatureHasher` as
# both of them receive dictionaries as input.

from time import time

from sklearn.feature_extraction import DictVectorizer

dict_count_vectorizers = defaultdict(list)

t0 = time()
vectorizer = DictVectorizer()
vectorizer.fit_transform(token_freqs(d) for d in raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(
    vectorizer.__class__.__name__ + "\non freq dicts"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")

# %%
# The actual mapping from text token to column index is explicitly stored in
# the `.vocabulary_` attribute, which is a potentially very large Python
# dictionary:

type(vectorizer.vocabulary_)

# %%
len(vectorizer.vocabulary_)

# %%
vectorizer.vocabulary_["example"]

# %%
# FeatureHasher
# -------------
#
# Dictionaries take up a large amount of storage space and grow in size as the
# training set grows. Instead of growing the vectors along with a dictionary,
# feature hashing builds a vector of pre-defined length by applying a hash
# function `h` to the features (e.g., tokens), then using the hash values
# directly as feature indices and updating the resulting vector at those
# indices. When the feature space is not large enough, hashing functions tend
# to map distinct values to the same hash code (hash collisions). As a result,
# it is impossible to determine what object generated any particular hash
# code.
#
# Because of the above, it is impossible to recover the original tokens from
# the feature matrix, and the best approach to estimate the number of unique
# terms in the original dictionary is to count the number of active columns in
# the encoded feature matrix. For such a purpose we define the following
# function:

import numpy as np


def n_nonzero_columns(X):
    """Number of columns with at least one non-zero value in a CSR matrix.

    This is useful to count the number of feature columns that are effectively
    active when using the FeatureHasher.
    """
    return len(np.unique(X.nonzero()[1]))


# %%
# The default number of features for the
# :class:`~sklearn.feature_extraction.FeatureHasher` is 2**20. Here we set
# `n_features = 2**18` to illustrate hash collisions.
#
# **FeatureHasher on frequency dictionaries**

from sklearn.feature_extraction import FeatureHasher

t0 = time()
hasher = FeatureHasher(n_features=2**18)
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(
    hasher.__class__.__name__ + "\non freq dicts"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {n_nonzero_columns(X)} unique tokens")

# %%
# The number of unique tokens when using the
# :class:`~sklearn.feature_extraction.FeatureHasher` is lower than the number
# obtained using the :class:`~sklearn.feature_extraction.DictVectorizer`.
# This is due to hash collisions.
#
# The number of collisions can be reduced by increasing the feature space.
# Notice that the speed of the vectorizer does not change significantly when
# setting a large number of features, though it results in larger coefficient
# dimensions and therefore requires more memory to store them, even if a
# majority of them is inactive.

t0 = time()
hasher = FeatureHasher(n_features=2**22)
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0

print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {n_nonzero_columns(X)} unique tokens")

# %%
# We confirm that the number of unique tokens gets closer to the number of
# unique terms found by the
# :class:`~sklearn.feature_extraction.DictVectorizer`.
#
# **FeatureHasher on raw tokens**
#
# Alternatively, one can set `input_type="string"` in the
# :class:`~sklearn.feature_extraction.FeatureHasher` to vectorize the strings
# output directly by the customized `tokenize` function. This is equivalent to
# passing a dictionary with an implied frequency of 1 for each feature name.

t0 = time()
hasher = FeatureHasher(n_features=2**18, input_type="string")
X = hasher.transform(tokenize(d) for d in raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(
    hasher.__class__.__name__ + "\non raw tokens"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {n_nonzero_columns(X)} unique tokens")

# %%
# We now plot the speed of the above methods for vectorizing.

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 6))

y_pos = np.arange(len(dict_count_vectorizers["vectorizer"]))
ax.barh(y_pos, dict_count_vectorizers["speed"], align="center")
ax.set_yticks(y_pos)
ax.set_yticklabels(dict_count_vectorizers["vectorizer"])
ax.invert_yaxis()
_ = ax.set_xlabel("speed (MB/s)")

# %%
# In both cases :class:`~sklearn.feature_extraction.FeatureHasher` is
# approximately twice as fast as
# :class:`~sklearn.feature_extraction.DictVectorizer`. This is handy when
# dealing with large amounts of data, with the downside of losing the
# invertibility of the transformation, which in turn makes the interpretation
# of a model a more complex task.
#
# The `FeatureHasher` with `input_type="string"` is slightly faster than the
# variant that works on frequency dicts because it does not count repeated
# tokens: each token is implicitly counted once, even if it was repeated.
# Depending on the downstream machine learning task, this may or may not be a
# limitation.
#
# Comparison with special purpose text vectorizers
# ------------------------------------------------
#
# :class:`~sklearn.feature_extraction.text.CountVectorizer` accepts raw data
# as it internally implements tokenization and occurrence counting. It is
# similar to the :class:`~sklearn.feature_extraction.DictVectorizer` when used
# along with the customized function `token_freqs` as done in the previous
# section, the difference being that
# :class:`~sklearn.feature_extraction.text.CountVectorizer` is more flexible.
# In particular it accepts various regex patterns through the `token_pattern`
# parameter.
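
# %%
# For instance (a small illustration on a toy document, not part of the
# benchmark), the default pattern keeps only tokens made of two or more word
# characters, while a custom pattern can also keep single-character tokens
# such as "a":

from sklearn.feature_extraction.text import CountVectorizer

toy_doc = ["This is a simple example, isn't it?"]
print(CountVectorizer().fit(toy_doc).get_feature_names_out())
print(
    CountVectorizer(token_pattern=r"(?u)\b\w+\b").fit(toy_doc).get_feature_names_out()
)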

# %%
# Back to the benchmark on the full newsgroups dataset:

t0 = time()
vectorizer = CountVectorizer()
vectorizer.fit_transform(raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")

# %%
# We see that using the
# :class:`~sklearn.feature_extraction.text.CountVectorizer` implementation is
# approximately twice as fast as using the
# :class:`~sklearn.feature_extraction.DictVectorizer` along with the simple
# function we defined for mapping the tokens. The reason is that
# :class:`~sklearn.feature_extraction.text.CountVectorizer` is optimized by
# reusing a compiled regular expression for the full training set instead of
# creating one per document as done in our naive `tokenize` function.
#
# Now we make a similar experiment with the
# :class:`~sklearn.feature_extraction.text.HashingVectorizer`, which is
# equivalent to combining the "hashing trick" implemented by the
# :class:`~sklearn.feature_extraction.FeatureHasher` class with the text
# preprocessing and tokenization of the
# :class:`~sklearn.feature_extraction.text.CountVectorizer`.

from sklearn.feature_extraction.text import HashingVectorizer

t0 = time()
vectorizer = HashingVectorizer(n_features=2**18)
vectorizer.fit_transform(raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")

# %%
# We can observe that this is the fastest text tokenization strategy so far,
# assuming that the downstream machine learning task can tolerate a few
# collisions.
#
# TfidfVectorizer
# ---------------
#
# In a large text corpus, some words appear with higher frequency (e.g. "the",
# "a", "is" in English) and do not carry meaningful information about the
# actual contents of a document. If we were to feed the word count data
# directly to a classifier, those very common terms would shadow the
# frequencies of rarer yet more informative terms. In order to re-weight the
# count features into floating point values suitable for usage by a
# classifier, it is very common to use the tf-idf transform as implemented by
# the :class:`~sklearn.feature_extraction.text.TfidfTransformer`. TF stands
# for "term-frequency" while "tf-idf" means term-frequency times inverse
# document-frequency.
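
# %%
# As a minimal sketch of this re-weighting on a toy corpus (not the
# newsgroups data): "the" appears in every toy document, so its inverse
# document frequency, and hence its per-occurrence weight, is lower than that
# of the rarer, more informative terms.

from sklearn.feature_extraction.text import TfidfTransformer

toy_counts = CountVectorizer().fit_transform(
    ["the cat sat", "the dog ran", "the the the bird"]
)
TfidfTransformer().fit_transform(toy_counts).toarray().round(2)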

# %%
# We now benchmark the
# :class:`~sklearn.feature_extraction.text.TfidfVectorizer`, which is
# equivalent to combining the tokenization and occurrence counting of the
# :class:`~sklearn.feature_extraction.text.CountVectorizer` along with the
# normalizing and weighting from a
# :class:`~sklearn.feature_extraction.text.TfidfTransformer`.

from sklearn.feature_extraction.text import TfidfVectorizer

t0 = time()
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")

# %%
# Summary
# -------
# Let's conclude this notebook by summarizing all the recorded processing
# speeds in a single plot:

fig, ax = plt.subplots(figsize=(12, 6))

y_pos = np.arange(len(dict_count_vectorizers["vectorizer"]))
ax.barh(y_pos, dict_count_vectorizers["speed"], align="center")
ax.set_yticks(y_pos)
ax.set_yticklabels(dict_count_vectorizers["vectorizer"])
ax.invert_yaxis()
_ = ax.set_xlabel("speed (MB/s)")

# %%
# Notice from the plot that
# :class:`~sklearn.feature_extraction.text.TfidfVectorizer` is slightly
# slower than :class:`~sklearn.feature_extraction.text.CountVectorizer`
# because of the extra operations induced by the
# :class:`~sklearn.feature_extraction.text.TfidfTransformer`.
#
# Also notice that, by setting the number of features `n_features = 2**18`,
# the :class:`~sklearn.feature_extraction.text.HashingVectorizer` performs
# better than the :class:`~sklearn.feature_extraction.text.CountVectorizer`
# at the expense of the invertibility of the transformation due to hash
# collisions.
#
# We highlight that :class:`~sklearn.feature_extraction.text.CountVectorizer`
# and :class:`~sklearn.feature_extraction.text.HashingVectorizer` perform
# better than their equivalents
# :class:`~sklearn.feature_extraction.DictVectorizer` and
# :class:`~sklearn.feature_extraction.FeatureHasher` on manually tokenized
# documents, since the internal tokenization step of the former vectorizers
# compiles a regular expression once and then reuses it for all the documents.