sklearn/examples/text/plot_hashing_vs_dict_vector...

"""
===========================================
FeatureHasher and DictVectorizer Comparison
===========================================

In this example we illustrate text vectorization, which is the process of
representing non-numerical input data (such as dictionaries or text documents)
as vectors of real numbers.

We first compare :func:`~sklearn.feature_extraction.FeatureHasher` and
:func:`~sklearn.feature_extraction.DictVectorizer` by using both methods to
vectorize text documents that are preprocessed (tokenized) with the help of a
custom Python function.

Later we introduce and analyze the text-specific vectorizers
:func:`~sklearn.feature_extraction.text.HashingVectorizer`,
:func:`~sklearn.feature_extraction.text.CountVectorizer` and
:func:`~sklearn.feature_extraction.text.TfidfVectorizer` that handle both the
tokenization and the assembling of the feature matrix within a single class.

The objective of the example is to demonstrate the usage of text vectorization
API and to compare their processing time. See the example scripts
:ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`
and :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py` for actual
learning on text documents.

"""

# Author: Lars Buitinck
#         Olivier Grisel <olivier.grisel@ensta.org>
#         Arturo Amor <david-arturo.amor-quiroz@inria.fr>
# License: BSD 3 clause

# %%
# Load Data
# ---------
#
# We load data from :ref:`20newsgroups_dataset`, which comprises around
# 18000 newsgroups posts on 20 topics split in two subsets: one for training and
# one for testing. For the sake of simplicity and reducing the computational
# cost, we select a subset of 7 topics and use the training set only.

from sklearn.datasets import fetch_20newsgroups

categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.sys.ibm.pc.hardware",
    "misc.forsale",
    "rec.autos",
    "sci.space",
    "talk.religion.misc",
]

print("Loading 20 newsgroups training data")
raw_data, _ = fetch_20newsgroups(subset="train", categories=categories, return_X_y=True)
data_size_mb = sum(len(s.encode("utf-8")) for s in raw_data) / 1e6
print(f"{len(raw_data)} documents - {data_size_mb:.3f}MB")

# %%
# Define preprocessing functions
# ------------------------------
#
# A token may be a word, part of a word or anything comprised between spaces or
# symbols in a string. Here we define a function that extracts the tokens using
# a simple regular expression (regex) that matches Unicode word characters. This
# includes most characters that can be part of a word in any language, as well
# as numbers and the underscore:

import re


def tokenize(doc):
    """Extract tokens from doc.

    This uses a simple regex that matches word characters to break strings
    into tokens. For a more principled approach, see CountVectorizer or
    TfidfVectorizer.
    """
    return (tok.lower() for tok in re.findall(r"\w+", doc))


list(tokenize("This is a simple example, isn't it?"))

# %%
# We define an additional function that counts the (frequency of) occurrence of
# each token in a given document. It returns a frequency dictionary to be used
# by the vectorizers.

from collections import defaultdict


def token_freqs(doc):
    """Extract a dict mapping tokens from doc to their occurrences."""

    freq = defaultdict(int)
    for tok in tokenize(doc):
        freq[tok] += 1
    return freq


token_freqs("That is one example, but this is another one")

# %%
# Observe in particular that the repeated token `"is"` is counted twice for
# instance.
#
# Breaking a text document into word tokens, potentially losing the order
# information between the words in a sentence is often called a `Bag of Words
# representation <https://en.wikipedia.org/wiki/Bag-of-words_model>`_.

# %%
# DictVectorizer
# --------------
#
# First we benchmark the :func:`~sklearn.feature_extraction.DictVectorizer`,
# then we compare it to :func:`~sklearn.feature_extraction.FeatureHasher` as
# both of them receive dictionaries as input.

from time import time

from sklearn.feature_extraction import DictVectorizer

dict_count_vectorizers = defaultdict(list)

t0 = time()
vectorizer = DictVectorizer()
vectorizer.fit_transform(token_freqs(d) for d in raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(
    vectorizer.__class__.__name__ + "\non freq dicts"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")

# %%
# The actual mapping from text token to column index is explicitly stored in
# the `.vocabulary_` attribute which is a potentially very large Python
# dictionary:
type(vectorizer.vocabulary_)

# %%
len(vectorizer.vocabulary_)

# %%
vectorizer.vocabulary_["example"]

# %%
# FeatureHasher
# -------------
#
# Dictionaries take up a large amount of storage space and grow in size as the
# training set grows. Instead of growing the vectors along with a dictionary,
# feature hashing builds a vector of pre-defined length by applying a hash
# function `h` to the features (e.g., tokens), then using the hash values
# directly as feature indices and updating the resulting vector at those
# indices. When the feature space is not large enough, hashing functions tend to
# map distinct values to the same hash code (hash collisions). As a result, it
# is impossible to determine what object generated any particular hash code.
#
# Because of the above it is impossible to recover the original tokens from the
# feature matrix and the best approach to estimate the number of unique terms in
# the original dictionary is to count the number of active columns in the
# encoded feature matrix. For such a purpose we define the following function:

import numpy as np


def n_nonzero_columns(X):
    """Number of columns with at least one non-zero value in a CSR matrix.

    This is useful to count the number of features columns that are effectively
    active when using the FeatureHasher.
    """
    return len(np.unique(X.nonzero()[1]))


# %%
# The default number of features for the
# :func:`~sklearn.feature_extraction.FeatureHasher` is 2**20. Here we set
# `n_features = 2**18` to illustrate hash collisions.
#
# **FeatureHasher on frequency dictionaries**

from sklearn.feature_extraction import FeatureHasher

t0 = time()
hasher = FeatureHasher(n_features=2**18)
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(
    hasher.__class__.__name__ + "\non freq dicts"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {n_nonzero_columns(X)} unique tokens")

# %%
# The number of unique tokens when using the
# :func:`~sklearn.feature_extraction.FeatureHasher` is lower than those obtained
# using the :func:`~sklearn.feature_extraction.DictVectorizer`. This is due to
# hash collisions.
#
# The number of collisions can be reduced by increasing the feature space.
# Notice that the speed of the vectorizer does not change significantly when
# setting a large number of features, though it causes larger coefficient
# dimensions and then requires more memory usage to store them, even if a
# majority of them is inactive.

t0 = time()
hasher = FeatureHasher(n_features=2**22)
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0

print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {n_nonzero_columns(X)} unique tokens")

# %%
# We confirm that the number of unique tokens gets closer to the number of
# unique terms found by the :func:`~sklearn.feature_extraction.DictVectorizer`.
#
# **FeatureHasher on raw tokens**
#
# Alternatively, one can set `input_type="string"` in the
# :func:`~sklearn.feature_extraction.FeatureHasher` to vectorize the strings
# output directly from the customized `tokenize` function. This is equivalent to
# passing a dictionary with an implied frequency of 1 for each feature name.

t0 = time()
hasher = FeatureHasher(n_features=2**18, input_type="string")
X = hasher.transform(tokenize(d) for d in raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(
    hasher.__class__.__name__ + "\non raw tokens"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {n_nonzero_columns(X)} unique tokens")

# %%
# We now plot the speed of the above methods for vectorizing.

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 6))

y_pos = np.arange(len(dict_count_vectorizers["vectorizer"]))
ax.barh(y_pos, dict_count_vectorizers["speed"], align="center")
ax.set_yticks(y_pos)
ax.set_yticklabels(dict_count_vectorizers["vectorizer"])
ax.invert_yaxis()
_ = ax.set_xlabel("speed (MB/s)")

# %%
# In both cases :func:`~sklearn.feature_extraction.FeatureHasher` is
# approximately twice as fast as
# :func:`~sklearn.feature_extraction.DictVectorizer`. This is handy when dealing
# with large amounts of data, with the downside of losing the invertibility of
# the transformation, which in turn makes the interpretation of a model a more
# complex task.
#
# The `FeatureHeasher` with `input_type="string"` is slightly faster than the
# variant that works on frequency dict because it does not count repeated
# tokens: each token is implicitly counted once, even if it was repeated.
# Depending on the downstream machine learning task, it can be a limitation or
# not.
#
# Comparison with special purpose text vectorizers
# ------------------------------------------------
#
# :func:`~sklearn.feature_extraction.text.CountVectorizer` accepts raw data as
# it internally implements tokenization and occurrence counting. It is similar
# to the :func:`~sklearn.feature_extraction.DictVectorizer` when used along with
# the customized function `token_freqs` as done in the previous section. The
# difference being that :func:`~sklearn.feature_extraction.text.CountVectorizer`
# is more flexible. In particular it accepts various regex patterns through the
# `token_pattern` parameter.

from sklearn.feature_extraction.text import CountVectorizer

t0 = time()
vectorizer = CountVectorizer()
vectorizer.fit_transform(raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")

# %%
# We see that using the :func:`~sklearn.feature_extraction.text.CountVectorizer`
# implementation is approximately twice as fast as using the
# :func:`~sklearn.feature_extraction.DictVectorizer` along with the simple
# function we defined for mapping the tokens. The reason is that
# :func:`~sklearn.feature_extraction.text.CountVectorizer` is optimized by
# reusing a compiled regular expression for the full training set instead of
# creating one per document as done in our naive tokenize function.
#
# Now we make a similar experiment with the
# :func:`~sklearn.feature_extraction.text.HashingVectorizer`, which is
# equivalent to combining the "hashing trick" implemented by the
# :func:`~sklearn.feature_extraction.FeatureHasher` class and the text
# preprocessing and tokenization of the
# :func:`~sklearn.feature_extraction.text.CountVectorizer`.

from sklearn.feature_extraction.text import HashingVectorizer

t0 = time()
vectorizer = HashingVectorizer(n_features=2**18)
vectorizer.fit_transform(raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")

# %%
# We can observe that this is the fastest text tokenization strategy so far,
# assuming that the downstream machine learning task can tolerate a few
# collisions.
#
# TfidfVectorizer
# ---------------
#
# In a large text corpus, some words appear with higher frequency (e.g. "the",
# "a", "is" in English) and do not carry meaningful information about the actual
# contents of a document. If we were to feed the word count data directly to a
# classifier, those very common terms would shadow the frequencies of rarer yet
# more informative terms. In order to re-weight the count features into floating
# point values suitable for usage by a classifier it is very common to use the
# tf-idf transform as implemented by the
# :func:`~sklearn.feature_extraction.text.TfidfTransformer`. TF stands for
# "term-frequency" while "tf-idf" means term-frequency times inverse
# document-frequency.
#
# We now benchmark the :func:`~sklearn.feature_extraction.text.TfidfVectorizer`,
# which is equivalent to combining the tokenization and occurrence counting of
# the :func:`~sklearn.feature_extraction.text.CountVectorizer` along with the
# normalizing and weighting from a
# :func:`~sklearn.feature_extraction.text.TfidfTransformer`.

from sklearn.feature_extraction.text import TfidfVectorizer

t0 = time()
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")

# %%
# Summary
# -------
# Let's conclude this notebook by summarizing all the recorded processing speeds
# in a single plot:

fig, ax = plt.subplots(figsize=(12, 6))

y_pos = np.arange(len(dict_count_vectorizers["vectorizer"]))
ax.barh(y_pos, dict_count_vectorizers["speed"], align="center")
ax.set_yticks(y_pos)
ax.set_yticklabels(dict_count_vectorizers["vectorizer"])
ax.invert_yaxis()
_ = ax.set_xlabel("speed (MB/s)")

# %%
# Notice from the plot that
# :func:`~sklearn.feature_extraction.text.TfidfVectorizer` is slightly slower
# than :func:`~sklearn.feature_extraction.text.CountVectorizer` because of the
# extra operation induced by the
# :func:`~sklearn.feature_extraction.text.TfidfTransformer`.
#
# Also notice that, by setting the number of features `n_features = 2**18`, the
# :func:`~sklearn.feature_extraction.text.HashingVectorizer` performs better
# than the :func:`~sklearn.feature_extraction.text.CountVectorizer` at the
# expense of inversibility of the transformation due to hash collisions.
#
# We highlight that :func:`~sklearn.feature_extraction.text.CountVectorizer` and
# :func:`~sklearn.feature_extraction.text.HashingVectorizer` perform better than
# their equivalent :func:`~sklearn.feature_extraction.DictVectorizer` and
# :func:`~sklearn.feature_extraction.FeatureHasher` on manually tokenized
# documents since the internal tokenization step of the former vectorizers
# compiles a regular expression once and then reuses it for all the documents.
first commit 2024-08-05 09:32:03 +02:00			`"""`
			`===========================================`
			`FeatureHasher and DictVectorizer Comparison`
			`===========================================`

			`In this example we illustrate text vectorization, which is the process of`
			`representing non-numerical input data (such as dictionaries or text documents)`
			`as vectors of real numbers.`

			We first compare :func:`~sklearn.feature_extraction.FeatureHasher` and
			:func:`~sklearn.feature_extraction.DictVectorizer` by using both methods to
			`vectorize text documents that are preprocessed (tokenized) with the help of a`
			`custom Python function.`

			`Later we introduce and analyze the text-specific vectorizers`
			:func:`~sklearn.feature_extraction.text.HashingVectorizer`,
			:func:`~sklearn.feature_extraction.text.CountVectorizer` and
			:func:`~sklearn.feature_extraction.text.TfidfVectorizer` that handle both the
			`tokenization and the assembling of the feature matrix within a single class.`

			`The objective of the example is to demonstrate the usage of text vectorization`
			`API and to compare their processing time. See the example scripts`
			:ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`
			and :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py` for actual
			`learning on text documents.`

			`"""`

			`# Author: Lars Buitinck`
			`# Olivier Grisel <olivier.grisel@ensta.org>`
			`# Arturo Amor <david-arturo.amor-quiroz@inria.fr>`
			`# License: BSD 3 clause`

			`# %%`
			`# Load Data`
			`# ---------`
			`#`
			# We load data from :ref:`20newsgroups_dataset`, which comprises around
			`# 18000 newsgroups posts on 20 topics split in two subsets: one for training and`
			`# one for testing. For the sake of simplicity and reducing the computational`
			`# cost, we select a subset of 7 topics and use the training set only.`

			`from sklearn.datasets import fetch_20newsgroups`

			`categories = [`
			`"alt.atheism",`
			`"comp.graphics",`
			`"comp.sys.ibm.pc.hardware",`
			`"misc.forsale",`
			`"rec.autos",`
			`"sci.space",`
			`"talk.religion.misc",`
			`]`

			`print("Loading 20 newsgroups training data")`
			`raw_data, _ = fetch_20newsgroups(subset="train", categories=categories, return_X_y=True)`
			`data_size_mb = sum(len(s.encode("utf-8")) for s in raw_data) / 1e6`
			`print(f"{len(raw_data)} documents - {data_size_mb:.3f}MB")`

			`# %%`
			`# Define preprocessing functions`
			`# ------------------------------`
			`#`
			`# A token may be a word, part of a word or anything comprised between spaces or`
			`# symbols in a string. Here we define a function that extracts the tokens using`
			`# a simple regular expression (regex) that matches Unicode word characters. This`
			`# includes most characters that can be part of a word in any language, as well`
			`# as numbers and the underscore:`

			`import re`


			`def tokenize(doc):`
			`"""Extract tokens from doc.`

			`This uses a simple regex that matches word characters to break strings`
			`into tokens. For a more principled approach, see CountVectorizer or`
			`TfidfVectorizer.`
			`"""`
			`return (tok.lower() for tok in re.findall(r"\w+", doc))`


			`list(tokenize("This is a simple example, isn't it?"))`

			`# %%`
			`# We define an additional function that counts the (frequency of) occurrence of`
			`# each token in a given document. It returns a frequency dictionary to be used`
			`# by the vectorizers.`

			`from collections import defaultdict`


			`def token_freqs(doc):`
			`"""Extract a dict mapping tokens from doc to their occurrences."""`

			`freq = defaultdict(int)`
			`for tok in tokenize(doc):`
			`freq[tok] += 1`
			`return freq`


			`token_freqs("That is one example, but this is another one")`

			`# %%`
			# Observe in particular that the repeated token `"is"` is counted twice for
			`# instance.`
			`#`
			`# Breaking a text document into word tokens, potentially losing the order`
			# information between the words in a sentence is often called a `Bag of Words
			# representation <https://en.wikipedia.org/wiki/Bag-of-words_model>`_.

			`# %%`
			`# DictVectorizer`
			`# --------------`
			`#`
			# First we benchmark the :func:`~sklearn.feature_extraction.DictVectorizer`,
			# then we compare it to :func:`~sklearn.feature_extraction.FeatureHasher` as
			`# both of them receive dictionaries as input.`

			`from time import time`

			`from sklearn.feature_extraction import DictVectorizer`

			`dict_count_vectorizers = defaultdict(list)`

			`t0 = time()`
			`vectorizer = DictVectorizer()`
			`vectorizer.fit_transform(token_freqs(d) for d in raw_data)`
			`duration = time() - t0`
			`dict_count_vectorizers["vectorizer"].append(`
			`vectorizer.__class__.__name__ + "\non freq dicts"`
			`)`
			`dict_count_vectorizers["speed"].append(data_size_mb / duration)`
			`print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")`
			`print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")`

			`# %%`
			`# The actual mapping from text token to column index is explicitly stored in`
			# the `.vocabulary_` attribute which is a potentially very large Python
			`# dictionary:`
			`type(vectorizer.vocabulary_)`

			`# %%`
			`len(vectorizer.vocabulary_)`

			`# %%`
			`vectorizer.vocabulary_["example"]`

			`# %%`
			`# FeatureHasher`
			`# -------------`
			`#`
			`# Dictionaries take up a large amount of storage space and grow in size as the`
			`# training set grows. Instead of growing the vectors along with a dictionary,`
			`# feature hashing builds a vector of pre-defined length by applying a hash`
			# function `h` to the features (e.g., tokens), then using the hash values
			`# directly as feature indices and updating the resulting vector at those`
			`# indices. When the feature space is not large enough, hashing functions tend to`
			`# map distinct values to the same hash code (hash collisions). As a result, it`
			`# is impossible to determine what object generated any particular hash code.`
			`#`
			`# Because of the above it is impossible to recover the original tokens from the`
			`# feature matrix and the best approach to estimate the number of unique terms in`
			`# the original dictionary is to count the number of active columns in the`
			`# encoded feature matrix. For such a purpose we define the following function:`

			`import numpy as np`


			`def n_nonzero_columns(X):`
			`"""Number of columns with at least one non-zero value in a CSR matrix.`

			`This is useful to count the number of features columns that are effectively`
			`active when using the FeatureHasher.`
			`"""`
			`return len(np.unique(X.nonzero()[1]))`


			`# %%`
			`# The default number of features for the`
			# :func:`~sklearn.feature_extraction.FeatureHasher` is 2**20. Here we set
			# `n_features = 2**18` to illustrate hash collisions.
			`#`
			`# FeatureHasher on frequency dictionaries`

			`from sklearn.feature_extraction import FeatureHasher`

			`t0 = time()`
			`hasher = FeatureHasher(n_features=2**18)`
			`X = hasher.transform(token_freqs(d) for d in raw_data)`
			`duration = time() - t0`
			`dict_count_vectorizers["vectorizer"].append(`
			`hasher.__class__.__name__ + "\non freq dicts"`
			`)`
			`dict_count_vectorizers["speed"].append(data_size_mb / duration)`
			`print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")`
			`print(f"Found {n_nonzero_columns(X)} unique tokens")`

			`# %%`
			`# The number of unique tokens when using the`
			# :func:`~sklearn.feature_extraction.FeatureHasher` is lower than those obtained
			# using the :func:`~sklearn.feature_extraction.DictVectorizer`. This is due to
			`# hash collisions.`
			`#`
			`# The number of collisions can be reduced by increasing the feature space.`
			`# Notice that the speed of the vectorizer does not change significantly when`
			`# setting a large number of features, though it causes larger coefficient`
			`# dimensions and then requires more memory usage to store them, even if a`
			`# majority of them is inactive.`

			`t0 = time()`
			`hasher = FeatureHasher(n_features=2**22)`
			`X = hasher.transform(token_freqs(d) for d in raw_data)`
			`duration = time() - t0`

			`print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")`
			`print(f"Found {n_nonzero_columns(X)} unique tokens")`

			`# %%`
			`# We confirm that the number of unique tokens gets closer to the number of`
			# unique terms found by the :func:`~sklearn.feature_extraction.DictVectorizer`.
			`#`
			`# FeatureHasher on raw tokens`
			`#`
			# Alternatively, one can set `input_type="string"` in the
			# :func:`~sklearn.feature_extraction.FeatureHasher` to vectorize the strings
			# output directly from the customized `tokenize` function. This is equivalent to
			`# passing a dictionary with an implied frequency of 1 for each feature name.`

			`t0 = time()`
			`hasher = FeatureHasher(n_features=2**18, input_type="string")`
			`X = hasher.transform(tokenize(d) for d in raw_data)`
			`duration = time() - t0`
			`dict_count_vectorizers["vectorizer"].append(`
			`hasher.__class__.__name__ + "\non raw tokens"`
			`)`
			`dict_count_vectorizers["speed"].append(data_size_mb / duration)`
			`print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")`
			`print(f"Found {n_nonzero_columns(X)} unique tokens")`

			`# %%`
			`# We now plot the speed of the above methods for vectorizing.`

			`import matplotlib.pyplot as plt`

			`fig, ax = plt.subplots(figsize=(12, 6))`

			`y_pos = np.arange(len(dict_count_vectorizers["vectorizer"]))`
			`ax.barh(y_pos, dict_count_vectorizers["speed"], align="center")`
			`ax.set_yticks(y_pos)`
			`ax.set_yticklabels(dict_count_vectorizers["vectorizer"])`
			`ax.invert_yaxis()`
			`_ = ax.set_xlabel("speed (MB/s)")`

			`# %%`
			# In both cases :func:`~sklearn.feature_extraction.FeatureHasher` is
			`# approximately twice as fast as`
			# :func:`~sklearn.feature_extraction.DictVectorizer`. This is handy when dealing
			`# with large amounts of data, with the downside of losing the invertibility of`
			`# the transformation, which in turn makes the interpretation of a model a more`
			`# complex task.`
			`#`
			# The `FeatureHeasher` with `input_type="string"` is slightly faster than the
			`# variant that works on frequency dict because it does not count repeated`
			`# tokens: each token is implicitly counted once, even if it was repeated.`
			`# Depending on the downstream machine learning task, it can be a limitation or`
			`# not.`
			`#`
			`# Comparison with special purpose text vectorizers`
			`# ------------------------------------------------`
			`#`
			# :func:`~sklearn.feature_extraction.text.CountVectorizer` accepts raw data as
			`# it internally implements tokenization and occurrence counting. It is similar`
			# to the :func:`~sklearn.feature_extraction.DictVectorizer` when used along with
			# the customized function `token_freqs` as done in the previous section. The
			# difference being that :func:`~sklearn.feature_extraction.text.CountVectorizer`
			`# is more flexible. In particular it accepts various regex patterns through the`
			# `token_pattern` parameter.

			`from sklearn.feature_extraction.text import CountVectorizer`

			`t0 = time()`
			`vectorizer = CountVectorizer()`
			`vectorizer.fit_transform(raw_data)`
			`duration = time() - t0`
			`dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)`
			`dict_count_vectorizers["speed"].append(data_size_mb / duration)`
			`print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")`
			`print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")`

			`# %%`
			# We see that using the :func:`~sklearn.feature_extraction.text.CountVectorizer`
			`# implementation is approximately twice as fast as using the`
			# :func:`~sklearn.feature_extraction.DictVectorizer` along with the simple
			`# function we defined for mapping the tokens. The reason is that`
			# :func:`~sklearn.feature_extraction.text.CountVectorizer` is optimized by
			`# reusing a compiled regular expression for the full training set instead of`
			`# creating one per document as done in our naive tokenize function.`
			`#`
			`# Now we make a similar experiment with the`
			# :func:`~sklearn.feature_extraction.text.HashingVectorizer`, which is
			`# equivalent to combining the "hashing trick" implemented by the`
			# :func:`~sklearn.feature_extraction.FeatureHasher` class and the text
			`# preprocessing and tokenization of the`
			# :func:`~sklearn.feature_extraction.text.CountVectorizer`.

			`from sklearn.feature_extraction.text import HashingVectorizer`

			`t0 = time()`
			`vectorizer = HashingVectorizer(n_features=2**18)`
			`vectorizer.fit_transform(raw_data)`
			`duration = time() - t0`
			`dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)`
			`dict_count_vectorizers["speed"].append(data_size_mb / duration)`
			`print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")`

			`# %%`
			`# We can observe that this is the fastest text tokenization strategy so far,`
			`# assuming that the downstream machine learning task can tolerate a few`
			`# collisions.`
			`#`
			`# TfidfVectorizer`
			`# ---------------`
			`#`
			`# In a large text corpus, some words appear with higher frequency (e.g. "the",`
			`# "a", "is" in English) and do not carry meaningful information about the actual`
			`# contents of a document. If we were to feed the word count data directly to a`
			`# classifier, those very common terms would shadow the frequencies of rarer yet`
			`# more informative terms. In order to re-weight the count features into floating`
			`# point values suitable for usage by a classifier it is very common to use the`
			`# tf-idf transform as implemented by the`
			# :func:`~sklearn.feature_extraction.text.TfidfTransformer`. TF stands for
			`# "term-frequency" while "tf-idf" means term-frequency times inverse`
			`# document-frequency.`
			`#`
			# We now benchmark the :func:`~sklearn.feature_extraction.text.TfidfVectorizer`,
			`# which is equivalent to combining the tokenization and occurrence counting of`
			# the :func:`~sklearn.feature_extraction.text.CountVectorizer` along with the
			`# normalizing and weighting from a`
			# :func:`~sklearn.feature_extraction.text.TfidfTransformer`.

			`from sklearn.feature_extraction.text import TfidfVectorizer`

			`t0 = time()`
			`vectorizer = TfidfVectorizer()`
			`vectorizer.fit_transform(raw_data)`
			`duration = time() - t0`
			`dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)`
			`dict_count_vectorizers["speed"].append(data_size_mb / duration)`
			`print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")`
			`print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")`

			`# %%`
			`# Summary`
			`# -------`
			`# Let's conclude this notebook by summarizing all the recorded processing speeds`
			`# in a single plot:`

			`fig, ax = plt.subplots(figsize=(12, 6))`

			`y_pos = np.arange(len(dict_count_vectorizers["vectorizer"]))`
			`ax.barh(y_pos, dict_count_vectorizers["speed"], align="center")`
			`ax.set_yticks(y_pos)`
			`ax.set_yticklabels(dict_count_vectorizers["vectorizer"])`
			`ax.invert_yaxis()`
			`_ = ax.set_xlabel("speed (MB/s)")`

			`# %%`
			`# Notice from the plot that`
			# :func:`~sklearn.feature_extraction.text.TfidfVectorizer` is slightly slower
			# than :func:`~sklearn.feature_extraction.text.CountVectorizer` because of the
			`# extra operation induced by the`
			# :func:`~sklearn.feature_extraction.text.TfidfTransformer`.
			`#`
			# Also notice that, by setting the number of features `n_features = 2**18`, the
			# :func:`~sklearn.feature_extraction.text.HashingVectorizer` performs better
			# than the :func:`~sklearn.feature_extraction.text.CountVectorizer` at the
			`# expense of inversibility of the transformation due to hash collisions.`
			`#`
			# We highlight that :func:`~sklearn.feature_extraction.text.CountVectorizer` and
			# :func:`~sklearn.feature_extraction.text.HashingVectorizer` perform better than
			# their equivalent :func:`~sklearn.feature_extraction.DictVectorizer` and
			# :func:`~sklearn.feature_extraction.FeatureHasher` on manually tokenized
			`# documents since the internal tokenization step of the former vectorizers`
			`# compiles a regular expression once and then reuses it for all the documents.`