sklearn/examples/text/plot_hashing_vs_dict_vector...

386 lines
15 KiB
Python
Raw Normal View History

2024-08-05 09:32:03 +02:00
"""
===========================================
FeatureHasher and DictVectorizer Comparison
===========================================
In this example we illustrate text vectorization, which is the process of
representing non-numerical input data (such as dictionaries or text documents)
as vectors of real numbers.
We first compare :func:`~sklearn.feature_extraction.FeatureHasher` and
:func:`~sklearn.feature_extraction.DictVectorizer` by using both methods to
vectorize text documents that are preprocessed (tokenized) with the help of a
custom Python function.
Later we introduce and analyze the text-specific vectorizers
:func:`~sklearn.feature_extraction.text.HashingVectorizer`,
:func:`~sklearn.feature_extraction.text.CountVectorizer` and
:func:`~sklearn.feature_extraction.text.TfidfVectorizer` that handle both the
tokenization and the assembling of the feature matrix within a single class.
The objective of the example is to demonstrate the usage of text vectorization
API and to compare their processing time. See the example scripts
:ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`
and :ref:`sphx_glr_auto_examples_text_plot_document_clustering.py` for actual
learning on text documents.
"""
# Author: Lars Buitinck
# Olivier Grisel <olivier.grisel@ensta.org>
# Arturo Amor <david-arturo.amor-quiroz@inria.fr>
# License: BSD 3 clause
# %%
# Load Data
# ---------
#
# We load data from :ref:`20newsgroups_dataset`, which comprises around
# 18000 newsgroups posts on 20 topics split in two subsets: one for training and
# one for testing. For the sake of simplicity and reducing the computational
# cost, we select a subset of 7 topics and use the training set only.
from sklearn.datasets import fetch_20newsgroups
categories = [
"alt.atheism",
"comp.graphics",
"comp.sys.ibm.pc.hardware",
"misc.forsale",
"rec.autos",
"sci.space",
"talk.religion.misc",
]
print("Loading 20 newsgroups training data")
raw_data, _ = fetch_20newsgroups(subset="train", categories=categories, return_X_y=True)
data_size_mb = sum(len(s.encode("utf-8")) for s in raw_data) / 1e6
print(f"{len(raw_data)} documents - {data_size_mb:.3f}MB")
# %%
# Define preprocessing functions
# ------------------------------
#
# A token may be a word, part of a word or anything comprised between spaces or
# symbols in a string. Here we define a function that extracts the tokens using
# a simple regular expression (regex) that matches Unicode word characters. This
# includes most characters that can be part of a word in any language, as well
# as numbers and the underscore:
import re
def tokenize(doc):
"""Extract tokens from doc.
This uses a simple regex that matches word characters to break strings
into tokens. For a more principled approach, see CountVectorizer or
TfidfVectorizer.
"""
return (tok.lower() for tok in re.findall(r"\w+", doc))
list(tokenize("This is a simple example, isn't it?"))
# %%
# We define an additional function that counts the (frequency of) occurrence of
# each token in a given document. It returns a frequency dictionary to be used
# by the vectorizers.
from collections import defaultdict
def token_freqs(doc):
"""Extract a dict mapping tokens from doc to their occurrences."""
freq = defaultdict(int)
for tok in tokenize(doc):
freq[tok] += 1
return freq
token_freqs("That is one example, but this is another one")
# %%
# Observe in particular that the repeated token `"is"` is counted twice for
# instance.
#
# Breaking a text document into word tokens, potentially losing the order
# information between the words in a sentence is often called a `Bag of Words
# representation <https://en.wikipedia.org/wiki/Bag-of-words_model>`_.
# %%
# DictVectorizer
# --------------
#
# First we benchmark the :func:`~sklearn.feature_extraction.DictVectorizer`,
# then we compare it to :func:`~sklearn.feature_extraction.FeatureHasher` as
# both of them receive dictionaries as input.
from time import time
from sklearn.feature_extraction import DictVectorizer
dict_count_vectorizers = defaultdict(list)
t0 = time()
vectorizer = DictVectorizer()
vectorizer.fit_transform(token_freqs(d) for d in raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(
vectorizer.__class__.__name__ + "\non freq dicts"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")
# %%
# The actual mapping from text token to column index is explicitly stored in
# the `.vocabulary_` attribute which is a potentially very large Python
# dictionary:
type(vectorizer.vocabulary_)
# %%
len(vectorizer.vocabulary_)
# %%
vectorizer.vocabulary_["example"]
# %%
# FeatureHasher
# -------------
#
# Dictionaries take up a large amount of storage space and grow in size as the
# training set grows. Instead of growing the vectors along with a dictionary,
# feature hashing builds a vector of pre-defined length by applying a hash
# function `h` to the features (e.g., tokens), then using the hash values
# directly as feature indices and updating the resulting vector at those
# indices. When the feature space is not large enough, hashing functions tend to
# map distinct values to the same hash code (hash collisions). As a result, it
# is impossible to determine what object generated any particular hash code.
#
# Because of the above it is impossible to recover the original tokens from the
# feature matrix and the best approach to estimate the number of unique terms in
# the original dictionary is to count the number of active columns in the
# encoded feature matrix. For such a purpose we define the following function:
import numpy as np
def n_nonzero_columns(X):
"""Number of columns with at least one non-zero value in a CSR matrix.
This is useful to count the number of features columns that are effectively
active when using the FeatureHasher.
"""
return len(np.unique(X.nonzero()[1]))
# %%
# The default number of features for the
# :func:`~sklearn.feature_extraction.FeatureHasher` is 2**20. Here we set
# `n_features = 2**18` to illustrate hash collisions.
#
# **FeatureHasher on frequency dictionaries**
from sklearn.feature_extraction import FeatureHasher
t0 = time()
hasher = FeatureHasher(n_features=2**18)
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(
hasher.__class__.__name__ + "\non freq dicts"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {n_nonzero_columns(X)} unique tokens")
# %%
# The number of unique tokens when using the
# :func:`~sklearn.feature_extraction.FeatureHasher` is lower than those obtained
# using the :func:`~sklearn.feature_extraction.DictVectorizer`. This is due to
# hash collisions.
#
# The number of collisions can be reduced by increasing the feature space.
# Notice that the speed of the vectorizer does not change significantly when
# setting a large number of features, though it causes larger coefficient
# dimensions and then requires more memory usage to store them, even if a
# majority of them is inactive.
t0 = time()
hasher = FeatureHasher(n_features=2**22)
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {n_nonzero_columns(X)} unique tokens")
# %%
# We confirm that the number of unique tokens gets closer to the number of
# unique terms found by the :func:`~sklearn.feature_extraction.DictVectorizer`.
#
# **FeatureHasher on raw tokens**
#
# Alternatively, one can set `input_type="string"` in the
# :func:`~sklearn.feature_extraction.FeatureHasher` to vectorize the strings
# output directly from the customized `tokenize` function. This is equivalent to
# passing a dictionary with an implied frequency of 1 for each feature name.
t0 = time()
hasher = FeatureHasher(n_features=2**18, input_type="string")
X = hasher.transform(tokenize(d) for d in raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(
hasher.__class__.__name__ + "\non raw tokens"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {n_nonzero_columns(X)} unique tokens")
# %%
# We now plot the speed of the above methods for vectorizing.
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(12, 6))
y_pos = np.arange(len(dict_count_vectorizers["vectorizer"]))
ax.barh(y_pos, dict_count_vectorizers["speed"], align="center")
ax.set_yticks(y_pos)
ax.set_yticklabels(dict_count_vectorizers["vectorizer"])
ax.invert_yaxis()
_ = ax.set_xlabel("speed (MB/s)")
# %%
# In both cases :func:`~sklearn.feature_extraction.FeatureHasher` is
# approximately twice as fast as
# :func:`~sklearn.feature_extraction.DictVectorizer`. This is handy when dealing
# with large amounts of data, with the downside of losing the invertibility of
# the transformation, which in turn makes the interpretation of a model a more
# complex task.
#
# The `FeatureHeasher` with `input_type="string"` is slightly faster than the
# variant that works on frequency dict because it does not count repeated
# tokens: each token is implicitly counted once, even if it was repeated.
# Depending on the downstream machine learning task, it can be a limitation or
# not.
#
# Comparison with special purpose text vectorizers
# ------------------------------------------------
#
# :func:`~sklearn.feature_extraction.text.CountVectorizer` accepts raw data as
# it internally implements tokenization and occurrence counting. It is similar
# to the :func:`~sklearn.feature_extraction.DictVectorizer` when used along with
# the customized function `token_freqs` as done in the previous section. The
# difference being that :func:`~sklearn.feature_extraction.text.CountVectorizer`
# is more flexible. In particular it accepts various regex patterns through the
# `token_pattern` parameter.
from sklearn.feature_extraction.text import CountVectorizer
t0 = time()
vectorizer = CountVectorizer()
vectorizer.fit_transform(raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")
# %%
# We see that using the :func:`~sklearn.feature_extraction.text.CountVectorizer`
# implementation is approximately twice as fast as using the
# :func:`~sklearn.feature_extraction.DictVectorizer` along with the simple
# function we defined for mapping the tokens. The reason is that
# :func:`~sklearn.feature_extraction.text.CountVectorizer` is optimized by
# reusing a compiled regular expression for the full training set instead of
# creating one per document as done in our naive tokenize function.
#
# Now we make a similar experiment with the
# :func:`~sklearn.feature_extraction.text.HashingVectorizer`, which is
# equivalent to combining the "hashing trick" implemented by the
# :func:`~sklearn.feature_extraction.FeatureHasher` class and the text
# preprocessing and tokenization of the
# :func:`~sklearn.feature_extraction.text.CountVectorizer`.
from sklearn.feature_extraction.text import HashingVectorizer
t0 = time()
vectorizer = HashingVectorizer(n_features=2**18)
vectorizer.fit_transform(raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
# %%
# We can observe that this is the fastest text tokenization strategy so far,
# assuming that the downstream machine learning task can tolerate a few
# collisions.
#
# TfidfVectorizer
# ---------------
#
# In a large text corpus, some words appear with higher frequency (e.g. "the",
# "a", "is" in English) and do not carry meaningful information about the actual
# contents of a document. If we were to feed the word count data directly to a
# classifier, those very common terms would shadow the frequencies of rarer yet
# more informative terms. In order to re-weight the count features into floating
# point values suitable for usage by a classifier it is very common to use the
# tf-idf transform as implemented by the
# :func:`~sklearn.feature_extraction.text.TfidfTransformer`. TF stands for
# "term-frequency" while "tf-idf" means term-frequency times inverse
# document-frequency.
#
# We now benchmark the :func:`~sklearn.feature_extraction.text.TfidfVectorizer`,
# which is equivalent to combining the tokenization and occurrence counting of
# the :func:`~sklearn.feature_extraction.text.CountVectorizer` along with the
# normalizing and weighting from a
# :func:`~sklearn.feature_extraction.text.TfidfTransformer`.
from sklearn.feature_extraction.text import TfidfVectorizer
t0 = time()
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(raw_data)
duration = time() - t0
dict_count_vectorizers["vectorizer"].append(vectorizer.__class__.__name__)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f} s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")
# %%
# Summary
# -------
# Let's conclude this notebook by summarizing all the recorded processing speeds
# in a single plot:
fig, ax = plt.subplots(figsize=(12, 6))
y_pos = np.arange(len(dict_count_vectorizers["vectorizer"]))
ax.barh(y_pos, dict_count_vectorizers["speed"], align="center")
ax.set_yticks(y_pos)
ax.set_yticklabels(dict_count_vectorizers["vectorizer"])
ax.invert_yaxis()
_ = ax.set_xlabel("speed (MB/s)")
# %%
# Notice from the plot that
# :func:`~sklearn.feature_extraction.text.TfidfVectorizer` is slightly slower
# than :func:`~sklearn.feature_extraction.text.CountVectorizer` because of the
# extra operation induced by the
# :func:`~sklearn.feature_extraction.text.TfidfTransformer`.
#
# Also notice that, by setting the number of features `n_features = 2**18`, the
# :func:`~sklearn.feature_extraction.text.HashingVectorizer` performs better
# than the :func:`~sklearn.feature_extraction.text.CountVectorizer` at the
# expense of inversibility of the transformation due to hash collisions.
#
# We highlight that :func:`~sklearn.feature_extraction.text.CountVectorizer` and
# :func:`~sklearn.feature_extraction.text.HashingVectorizer` perform better than
# their equivalent :func:`~sklearn.feature_extraction.DictVectorizer` and
# :func:`~sklearn.feature_extraction.FeatureHasher` on manually tokenized
# documents since the internal tokenization step of the former vectorizers
# compiles a regular expression once and then reuses it for all the documents.