# sklearn/examples/compose/plot_column_transformer.py

"""
==================================================
Column Transformer with Heterogeneous Data Sources
==================================================

Datasets can often contain components that require different feature
extraction and processing pipelines. This scenario might occur when:

1. your dataset consists of heterogeneous data types (e.g. raster images and
   text captions),
2. your dataset is stored in a :class:`pandas.DataFrame` and different columns
   require different processing pipelines.

This example demonstrates how to use
:class:`~sklearn.compose.ColumnTransformer` on a dataset containing
different types of features. The choice of features is not particularly
helpful, but serves to illustrate the technique.

"""

# Author: Matt Terry <matt.terry@gmail.com>
#
# License: BSD 3 clause

import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVC

##############################################################################
# 20 newsgroups dataset
# ---------------------
#
# We will use the :ref:`20 newsgroups dataset <20newsgroups_dataset>`, which
# comprises posts from newsgroups on 20 topics. This dataset is split
# into train and test subsets based on messages posted before and after
# a specific date. We will only use posts from 2 categories to speed up running
# time.

# Restrict the corpus to two topics to keep the running time short.
categories = ["sci.med", "sci.space"]

# The train and test subsets are fetched (in that order) with identical
# options; only ``subset`` differs between the two calls.
(X_train, y_train), (X_test, y_test) = (
    fetch_20newsgroups(
        subset=subset,
        categories=categories,
        remove=("footers", "quotes"),
        random_state=1,
        return_X_y=True,
    )
    for subset in ("train", "test")
)

##############################################################################
# Each sample comprises meta information about that post, such as the subject,
# and the body of the news post.

# Display the first training post as loaded, before any feature extraction.
print(X_train[0])

##############################################################################
# Creating transformers
# ---------------------
#
# First, we would like a transformer that extracts the subject and
# body of each post. Since this is a stateless transformation (does not
# require state information from training data), we can define a function that
# performs the data transformation then use
# :class:`~sklearn.preprocessing.FunctionTransformer` to create a scikit-learn
# transformer.


def subject_body_extractor(posts):
    """Split each post into its subject line and its body.

    Everything before the first blank line (``"\\n\\n"``) is treated as the
    header block, everything after it as the body. The subject is whatever
    follows ``"Subject:"`` on the first header line starting with that prefix
    (leading whitespace is kept); it is the empty string when no such line
    exists.

    Parameters
    ----------
    posts : sized iterable of str
        Raw post texts.

    Returns
    -------
    features : ndarray of shape (len(posts), 2), dtype=object
        Column 0 holds the subject, column 1 holds the body.
    """
    marker = "Subject:"
    extracted = np.empty(shape=(len(posts), 2), dtype=object)

    for row, post in enumerate(posts):
        # the separator itself is not needed, only what surrounds it
        header_block, _, body_text = post.partition("\n\n")

        # first header line carrying the marker wins; default to "" otherwise
        subject = next(
            (
                header[len(marker) :]
                for header in header_block.split("\n")
                if header.startswith(marker)
            ),
            "",
        )

        extracted[row, 0] = subject
        extracted[row, 1] = body_text

    return extracted


# Wrap the stateless function so it can be used as a pipeline step.
subject_body_transformer = FunctionTransformer(func=subject_body_extractor)

##############################################################################
# We will also create a transformer that extracts the
# length of the text and the number of sentences.


def text_stats(posts):
    """Compute simple per-post statistics.

    Parameters
    ----------
    posts : iterable of str
        Post texts.

    Returns
    -------
    stats : list of dict
        One dict per post with keys ``"length"`` (number of characters) and
        ``"num_sentences"`` (number of ``"."`` characters in the text).
    """
    stats = []
    for post in posts:
        stats.append({"length": len(post), "num_sentences": post.count(".")})
    return stats


# Wrap the stateless function so it can be used as a pipeline step.
text_stats_transformer = FunctionTransformer(func=text_stats)

##############################################################################
# Classification pipeline
# -----------------------
#
# The pipeline below extracts the subject and body from each post using
# ``subject_body_transformer``, producing a (n_samples, 2) array. This array is
# then used to compute standard bag-of-words features for the subject and body
# as well as text length and number of sentences on the body, using
# ``ColumnTransformer``. We combine them, with weights, then train a
# classifier on the combined set of features.

# bag-of-words with decomposition for the body (col 1)
body_bow_pipeline = Pipeline(
    [
        ("tfidf", TfidfVectorizer()),
        ("best", PCA(n_components=50, svd_solver="arpack")),
    ]
)

# text stats pulled from the post's body (col 1)
body_stats_pipeline = Pipeline(
    [
        # returns a list of dicts
        ("stats", text_stats_transformer),
        # list of dicts -> feature matrix
        ("vect", DictVectorizer()),
    ]
)

# Combine the subject and body features, each scaled by its weight
column_features = ColumnTransformer(
    [
        # bag-of-words for subject (col 0)
        ("subject", TfidfVectorizer(min_df=50), 0),
        ("body_bow", body_bow_pipeline, 1),
        ("body_stats", body_stats_pipeline, 1),
    ],
    transformer_weights={
        "subject": 0.8,
        "body_bow": 0.5,
        "body_stats": 1.0,
    },
)

pipeline = Pipeline(
    [
        # Extract subject & body
        ("subjectbody", subject_body_transformer),
        # Per-column feature extraction, concatenated into one matrix
        ("union", column_features),
        # Use a SVC classifier on the combined features
        ("svc", LinearSVC(dual=False)),
    ],
    verbose=True,
)

##############################################################################
# Finally, we fit our pipeline on the training data and use it to predict
# topics for ``X_test``. Performance metrics of our pipeline are then printed.

# ``fit`` returns the fitted pipeline, so prediction can be chained onto it.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
print(f"Classification report:\n\n{classification_report(y_test, y_pred)}")
first commit 2024-08-05 09:32:03 +02:00			`"""`
			`==================================================`
			`Column Transformer with Heterogeneous Data Sources`
			`==================================================`

			`Datasets can often contain components that require different feature`
			`extraction and processing pipelines. This scenario might occur when:`

			`1. your dataset consists of heterogeneous data types (e.g. raster images and`
			`text captions),`
			2. your dataset is stored in a :class:`pandas.DataFrame` and different columns
			`require different processing pipelines.`

			`This example demonstrates how to use`
			:class:`~sklearn.compose.ColumnTransformer` on a dataset containing
			`different types of features. The choice of features is not particularly`
			`helpful, but serves to illustrate the technique.`

			`"""`

			`# Author: Matt Terry <matt.terry@gmail.com>`
			`#`
			`# License: BSD 3 clause`

			`import numpy as np`

			`from sklearn.compose import ColumnTransformer`
			`from sklearn.datasets import fetch_20newsgroups`
			`from sklearn.decomposition import PCA`
			`from sklearn.feature_extraction import DictVectorizer`
			`from sklearn.feature_extraction.text import TfidfVectorizer`
			`from sklearn.metrics import classification_report`
			`from sklearn.pipeline import Pipeline`
			`from sklearn.preprocessing import FunctionTransformer`
			`from sklearn.svm import LinearSVC`

			`##############################################################################`
			`# 20 newsgroups dataset`
			`# ---------------------`
			`#`
			# We will use the :ref:`20 newsgroups dataset <20newsgroups_dataset>`, which
			`# comprises posts from newsgroups on 20 topics. This dataset is split`
			`# into train and test subsets based on messages posted before and after`
			`# a specific date. We will only use posts from 2 categories to speed up running`
			`# time.`

			`categories = ["sci.med", "sci.space"]`
			`X_train, y_train = fetch_20newsgroups(`
			`random_state=1,`
			`subset="train",`
			`categories=categories,`
			`remove=("footers", "quotes"),`
			`return_X_y=True,`
			`)`
			`X_test, y_test = fetch_20newsgroups(`
			`random_state=1,`
			`subset="test",`
			`categories=categories,`
			`remove=("footers", "quotes"),`
			`return_X_y=True,`
			`)`

			`##############################################################################`
			`# Each feature comprises meta information about that post, such as the subject,`
			`# and the body of the news post.`

			`print(X_train[0])`

			`##############################################################################`
			`# Creating transformers`
			`# ---------------------`
			`#`
			`# First, we would like a transformer that extracts the subject and`
			`# body of each post. Since this is a stateless transformation (does not`
			`# require state information from training data), we can define a function that`
			`# performs the data transformation then use`
			# :class:`~sklearn.preprocessing.FunctionTransformer` to create a scikit-learn
			`# transformer.`


			`def subject_body_extractor(posts):`
			`# construct object dtype array with two columns`
			`# first column = 'subject' and second column = 'body'`
			`features = np.empty(shape=(len(posts), 2), dtype=object)`
			`for i, text in enumerate(posts):`
			# temporary variable `_` stores '\n\n'
			`headers, _, body = text.partition("\n\n")`
			`# store body text in second column`
			`features[i, 1] = body`

			`prefix = "Subject:"`
			`sub = ""`
			`# save text after 'Subject:' in first column`
			`for line in headers.split("\n"):`
			`if line.startswith(prefix):`
			`sub = line[len(prefix) :]`
			`break`
			`features[i, 0] = sub`

			`return features`


			`subject_body_transformer = FunctionTransformer(subject_body_extractor)`

			`##############################################################################`
			`# We will also create a transformer that extracts the`
			`# length of the text and the number of sentences.`


			`def text_stats(posts):`
			`return [{"length": len(text), "num_sentences": text.count(".")} for text in posts]`


			`text_stats_transformer = FunctionTransformer(text_stats)`

			`##############################################################################`
			`# Classification pipeline`
			`# -----------------------`
			`#`
			`# The pipeline below extracts the subject and body from each post using`
			# ``SubjectBodyExtractor``, producing a (n_samples, 2) array. This array is
			`# then used to compute standard bag-of-words features for the subject and body`
			`# as well as text length and number of sentences on the body, using`
			# ``ColumnTransformer``. We combine them, with weights, then train a
			`# classifier on the combined set of features.`

			`pipeline = Pipeline(`
			`[`
			`# Extract subject & body`
			`("subjectbody", subject_body_transformer),`
			`# Use ColumnTransformer to combine the subject and body features`
			`(`
			`"union",`
			`ColumnTransformer(`
			`[`
			`# bag-of-words for subject (col 0)`
			`("subject", TfidfVectorizer(min_df=50), 0),`
			`# bag-of-words with decomposition for body (col 1)`
			`(`
			`"body_bow",`
			`Pipeline(`
			`[`
			`("tfidf", TfidfVectorizer()),`
			`("best", PCA(n_components=50, svd_solver="arpack")),`
			`]`
			`),`
			`1,`
			`),`
			`# Pipeline for pulling text stats from post's body`
			`(`
			`"body_stats",`
			`Pipeline(`
			`[`
			`(`
			`"stats",`
			`text_stats_transformer,`
			`), # returns a list of dicts`
			`(`
			`"vect",`
			`DictVectorizer(),`
			`), # list of dicts -> feature matrix`
			`]`
			`),`
			`1,`
			`),`
			`],`
			`# weight above ColumnTransformer features`
			`transformer_weights={`
			`"subject": 0.8,`
			`"body_bow": 0.5,`
			`"body_stats": 1.0,`
			`},`
			`),`
			`),`
			`# Use a SVC classifier on the combined features`
			`("svc", LinearSVC(dual=False)),`
			`],`
			`verbose=True,`
			`)`

			`##############################################################################`
			`# Finally, we fit our pipeline on the training data and use it to predict`
			# topics for ``X_test``. Performance metrics of our pipeline are then printed.

			`pipeline.fit(X_train, y_train)`
			`y_pred = pipeline.predict(X_test)`
			`print("Classification report:\n\n{}".format(classification_report(y_test, y_pred)))`