# Hugging Face Space: sklearn-docs/text-feature-extraction-evaluation
# Space status: Sleeping
# File size: 9,530 Bytes

import ast
import json
import math

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline

# Newsgroup names offered as choices in the "Categories" dropdown of the UI.
# The user's selection is forwarded unchanged to ``fetch_20newsgroups`` via
# the ``categories`` argument of ``train_model``.
CATEGORIES = [
    "alt.atheism",
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
    "comp.windows.x",
    "misc.forsale",
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "sci.crypt",
    "sci.electronics",
    "sci.med",
    "sci.space",
    "soc.religion.christian",
    "talk.politics.guns",
    "talk.politics.mideast",
    "talk.politics.misc",
    "talk.religion.misc",
]


def shorten_param(param_name):
    """Return the part of ``param_name`` after its last ``"__"`` separator.

    Pipeline parameters are addressed as ``"<step>__<param>"``; this keeps
    only ``<param>``. Names that contain no ``"__"`` are returned unchanged.
    """
    _, separator, suffix = param_name.rpartition("__")
    return suffix if separator else param_name


def _parse_literal_values(text):
    """Parse a comma-separated string of numbers into a list of int/float.

    The text comes straight from a user-editable web form, so it is parsed
    with ``ast.literal_eval`` (literals only) rather than ``eval``, which
    would execute arbitrary expressions typed into the textbox.

    Raises:
        ValueError: if a token is not a numeric literal or no value is given.
    """
    values = []
    for token in text.split(","):
        token = token.strip()
        if not token:
            # Tolerate stray or trailing commas such as "1, 3,".
            continue
        try:
            value = ast.literal_eval(token)
        except (ValueError, TypeError, SyntaxError) as err:
            raise ValueError(f"{token!r} is not a valid number") from err
        # bool is a subclass of int; reject it explicitly.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            raise ValueError(f"{token!r} is not a valid number")
        values.append(value)
    if not values:
        raise ValueError("at least one numeric value is required")
    return values


def _parse_ngram_ranges(text):
    """Parse ``"(1, 1), (1, 2)"``-style text into a list of ``(min_n, max_n)``.

    A single range such as ``"(1, 2)"`` evaluates to a flat tuple of ints; it
    is wrapped so that it is searched as ONE candidate instead of being
    mistaken for the two candidates ``1`` and ``2``.

    Raises:
        ValueError: if the text is not a sequence of integer pairs.
    """

    def is_int_pair(item):
        return (
            isinstance(item, (tuple, list))
            and len(item) == 2
            and all(
                isinstance(n, int) and not isinstance(n, bool) for n in item
            )
        )

    try:
        # literal_eval instead of eval: the text is untrusted form input.
        parsed = ast.literal_eval(text.strip())
    except (ValueError, TypeError, SyntaxError) as err:
        raise ValueError(f"{text!r} is not a list of n-gram ranges") from err

    if is_int_pair(parsed):
        parsed = (parsed,)
    if not isinstance(parsed, (tuple, list)) or not parsed:
        raise ValueError(f"{text!r} is not a list of n-gram ranges")

    ranges = []
    for item in parsed:
        if not is_int_pair(item):
            raise ValueError(f"{item!r} is not a (min_n, max_n) integer pair")
        ranges.append(tuple(item))
    return ranges


def train_model(categories, vect__max_df, vect__min_df, vect__ngram_range, vect__norm):
    """Run a randomized hyper-parameter search and build the result plots.

    A ``TfidfVectorizer`` + ``ComplementNB`` pipeline is tuned with
    ``RandomizedSearchCV`` on the training subset of the 20 newsgroups data
    restricted to ``categories``, then scored on the test subset.

    Args:
        categories: newsgroup names passed to ``fetch_20newsgroups``.
        vect__max_df: comma-separated numeric candidates for ``max_df``.
        vect__min_df: comma-separated numeric candidates for ``min_df``.
        vect__ngram_range: comma-separated ``(min_n, max_n)`` tuples.
        vect__norm: comma-separated candidates for ``norm``.

    Returns:
        A 4-tuple ``(fig, fig2, best_parameters, test_accuracy)``: the
        score-time/accuracy scatter plot, the parallel-coordinates plot, the
        best estimator's parameters as a JSON string, and the score of the
        refitted search on the test subset.

    Raises:
        gr.Error: if one of the textual parameter lists cannot be parsed.
    """
    try:
        parameters_grid = {
            "vect__max_df": _parse_literal_values(vect__max_df),
            "vect__min_df": _parse_literal_values(vect__min_df),
            "vect__ngram_range": _parse_ngram_ranges(vect__ngram_range),  # unigrams or bigrams
            "vect__norm": [value.strip() for value in vect__norm.split(",")],
            "clf__alpha": np.logspace(-6, 6, 13),
        }
    except ValueError as err:
        # Surface the parsing problem in the UI instead of a bare traceback.
        raise gr.Error(f"Invalid parameter grid: {err}") from err

    print(parameters_grid)

    # Same loading options for both subsets so train and test are comparable.
    dataset_options = {
        "categories": categories,
        "shuffle": True,
        "random_state": 42,
        "remove": ("headers", "footers", "quotes"),
    }
    data_train = fetch_20newsgroups(subset="train", **dataset_options)
    data_test = fetch_20newsgroups(subset="test", **dataset_options)

    pipeline = Pipeline(
        [
            ("vect", TfidfVectorizer()),
            ("clf", ComplementNB()),
        ]
    )

    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=parameters_grid,
        n_iter=40,
        random_state=0,
        n_jobs=2,
        verbose=1,
    )

    random_search.fit(data_train.data, data_train.target)
    # default=str stringifies the estimator objects contained in get_params().
    best_parameters = json.dumps(
        random_search.best_estimator_.get_params(),
        indent=4,
        sort_keys=True,
        default=str,
    )

    test_accuracy = random_search.score(data_test.data, data_test.target)

    cv_results = pd.DataFrame(random_search.cv_results_)
    # Strip the "param_<step>__" prefixes so columns read "max_df", "alpha", ...
    cv_results = cv_results.rename(shorten_param, axis=1)

    param_names = [shorten_param(name) for name in parameters_grid]
    labels = {
        "mean_score_time": "CV Score time (s)",
        "mean_test_score": "CV score (accuracy)",
    }
    fig = px.scatter(
        cv_results,
        x="mean_score_time",
        y="mean_test_score",
        error_x="std_score_time",
        error_y="std_test_score",
        hover_data=param_names,
        labels=labels,
    )
    fig.update_layout(
        title={
            "text": "trade-off between scoring time and mean test score",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        }
    )

    column_results = param_names + ["mean_test_score", "mean_score_time"]

    # Identity by default; non-numeric columns get a numeric encoding below
    # so that every axis of the parallel-coordinates plot is a number.
    transform_funcs = dict.fromkeys(column_results, lambda x: x)
    # Using a logarithmic scale for alpha
    transform_funcs["alpha"] = math.log10
    # L1 norms are mapped to index 1, and L2 norms to index 2
    transform_funcs["norm"] = lambda x: 2 if x == "l2" else 1
    # Unigrams are mapped to index 1 and bigrams to index 2
    transform_funcs["ngram_range"] = lambda x: x[1]

    fig2 = px.parallel_coordinates(
        cv_results[column_results].apply(transform_funcs),
        color="mean_test_score",
        color_continuous_scale=px.colors.sequential.Viridis_r,
        labels=labels,
    )
    fig2.update_layout(
        title={
            "text": "Parallel coordinates plot of text classifier pipeline",
            "y": 0.99,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        }
    )

    return fig, fig2, best_parameters, test_accuracy


def load_description(name):
    """Return the text of the Markdown file ``./descriptions/<name>.md``.

    The file is read as UTF-8 explicitly so the result does not depend on the
    platform's default text encoding.

    Args:
        name: file name without the ``.md`` suffix; may contain a
            sub-directory, e.g. ``"parameter_grid/alpha"``.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
    """
    with open(f"./descriptions/{name}.md", "r", encoding="utf-8") as f:
        return f.read()


# Markdown attribution line rendered at the end of the app's intro column.
AUTHOR = """
Created by [@dominguesm](https://huggingface.co/dominguesm) based on [scikit-learn docs](https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_text_feature_extraction.html)
"""


# Gradio UI definition. Components are laid out in the order they are created,
# nested according to the enclosing ``gr.Row`` / ``gr.Column`` contexts.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    # --- Introduction: title, descriptions loaded from ./descriptions, author.
    with gr.Row():
        with gr.Column():
            gr.Markdown("# Sample pipeline for text feature extraction and evaluation")
            gr.Markdown(load_description("description_part1"))
            gr.Markdown(load_description("description_part2"))
            gr.Markdown(AUTHOR)

    # --- Category selection: at most two newsgroups, forwarded to train_model.
    with gr.Row():
        with gr.Column():
            gr.Markdown("""## CATEGORY SELECTION""")
            gr.Markdown(load_description("description_category_selection"))
            drop_categories = gr.Dropdown(
                CATEGORIES,
                value=["alt.atheism", "talk.religion.misc"],
                multiselect=True,
                label="Categories",
                info="Please select up to two categories that you want to receive training on.",
                max_choices=2,
                interactive=True,
            )
    # --- Parameter grid: one textbox per searched hyper-parameter.
    with gr.Row():
        with gr.Column():
            gr.Markdown("""## PARAMETERS GRID""")
            gr.Markdown(load_description("description_parameter_grid"))
            with gr.Column():
                gr.Markdown("""### Classifier Alpha""")
                gr.Markdown(load_description("parameter_grid/alpha"))

                # Display only: this textbox is not interactive and is not
                # among the inputs wired to the "Train" button below.
                # NOTE(review): it shows three alpha values, whereas
                # train_model searches np.logspace(-6, 6, 13) — confirm which
                # one is intended.
                clf__alpha = gr.Textbox(
                    label="clf__alpha",
                    value="1.e-06, 1.e-05, 1.e-04",
                    info="Due to practical considerations, this parameter was kept constant.",
                    interactive=False,
                )

            with gr.Column():
                gr.Markdown("""### Vectorizer max_df""")
                gr.Markdown(load_description("parameter_grid/max_df"))

                vect__max_df = gr.Textbox(
                    label="vect__max_df",
                    value="0.2, 0.4, 0.6, 0.8, 1.0",
                    info="Values ranging from 0 to 1.0, separated by a comma.",
                    interactive=True,
                )

            with gr.Column():
                gr.Markdown("""### Vectorizer min_df""")
                gr.Markdown(load_description("parameter_grid/min_df"))

                vect__min_df = gr.Textbox(
                    label="vect__min_df",
                    value="1, 3, 5, 10",
                    info="Values ranging from 0 to 1.0, separated by a comma, or integers separated by a comma. If float, the parameter represents a proportion of documents, integer absolute counts.",
                    interactive=True,
                )
            with gr.Column():
                gr.Markdown("""### Vectorizer ngram_range""")
                gr.Markdown(load_description("parameter_grid/ngram_range"))

                vect__ngram_range = gr.Textbox(
                    label="vect__ngram_range",
                    value="(1, 1), (1, 2)",
                    info="""Tuples of integer values separated by a comma. For example an ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means only bigrams.""",
                    interactive=True,
                )
            with gr.Column():
                gr.Markdown("""### Vectorizer norm""")
                gr.Markdown(load_description("parameter_grid/norm"))
                gr.Markdown(
                    """- 'l2': Sum of squares of vector elements is 1. The cosine
                            similarity between two vectors is their dot product when l2 norm has
                            been applied.
                            - 'l1': Sum of absolute values of vector elements is 1."""
                )

                vect__norm = gr.Textbox(
                    label="vect__norm",
                    value="l1, l2",
                    info="'l1' or 'l2', separated by a comma",
                    interactive=True,
                )

    # --- Static snippet showing the pipeline that train_model tunes.
    with gr.Row():
        gr.Markdown(
            """
            ## MODEL PIPELINE
            ```python
            pipeline = Pipeline(
                [
                    ("vect", TfidfVectorizer()),
                    ("clf", ComplementNB()),
                ]
            )
            ```
            """
        )
    # --- Training trigger.
    with gr.Row():
        with gr.Column():
            gr.Markdown("""## TRAINING""")
            with gr.Row():
                # NOTE(review): `.style()` is believed to be deprecated in
                # newer Gradio releases — confirm against the pinned version.
                brn_train = gr.Button("Train").style(container=False)

    # --- Result widgets filled by train_model.
    gr.Markdown("## RESULTS")
    with gr.Row():
        best_parameters = gr.Textbox(label="Best parameters")
        test_accuracy = gr.Textbox(label="Test accuracy")

    plot_trade = gr.Plot(label="")
    plot_coordinates = gr.Plot(label="")

    # Input order matches train_model's positional parameters; output order
    # matches its return tuple (fig, fig2, best_parameters, test_accuracy).
    brn_train.click(
        train_model,
        [drop_categories, vect__max_df, vect__min_df, vect__ngram_range, vect__norm],
        [plot_trade, plot_coordinates, best_parameters, test_accuracy],
    )

# Start the web server as soon as the module is executed.
app.launch()