dominguesm's picture
Reorder Gradio install
0d3100c
import json
import math
import os
# Pin Gradio to 3.26.0 at start-up: uninstall whatever version is present,
# then install the pinned release. Both commands run as import-time side
# effects and are placed ahead of `import gradio` on purpose, so that the
# import picks up the pinned version.
os.system("pip uninstall -y gradio")
os.system("pip install gradio==3.26.0")
import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
# Newsgroup names offered in the category dropdown; the selected ones are
# passed to `fetch_20newsgroups(categories=...)` when training.
CATEGORIES = [
    "alt.atheism",
    "comp.graphics",
    "comp.os.ms-windows.misc",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
    "comp.windows.x",
    "misc.forsale",
    "rec.autos",
    "rec.motorcycles",
    "rec.sport.baseball",
    "rec.sport.hockey",
    "sci.crypt",
    "sci.electronics",
    "sci.med",
    "sci.space",
    "soc.religion.christian",
    "talk.politics.guns",
    "talk.politics.mideast",
    "talk.politics.misc",
    "talk.religion.misc",
]
def shorten_param(param_name):
    """Return the part of ``param_name`` after its last ``__`` separator.

    Pipeline parameter names look like ``"vect__max_df"``; this strips the
    component prefix and yields ``"max_df"``. A name without any ``__`` is
    returned unchanged.
    """
    _prefix, separator, suffix = param_name.rpartition("__")
    # An empty separator means "__" was not found at all.
    return suffix if separator else param_name
def _parse_literals(raw):
    """Parse a comma-separated string of Python literals into a list.

    Blank entries (for example from a trailing comma) are skipped.

    Raises:
        ValueError, SyntaxError: if an entry is not a plain Python literal.
    """
    import ast  # local import so the block does not depend on file-level imports

    # SECURITY: the text comes straight from UI textboxes. It used to be passed
    # to eval(), which executes arbitrary code; literal_eval only accepts
    # literals (numbers, tuples, ...). Tokens are stripped first because older
    # Python versions reject leading whitespace in literal_eval.
    tokens = (token.strip() for token in raw.split(","))
    return [ast.literal_eval(token) for token in tokens if token]


def _parse_ngram_ranges(raw):
    """Parse text such as ``"(1, 1), (1, 2)"`` into a list of range tuples.

    A single range such as ``"(1, 2)"`` is returned as ``[(1, 2)]`` instead of
    being mistaken for the two candidates ``1`` and ``2``.

    Raises:
        ValueError, SyntaxError: if the text is not a tuple/list literal.
    """
    import ast  # local import so the block does not depend on file-level imports

    parsed = ast.literal_eval(raw.strip())
    if not isinstance(parsed, (tuple, list)):
        raise ValueError(
            f"ngram_range must be one or more (min_n, max_n) tuples, got {raw!r}"
        )
    if len(parsed) == 2 and all(isinstance(bound, int) for bound in parsed):
        # The whole input is one (min_n, max_n) pair.
        return [tuple(parsed)]
    return [tuple(bounds) for bounds in parsed]


def train_model(categories, vect__max_df, vect__min_df, vect__ngram_range, vect__norm):
    """Run a randomized search over a TF-IDF + ComplementNB text pipeline.

    Args:
        categories: 20 newsgroups category names to load for train and test.
        vect__max_df: comma-separated numeric literals, e.g. ``"0.2, 0.4"``.
        vect__min_df: comma-separated numeric literals, e.g. ``"1, 3, 5"``.
        vect__ngram_range: one or more ``(min_n, max_n)`` tuples, e.g.
            ``"(1, 1), (1, 2)"``.
        vect__norm: comma-separated norm names, e.g. ``"l1, l2"``.

    Returns:
        A 4-tuple ``(fig, fig2, best_parameters, test_accuracy)``:
        a scatter plot of CV score time against CV test score, a
        parallel-coordinates plot of the sampled parameters, the best
        estimator's parameters as a JSON string, and the value returned by
        the fitted search's ``score`` on the test subset.

    Raises:
        ValueError, SyntaxError: if a grid field cannot be parsed as literals.
    """
    parameters_grid = {
        "vect__max_df": _parse_literals(vect__max_df),
        "vect__min_df": _parse_literals(vect__min_df),
        "vect__ngram_range": _parse_ngram_ranges(vect__ngram_range),  # unigrams or bigrams
        "vect__norm": [value.strip() for value in vect__norm.split(",")],
        # alpha is fixed here; it is not taken from the UI.
        "clf__alpha": np.logspace(-6, 6, 13),
    }
    print(parameters_grid)

    # Both subsets are loaded with identical options; only `subset` differs.
    fetch_options = {
        "categories": categories,
        "shuffle": True,
        "random_state": 42,
        "remove": ("headers", "footers", "quotes"),
    }
    data_train = fetch_20newsgroups(subset="train", **fetch_options)
    data_test = fetch_20newsgroups(subset="test", **fetch_options)

    pipeline = Pipeline(
        [
            ("vect", TfidfVectorizer()),
            ("clf", ComplementNB()),
        ]
    )

    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=parameters_grid,
        n_iter=40,
        random_state=0,
        n_jobs=2,
        verbose=1,
    )
    random_search.fit(data_train.data, data_train.target)

    # default=str stringifies values json cannot encode (e.g. estimator objects).
    best_parameters = json.dumps(
        random_search.best_estimator_.get_params(),
        indent=4,
        sort_keys=True,
        default=str,
    )

    test_accuracy = random_search.score(data_test.data, data_test.target)

    # Strip the "vect__"/"clf__" prefixes so column names match `param_names`.
    cv_results = pd.DataFrame(random_search.cv_results_)
    cv_results = cv_results.rename(shorten_param, axis=1)

    param_names = [shorten_param(name) for name in parameters_grid]
    labels = {
        "mean_score_time": "CV Score time (s)",
        "mean_test_score": "CV score (accuracy)",
    }
    fig = px.scatter(
        cv_results,
        x="mean_score_time",
        y="mean_test_score",
        error_x="std_score_time",
        error_y="std_test_score",
        hover_data=param_names,
        labels=labels,
    )
    fig.update_layout(
        title={
            "text": "trade-off between scoring time and mean test score",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        }
    )

    column_results = param_names + ["mean_test_score", "mean_score_time"]

    # Identity by default; the non-numeric columns get a numeric mapping below
    # so they can be drawn as parallel-coordinate axes.
    transform_funcs = dict.fromkeys(column_results, lambda x: x)
    # Using a logarithmic scale for alpha
    transform_funcs["alpha"] = math.log10
    # L1 norms are mapped to index 1, and L2 norms to index 2
    transform_funcs["norm"] = lambda x: 2 if x == "l2" else 1
    # Unigrams are mapped to index 1 and bigrams to index 2
    transform_funcs["ngram_range"] = lambda x: x[1]

    fig2 = px.parallel_coordinates(
        cv_results[column_results].apply(transform_funcs),
        color="mean_test_score",
        color_continuous_scale=px.colors.sequential.Viridis_r,
        labels=labels,
    )
    fig2.update_layout(
        title={
            "text": "Parallel coordinates plot of text classifier pipeline",
            "y": 0.99,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        }
    )

    return fig, fig2, best_parameters, test_accuracy
def load_description(name):
    """Return the text of ``./descriptions/{name}.md``.

    Args:
        name: file stem, optionally with sub-directories
            (e.g. ``"parameter_grid/alpha"``), relative to ``./descriptions``.

    Returns:
        The full file contents as a string.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
    """
    # Decode explicitly as UTF-8 so the Markdown reads the same regardless of
    # the platform's default locale encoding.
    with open(f"./descriptions/{name}.md", "r", encoding="utf-8") as f:
        return f.read()
# Markdown credit line rendered in the page header via gr.Markdown(AUTHOR).
AUTHOR = """
Created by [@dominguesm](https://huggingface.co/dominguesm) based on [scikit-learn docs](https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_text_feature_extraction.html)
"""
# Gradio UI: header, category picker, parameter-grid inputs, a "Train" button
# and the result widgets, followed by the blocking launch call.
# NOTE(review): the reviewed text carried no indentation, so the nesting of
# the Row/Column/Tab containers below is a reconstruction — confirm it
# against the repository copy before relying on the exact layout.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    # Page header and long-form description loaded from ./descriptions.
    with gr.Row():
        with gr.Column():
            gr.Markdown("# Sample pipeline for text feature extraction and evaluation")
            gr.Markdown(load_description("description_part1"))
            gr.Markdown(load_description("description_part2"))
            gr.Markdown(AUTHOR)
    # Category selection; passed to train_model as its first argument.
    with gr.Row():
        with gr.Column():
            gr.Markdown("""## CATEGORY SELECTION""")
            gr.Markdown(load_description("description_category_selection"))
            drop_categories = gr.Dropdown(
                CATEGORIES,
                value=["alt.atheism", "talk.religion.misc"],
                multiselect=True,
                label="Categories",
                info="Please select up to two categories that you want to receive training on.",
                max_choices=2,
                interactive=True,
            )
    with gr.Row():
        with gr.Tab("PARAMETERS GRID"):
            gr.Markdown(load_description("description_parameter_grid"))
            with gr.Row():
                with gr.Column():
                    # Display-only (interactive=False): this textbox is not
                    # among the click handler's inputs below, so its value is
                    # never passed to train_model.
                    clf__alpha = gr.Textbox(
                        label="Classifier Alpha (clf__alpha)",
                        value="1.e-06, 1.e-05, 1.e-04",
                        info="Due to practical considerations, this parameter was kept constant.",
                        interactive=False,
                    )
                    vect__max_df = gr.Textbox(
                        label="Vectorizer max_df (vect__max_df)",
                        value="0.2, 0.4, 0.6, 0.8, 1.0",
                        info="Values ranging from 0 to 1.0, separated by a comma.",
                        interactive=True,
                    )
                    vect__min_df = gr.Textbox(
                        label="Vectorizer min_df (vect__min_df)",
                        value="1, 3, 5, 10",
                        info="Values ranging from 0 to 1.0, separated by a comma, or integers separated by a comma. If float, the parameter represents a proportion of documents, integer absolute counts.",
                        interactive=True,
                    )
                with gr.Column():
                    vect__ngram_range = gr.Textbox(
                        label="Vectorizer ngram_range (vect__ngram_range)",
                        value="(1, 1), (1, 2)",
                        info="""Tuples of integer values separated by a comma. For example an `ngram_range` of `(1, 1)` means only unigrams, `(1, 2)` means unigrams and bigrams, and `(2, 2)` means only bigrams.""",
                        interactive=True,
                    )
                    vect__norm = gr.Textbox(
                        label="Vectorizer norm (vect__norm)",
                        value="l1, l2",
                        info="'l1' or 'l2', separated by a comma",
                        interactive=True,
                    )
        # Static help text for each parameter, one Markdown file per parameter.
        with gr.Tab("DESCRIPTION OF PARAMETERS"):
            gr.Markdown("""### Classifier Alpha""")
            gr.Markdown(load_description("parameter_grid/alpha"))
            gr.Markdown("""### Vectorizer max_df""")
            gr.Markdown(load_description("parameter_grid/max_df"))
            gr.Markdown("""### Vectorizer min_df""")
            gr.Markdown(load_description("parameter_grid/min_df"))
            gr.Markdown("""### Vectorizer ngram_range""")
            gr.Markdown(load_description("parameter_grid/ngram_range"))
            gr.Markdown("""### Vectorizer norm""")
            gr.Markdown(load_description("parameter_grid/norm"))
    # Shows the pipeline definition as a Markdown code sample (text only).
    with gr.Row():
        gr.Markdown(
            """
## MODEL PIPELINE
```python
pipeline = Pipeline(
[
("vect", TfidfVectorizer()),
("clf", ComplementNB()),
]
)
```
"""
        )
    with gr.Row():
        with gr.Column():
            gr.Markdown("""## TRAINING""")
            with gr.Row():
                brn_train = gr.Button("Train").style(container=False)
    gr.Markdown("## RESULTS")
    with gr.Row():
        best_parameters = gr.Textbox(label="Best parameters")
        test_accuracy = gr.Textbox(label="Test accuracy")
    plot_trade = gr.Plot(label="")
    plot_coordinates = gr.Plot(label="")
    # Clicking "Train" calls train_model with the category selection and the
    # four editable grid fields; its four return values fill the two plots and
    # the two result textboxes, in the order listed in `outputs`.
    brn_train.click(
        train_model,
        inputs=[
            drop_categories,
            vect__max_df,
            vect__min_df,
            vect__ngram_range,
            vect__norm,
        ],
        outputs=[plot_trade, plot_coordinates, best_parameters, test_accuracy],
    )
# Start the Gradio server for this app.
app.launch()