Spaces:

sklearn-docs
/

text-feature-extraction-evaluation

Sleeping

App Files Files Community

dominguesm committed on May 16, 2023

Commit

3a4d722

1 Parent(s): 30ae784

Add parameter grid config

Browse files

Files changed (1) hide show

app.py +85 -52

app.py CHANGED Viewed

@@ -34,15 +34,6 @@ CATEGORIES = [
 ]
-PARAMETER_GRID = {
-    "vect__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
-    "vect__min_df": (1, 3, 5, 10),
-    "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
-    "vect__norm": ("l1", "l2"),
-    "clf__alpha": np.logspace(-6, 6, 13),
-}
 def shorten_param(param_name):
     """Remove components' prefixes in param_name."""
     if "__" in param_name:
@@ -50,7 +41,7 @@ def shorten_param(param_name):
     return param_name
-def train_model(categories):
     pipeline = Pipeline(
         [
             ("vect", TfidfVectorizer()),
@@ -58,6 +49,16 @@ def train_model(categories):
         ]
     )
     data_train = fetch_20newsgroups(
         subset="train",
         categories=categories,
@@ -83,7 +84,7 @@ def train_model(categories):
     random_search = RandomizedSearchCV(
         estimator=pipeline,
-        param_distributions=PARAMETER_GRID,
         n_iter=40,
         random_state=0,
         n_jobs=2,
@@ -103,7 +104,7 @@ def train_model(categories):
     cv_results = pd.DataFrame(random_search.cv_results_)
     cv_results = cv_results.rename(shorten_param, axis=1)
-    param_names = [shorten_param(name) for name in PARAMETER_GRID.keys()]
     labels = {
         "mean_score_time": "CV Score time (s)",
         "mean_test_score": "CV score (accuracy)",
@@ -156,28 +157,10 @@ def train_model(categories):
     return fig, fig2, best_parameters, test_accuracy
-DESCRIPTION_PART1 = [
-    "The dataset used in this example is",
-    "[The 20 newsgroups text dataset](https://scikit-learn.org/stable/datasets/real_world.html#newsgroups-dataset)",
-    "which will be automatically downloaded, cached and reused for the document classification example.",
-]
-DESCRIPTION_PART2 = [
-    "In this example, we tune the hyperparameters of",
-    "a particular classifier using a",
-    "[RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV).",
-    "For a demo on the performance of some other classifiers, see the",
-    "[Classification of text documents using sparse features](https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py) notebook.",
-]
-CATEGORY_SELECTION_DESCRIPTION = [
-    "The task of text classification is easier when there is little overlap between the characteristic terms ",
-    "of different topics. This is because the presence of common terms can make it difficult to distinguish between ",
-    "different topics. On the other hand, when there is little overlap between the characteristic terms of different ",
-    "topics, the task of text classification becomes easier, as the unique terms of each topic provide a solid basis ",
-    "for accurately classifying the document into its respective category. Therefore, careful selection of characteristic",
-    " terms for each topic is crucial to ensure accuracy in text classification."
-]
 AUTHOR = """
 Created by [@dominguesm](https://huggingface.co/dominguesm) based on [scikit-learn docs](https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_text_feature_extraction.html)
@@ -188,14 +171,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     with gr.Row():
         with gr.Column():
             gr.Markdown("# Sample pipeline for text feature extraction and evaluation")
-            gr.Markdown(" ".join(DESCRIPTION_PART1))
-            gr.Markdown(" ".join(DESCRIPTION_PART2))
             gr.Markdown(AUTHOR)
     with gr.Row():
         with gr.Column():
             gr.Markdown("""## CATEGORY SELECTION""")
-            gr.Markdown("".join(CATEGORY_SELECTION_DESCRIPTION))
             drop_categories = gr.Dropdown(
                 CATEGORIES,
                 value=["alt.atheism", "talk.religion.misc"],
@@ -207,20 +190,70 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             )
     with gr.Row():
         with gr.Column():
-            gr.Markdown(
-                """
-            ## PARAMETERS GRID
-            ```python
-            {
-                'clf__alpha': array(
-                    [1.e-06, 1.e-05, 1.e-04,...]
-                ),
-                'vect__max_df': (0.2, 0.4, 0.6, 0.8, 1.0),
-                'vect__min_df': (1, 3, 5, 10),
-                'vect__ngram_range': ((1, 1), (1, 2)),
-                'vect__norm': ('l1', 'l2')
-            }
-            ```
             ## MODEL PIPELINE
             ```python
             pipeline = Pipeline(
@@ -231,7 +264,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             )
             ```
             """
-            )
     with gr.Row():
         with gr.Column():
             gr.Markdown("""## TRAINING""")
@@ -248,7 +281,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     brn_train.click(
         train_model,
-        [drop_categories],
         [plot_trade, plot_coordinates, best_parameters, test_accuracy],
     )

 ]
 def shorten_param(param_name):
     """Remove components' prefixes in param_name."""
     if "__" in param_name:
     return param_name
+def train_model(categories, vect__max_df, vect__min_df, vect__ngram_range, vect__norm):
     pipeline = Pipeline(
         [
             ("vect", TfidfVectorizer()),
         ]
     )
+    parameters_grid = {
+        "vect__max_df": [eval(value) for value in vect__max_df.split(",")],
+        "vect__min_df": [eval(value) for value in vect__min_df.split(",")],
+        "vect__ngram_range": eval(vect__ngram_range),  # unigrams or bigrams
+        "vect__norm": [value.strip() for value in vect__norm.split(",")],
+        "clf__alpha": np.logspace(-6, 6, 13),
+    }
+    print(parameters_grid)
     data_train = fetch_20newsgroups(
         subset="train",
         categories=categories,
     random_search = RandomizedSearchCV(
         estimator=pipeline,
+        param_distributions=parameters_grid,
         n_iter=40,
         random_state=0,
         n_jobs=2,
     cv_results = pd.DataFrame(random_search.cv_results_)
     cv_results = cv_results.rename(shorten_param, axis=1)
+    param_names = [shorten_param(name) for name in parameters_grid.keys()]
     labels = {
         "mean_score_time": "CV Score time (s)",
         "mean_test_score": "CV score (accuracy)",
     return fig, fig2, best_parameters, test_accuracy
+def load_description(name):
+    with open(f"./descriptions/{name}.md", "r") as f:
+        return f.read()
 AUTHOR = """
 Created by [@dominguesm](https://huggingface.co/dominguesm) based on [scikit-learn docs](https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_text_feature_extraction.html)
     with gr.Row():
         with gr.Column():
             gr.Markdown("# Sample pipeline for text feature extraction and evaluation")
+            gr.Markdown(load_description("description_part1"))
+            gr.Markdown(load_description("description_part2"))
             gr.Markdown(AUTHOR)
     with gr.Row():
         with gr.Column():
             gr.Markdown("""## CATEGORY SELECTION""")
+            gr.Markdown(load_description("description_category_selection"))
             drop_categories = gr.Dropdown(
                 CATEGORIES,
                 value=["alt.atheism", "talk.religion.misc"],
             )
     with gr.Row():
         with gr.Column():
+            gr.Markdown("""## PARAMETERS GRID""")
+            gr.Markdown(load_description("description_parameter_grid"))
+            with gr.Column():
+                gr.Markdown("""### Classifier Alpha""")
+                gr.Markdown(load_description("parameter_grid/alpha"))
+                clf__alpha = gr.Textbox(
+                    label="clf__alpha",
+                    value="1.e-06, 1.e-05, 1.e-04",
+                    info="Due to practical considerations, this parameter was kept constant.",
+                    interactive=False,
+                )
+            with gr.Column():
+                gr.Markdown("""### Vectorizer max_df""")
+                gr.Markdown(load_description("parameter_grid/max_df"))
+                vect__max_df = gr.Textbox(
+                    label="vect__max_df",
+                    value="0.2, 0.4, 0.6, 0.8, 1.0",
+                    info="Values ranging from 0 to 1.0, separated by a comma.",
+                    interactive=True,
+                )
+            with gr.Column():
+                gr.Markdown("""### Vectorizer min_df""")
+                gr.Markdown(load_description("parameter_grid/min_df"))
+                vect__min_df = gr.Textbox(
+                    label="vect__min_df",
+                    value="1, 3, 5, 10",
+                    info="Values ranging from 0 to 1.0, separated by a comma, or integers separated by a comma. If float, the parameter represents a proportion of documents, integer absolute counts.",
+                    interactive=True,
+                )
+            with gr.Column():
+                gr.Markdown("""### Vectorizer ngram_range""")
+                gr.Markdown(load_description("parameter_grid/ngram_range"))
+                vect__ngram_range = gr.Textbox(
+                    label="vect__ngram_range",
+                    value="(1, 1), (1, 2)",
+                    info="""Tuples of integer values separated by a comma. For example an ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means only bigrams.""",
+                    interactive=True,
+                )
+            with gr.Column():
+                gr.Markdown("""### Vectorizer norm""")
+                gr.Markdown(load_description("parameter_grid/norm"))
+                gr.Markdown(
+                    """- 'l2': Sum of squares of vector elements is 1. The cosine
+                            similarity between two vectors is their dot product when l2 norm has
+                            been applied.
+                            - 'l1': Sum of absolute values of vector elements is 1."""
+                )
+                vect__norm = gr.Textbox(
+                    label="vect__norm",
+                    value="l1, l2",
+                    info="'l1' or 'l2', separated by a comma",
+                    interactive=True,
+                )
+    with gr.Row():
+        gr.Markdown(
+            """
             ## MODEL PIPELINE
             ```python
             pipeline = Pipeline(
             )
             ```
             """
+        )
     with gr.Row():
         with gr.Column():
             gr.Markdown("""## TRAINING""")
     brn_train.click(
         train_model,
+        [drop_categories, vect__max_df, vect__min_df, vect__ngram_range, vect__norm],
         [plot_trade, plot_coordinates, best_parameters, test_accuracy],
     )