giskard-evaluator

Running

App Files Files Community

200

ZeroCommand committed on Feb 8, 2024

Commit

0607989

verified ·

1 Parent(s): 5f9a95f

GSK-2774-GSK-2771-GSK-2772 (#101)

Browse files

- remove overused warnings & fix wording & prevent un-matchable models and datasets submissions (f9983aba4d3aaf2e17a17669a4086819e65c09ae)
- add job id and rephrase (ed207aeeb43f12280829553f761cc837273da1ac)
- fix bypassing validation possibility (1ead652bc86135baf9fa7b42b391b649a951960a)
- add trust remote code param for dataset with scripts (52ba35194688f68a903fc477676209e2d3aa2708)
- add trust remote code to get dataset config names (4b5940140b89343e08e95bbe6ce2bb7f0b4c753b)
- add persistent error code when number of labels not matching (c680d9a2f682bf992e6753ec3aebb2ffe9938de3)
- add wording for guiding user to find the report (8a71b006571950be12c658575f5633127dc6fd9d)
- add hf token validation (346fe42776f7ce2da20956a78170c6d81f1820fd)
- add error msg for token invalid (20294008e5d228049c136e3fe9013feb27694bec)
- change hf token valid wording style (0c7a6488ba03aa0b9768118cc9c1e71865448e8c)
- wrap hf dataset error (55c122a303573ff10da3b52213a1158c7e7fc66e)

Files changed (6) hide show

app_leaderboard.py +1 -1
app_text_classification.py +33 -6
fetch_utils.py +2 -2
text_classification.py +17 -6
text_classification_ui_helpers.py +31 -16
wordings.py +23 -4

app_leaderboard.py CHANGED Viewed

@@ -21,7 +21,7 @@ def get_records_from_dataset_repo(dataset_id):
     logger.info(f"Dataset {dataset_id} has splits {dataset_split}")
     try:
-        ds = datasets.load_dataset(dataset_id, dataset_config[0])[dataset_split[0]]
         df = ds.to_pandas()
         return df
     except Exception as e:

     logger.info(f"Dataset {dataset_id} has splits {dataset_split}")
     try:
+        ds = datasets.load_dataset(dataset_id, dataset_config[0], split=dataset_split[0])
         df = ds.to_pandas()
         return df
     except Exception as e:

app_text_classification.py CHANGED Viewed

@@ -2,7 +2,7 @@ import uuid
 import gradio as gr
-from io_utils import get_logs_file, read_scanners, write_scanners
 from text_classification_ui_helpers import (
     get_related_datasets_from_leaderboard,
     align_columns_and_show_prediction,
@@ -11,7 +11,19 @@ from text_classification_ui_helpers import (
     try_submit,
     write_column_mapping_to_config,
 )
-from wordings import CONFIRM_MAPPING_DETAILS_MD, INTRODUCTION_MD, USE_INFERENCE_API_TIP
 MAX_LABELS = 40
 MAX_FEATURES = 20
@@ -89,6 +101,13 @@ def get_demo():
             visible=True,
             interactive=True,
         )
     with gr.Accordion(label="Scanner Advance Config (optional)", open=False):
         scanners = gr.CheckboxGroup(label="Scan Settings", visible=True)
@@ -96,7 +115,7 @@ def get_demo():
         @gr.on(triggers=[uid_label.change], inputs=[uid_label], outputs=[scanners])
         def get_scanners(uid):
             selected = read_scanners(uid)
-            # currently we remove data_leakage from the default scanners
             # Reason: data_leakage barely raises any issues and takes too many requests
             # when using inference API, causing rate limit error
             scan_config = selected + ["data_leakage"]
@@ -114,8 +133,8 @@ def get_demo():
     with gr.Row():
         logs = gr.Textbox(
-            value=get_logs_file,
-            label="Giskard Bot Evaluation Log:",
             visible=False,
             every=0.5,
         )
@@ -135,7 +154,7 @@ def get_demo():
     )
     gr.on(
-        triggers=[dataset_id_input.input],
         fn=check_dataset,
         inputs=[dataset_id_input],
         outputs=[dataset_config_input, dataset_split_input, loading_status]
@@ -223,6 +242,14 @@ def get_demo():
             return gr.update(interactive=False)
         if model_id == "" or dataset_id == "" or dataset_config == "" or dataset_split == "":
             return gr.update(interactive=False)
         return gr.update(interactive=True)
     gr.on(

 import gradio as gr
+from io_utils import read_scanners, write_scanners
 from text_classification_ui_helpers import (
     get_related_datasets_from_leaderboard,
     align_columns_and_show_prediction,
     try_submit,
     write_column_mapping_to_config,
 )
+from text_classification import (
+  get_example_prediction,
+  check_hf_token_validity,
+  HuggingFaceInferenceAPIResponse
+)
+from wordings import (
+  CONFIRM_MAPPING_DETAILS_MD,
+  INTRODUCTION_MD,
+  USE_INFERENCE_API_TIP,
+  CHECK_LOG_SECTION_RAW,
+  HF_TOKEN_INVALID_STYLED
+)
 MAX_LABELS = 40
 MAX_FEATURES = 20
             visible=True,
             interactive=True,
         )
+        inference_token_info = gr.HTML(value=HF_TOKEN_INVALID_STYLED, visible=False)
+        inference_token.change(
+            lambda token: gr.update(visible=lambda: check_hf_token_validity(token)),
+            inputs=[inference_token],
+            outputs=[inference_token_info],
+        )
     with gr.Accordion(label="Scanner Advance Config (optional)", open=False):
         scanners = gr.CheckboxGroup(label="Scan Settings", visible=True)
         @gr.on(triggers=[uid_label.change], inputs=[uid_label], outputs=[scanners])
         def get_scanners(uid):
             selected = read_scanners(uid)
+            # we remove data_leakage from the default scanners
             # Reason: data_leakage barely raises any issues and takes too many requests
             # when using inference API, causing rate limit error
             scan_config = selected + ["data_leakage"]
     with gr.Row():
         logs = gr.Textbox(
+            value=CHECK_LOG_SECTION_RAW,
+            label="Giskard Bot Evaluation Guide:",
             visible=False,
             every=0.5,
         )
     )
     gr.on(
+        triggers=[dataset_id_input.change],
         fn=check_dataset,
         inputs=[dataset_id_input],
         outputs=[dataset_config_input, dataset_split_input, loading_status]
             return gr.update(interactive=False)
         if model_id == "" or dataset_id == "" or dataset_config == "" or dataset_split == "":
             return gr.update(interactive=False)
+        if not column_mapping_accordion.visible:
+            return gr.update(interactive=False)
+        _, prediction_response = get_example_prediction(
+            model_id, dataset_id, dataset_config, dataset_split, inference_token
+        )
+        if not isinstance(prediction_response, HuggingFaceInferenceAPIResponse):
+            gr.warning("Your HF token is invalid. Please check your token.")
+            return gr.update(interactive=False)
         return gr.update(interactive=True)
     gr.on(

fetch_utils.py CHANGED Viewed

@@ -5,7 +5,7 @@ import datasets
 def check_dataset_and_get_config(dataset_id):
     try:
-        configs = datasets.get_dataset_config_names(dataset_id)
         return configs
     except Exception:
         # Dataset may not exist
@@ -14,7 +14,7 @@ def check_dataset_and_get_config(dataset_id):
 def check_dataset_and_get_split(dataset_id, dataset_config):
     try:
-        ds = datasets.load_dataset(dataset_id, dataset_config)
     except Exception as e:
         # Dataset may not exist
         logging.warning(

 def check_dataset_and_get_config(dataset_id):
     try:
+        configs = datasets.get_dataset_config_names(dataset_id, trust_remote_code=True)
         return configs
     except Exception:
         # Dataset may not exist
 def check_dataset_and_get_split(dataset_id, dataset_config):
     try:
+        ds = datasets.load_dataset(dataset_id, dataset_config, trust_remote_code=True)
     except Exception as e:
         # Dataset may not exist
         logging.warning(

text_classification.py CHANGED Viewed

@@ -254,7 +254,7 @@ def infer_output_label_column(
 def check_dataset_features_validity(d_id, config, split):
     # We assume dataset is ok here
-    ds = datasets.load_dataset(d_id, config)[split]
     try:
         dataset_features = ds.features
     except AttributeError:
@@ -272,20 +272,19 @@ def select_the_first_string_column(ds):
     return None
-def get_example_prediction(model_id, dataset_id, dataset_config, dataset_split):
     # get a sample prediction from the model on the dataset
     prediction_input = None
     prediction_result = None
     try:
         # Use the first item to test prediction
-        ds = datasets.load_dataset(dataset_id, dataset_config)[dataset_split]
         if "text" not in ds.features.keys():
             # Dataset does not have text column
             prediction_input = ds[0][select_the_first_string_column(ds)]
         else:
             prediction_input = ds[0]["text"]
-        hf_token = os.environ.get(HF_WRITE_TOKEN, default="")
         payload = {"inputs": prediction_input, "options": {"use_cache": True}}
         results = hf_inference_api(model_id, hf_token, payload)
@@ -381,4 +380,16 @@ def text_classification_fix_column_mapping(column_mapping, ppl, d_id, config, sp
 def strip_model_id_from_url(model_id):
     if model_id.startswith("https://huggingface.co/"):
         return "/".join(model_id.split("/")[-2])
-    return model_id

 def check_dataset_features_validity(d_id, config, split):
     # We assume dataset is ok here
+    ds = datasets.load_dataset(d_id, config, split=split, trust_remote_code=True)
     try:
         dataset_features = ds.features
     except AttributeError:
     return None
+def get_example_prediction(model_id, dataset_id, dataset_config, dataset_split, hf_token):
     # get a sample prediction from the model on the dataset
     prediction_input = None
     prediction_result = None
     try:
         # Use the first item to test prediction
+        ds = datasets.load_dataset(dataset_id, dataset_config, split=dataset_split, trust_remote_code=True)
         if "text" not in ds.features.keys():
             # Dataset does not have text column
             prediction_input = ds[0][select_the_first_string_column(ds)]
         else:
             prediction_input = ds[0]["text"]
         payload = {"inputs": prediction_input, "options": {"use_cache": True}}
         results = hf_inference_api(model_id, hf_token, payload)
 def strip_model_id_from_url(model_id):
     """Return the bare Hub repo id for a model id or a huggingface.co URL.

     Ids that do not start with ``https://huggingface.co/`` are returned
     unchanged. For URLs the host prefix is dropped and at most the first two
     non-empty path segments (``owner/name``) are kept, so a trailing slash
     or extra path parts do not end up in the id.
     """
     prefix = "https://huggingface.co/"
     if not model_id.startswith(prefix):
         return model_id
     # Join whole path segments; joining a single segment would interleave
     # "/" between its characters (e.g. "o/r/g").
     segments = [part for part in model_id[len(prefix):].split("/") if part]
     return "/".join(segments[:2])
def check_hf_token_validity(hf_token):
    """Report whether ``hf_token`` is accepted by the HF inference API.

    Returns False right away for an empty string or a non-string value.
    Otherwise a small test payload is sent through ``hf_inference_api`` and
    the result is False when the response contains "error", True when it
    does not.
    """
    if hf_token == "" or not isinstance(hf_token, str):
        return False

    # Probe the inference API with a throwaway request to exercise the token.
    probe_model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    probe_payload = {"inputs": "This is a test", "options": {"use_cache": True}}
    api_response = hf_inference_api(probe_model, hf_token, probe_payload)
    return "error" not in api_response

text_classification_ui_helpers.py CHANGED Viewed

@@ -23,8 +23,12 @@ from wordings import (
     CONFIRM_MAPPING_DETAILS_FAIL_RAW,
     MAPPING_STYLED_ERROR_WARNING,
     NOT_TEXT_CLASSIFICATION_MODEL_RAW,
     get_styled_input,
 )
 MAX_LABELS = 40
 MAX_FEATURES = 20
@@ -41,7 +45,7 @@ def get_related_datasets_from_leaderboard(model_id):
     if len(datasets_unique) == 0:
         return gr.update(choices=[], value="")
-    return gr.update(choices=datasets_unique, value=datasets_unique[0])
 logger = logging.getLogger(__file__)
@@ -50,18 +54,16 @@ logger = logging.getLogger(__file__)
 def check_dataset(dataset_id):
     logger.info(f"Loading {dataset_id}")
     try:
-        configs = datasets.get_dataset_config_names(dataset_id)
         if len(configs) == 0:
             return (
                 gr.update(),
                 gr.update(),
                 ""
             )
-        splits = list(
-                    datasets.load_dataset(
-                        dataset_id, configs[0]
-                    ).keys()
-                )
         return (
             gr.update(choices=configs, value=configs[0], visible=True),
             gr.update(choices=splits, value=splits[0], visible=True),
@@ -69,6 +71,8 @@ def check_dataset(dataset_id):
         )
     except Exception as e:
         logger.warn(f"Check your dataset {dataset_id}: {e}")
         return (
             gr.update(),
             gr.update(),
@@ -174,7 +178,7 @@ def precheck_model_ds_enable_example_btn(
         return (gr.update(), gr.update(), "")
     try:
-        ds = datasets.load_dataset(dataset_id, dataset_config)
         df: pd.DataFrame = ds[dataset_split].to_pandas().head(5)
         ds_labels, ds_features = get_labels_and_features_from_dataset(ds[dataset_split])
@@ -185,7 +189,7 @@ def precheck_model_ds_enable_example_btn(
         return (gr.update(interactive=True), gr.update(value=df, visible=True), "")
     except Exception as e:
         # Config or split wrong
-        gr.Warning(f"Failed to load dataset {dataset_id} with config {dataset_config}: {e}")
         return (gr.update(interactive=False), gr.update(value=pd.DataFrame(), visible=False), "")
@@ -214,9 +218,11 @@ def align_columns_and_show_prediction(
     dropdown_placement = [
         gr.Dropdown(visible=False) for _ in range(MAX_LABELS + MAX_FEATURES)
     ]
     prediction_input, prediction_response = get_example_prediction(
-        model_id, dataset_id, dataset_config, dataset_split
     )
     if prediction_input is None or prediction_response is None:
@@ -241,7 +247,7 @@ def align_columns_and_show_prediction(
     model_labels = list(prediction_response.keys())
-    ds = datasets.load_dataset(dataset_id, dataset_config)[dataset_split]
     ds_labels, ds_features = get_labels_and_features_from_dataset(ds)
     # when dataset does not have labels or features
@@ -255,6 +261,16 @@ def align_columns_and_show_prediction(
             "",
             *dropdown_placement,
         )
     column_mappings = list_labels_and_features_from_dataset(
         ds_labels,
@@ -301,10 +317,10 @@ def check_column_mapping_keys_validity(all_mappings):
 def construct_label_and_feature_mapping(all_mappings, ds_labels, ds_features):
     label_mapping = {}
     if len(all_mappings["labels"].keys()) != len(ds_labels):
-        gr.Warning("Label mapping corrupted: " + CONFIRM_MAPPING_DETAILS_FAIL_RAW)
     if len(all_mappings["features"].keys()) != len(ds_features):
-        gr.Warning("Feature mapping corrupted: " + CONFIRM_MAPPING_DETAILS_FAIL_RAW)
     for i, label in zip(range(len(ds_labels)),  ds_labels):
         # align the saved labels with dataset labels order
@@ -315,13 +331,12 @@ def construct_label_and_feature_mapping(all_mappings, ds_labels, ds_features):
     feature_mapping = all_mappings["features"]
     return label_mapping, feature_mapping
 def try_submit(m_id, d_id, config, split, inference, inference_token, uid):
     all_mappings = read_column_mapping(uid)
     check_column_mapping_keys_validity(all_mappings)
     # get ds labels and features again for alignment
-    ds = datasets.load_dataset(d_id, config)[split]
     ds_labels, ds_features = get_labels_and_features_from_dataset(ds)
     label_mapping, feature_mapping = construct_label_and_feature_mapping(all_mappings, ds_labels, ds_features)
@@ -346,6 +361,6 @@ def try_submit(m_id, d_id, config, split, inference, inference_token, uid):
     return (
         gr.update(interactive=False),  # Submit button
-        gr.update(lines=5, visible=True, interactive=False),
         uuid.uuid4(),  # Allocate a new uuid
     )

     CONFIRM_MAPPING_DETAILS_FAIL_RAW,
     MAPPING_STYLED_ERROR_WARNING,
     NOT_TEXT_CLASSIFICATION_MODEL_RAW,
+    UNMATCHED_MODEL_DATASET_STYLED_ERROR,
+    CHECK_LOG_SECTION_RAW,
     get_styled_input,
+    get_dataset_fetch_error_raw,
 )
+import os
 MAX_LABELS = 40
 MAX_FEATURES = 20
     if len(datasets_unique) == 0:
         return gr.update(choices=[], value="")
+    return gr.update(choices=datasets_unique, value="")
 logger = logging.getLogger(__file__)
 def check_dataset(dataset_id):
     logger.info(f"Loading {dataset_id}")
     try:
+        configs = datasets.get_dataset_config_names(dataset_id, trust_remote_code=True)
         if len(configs) == 0:
             return (
                 gr.update(),
                 gr.update(),
                 ""
             )
+        splits = datasets.get_dataset_split_names(
+                        dataset_id, configs[0], trust_remote_code=True
+                    )
         return (
             gr.update(choices=configs, value=configs[0], visible=True),
             gr.update(choices=splits, value=splits[0], visible=True),
         )
     except Exception as e:
         logger.warn(f"Check your dataset {dataset_id}: {e}")
+        if "forbidden" in str(e).lower(): # GSK-2770
+            gr.warning(get_dataset_fetch_error_raw(e))
         return (
             gr.update(),
             gr.update(),
         return (gr.update(), gr.update(), "")
     try:
+        ds = datasets.load_dataset(dataset_id, dataset_config, trust_remote_code=True)
         df: pd.DataFrame = ds[dataset_split].to_pandas().head(5)
         ds_labels, ds_features = get_labels_and_features_from_dataset(ds[dataset_split])
         return (gr.update(interactive=True), gr.update(value=df, visible=True), "")
     except Exception as e:
         # Config or split wrong
+        logger.warn(f"Check your dataset {dataset_id} and config {dataset_config} on split {dataset_split}: {e}")
         return (gr.update(interactive=False), gr.update(value=pd.DataFrame(), visible=False), "")
     dropdown_placement = [
         gr.Dropdown(visible=False) for _ in range(MAX_LABELS + MAX_FEATURES)
     ]
+    hf_token = os.environ.get("HF_WRITE_TOKEN", default="")
     prediction_input, prediction_response = get_example_prediction(
+        model_id, dataset_id, dataset_config, dataset_split, hf_token
     )
     if prediction_input is None or prediction_response is None:
     model_labels = list(prediction_response.keys())
+    ds = datasets.load_dataset(dataset_id, dataset_config, split=dataset_split, trust_remote_code=True)
     ds_labels, ds_features = get_labels_and_features_from_dataset(ds)
     # when dataset does not have labels or features
             "",
             *dropdown_placement,
         )
+    if len(ds_labels) != len(model_labels):
+        return (
+            gr.update(value=UNMATCHED_MODEL_DATASET_STYLED_ERROR, visible=True),
+            gr.update(visible=False),
+            gr.update(visible=False, open=False),
+            gr.update(interactive=False),
+            "",
+            *dropdown_placement,
+        )
     column_mappings = list_labels_and_features_from_dataset(
         ds_labels,
 def construct_label_and_feature_mapping(all_mappings, ds_labels, ds_features):
     label_mapping = {}
     if len(all_mappings["labels"].keys()) != len(ds_labels):
+        logger.warn("Label mapping corrupted: " + CONFIRM_MAPPING_DETAILS_FAIL_RAW)
     if len(all_mappings["features"].keys()) != len(ds_features):
+        logger.warn("Feature mapping corrupted: " + CONFIRM_MAPPING_DETAILS_FAIL_RAW)
     for i, label in zip(range(len(ds_labels)),  ds_labels):
         # align the saved labels with dataset labels order
     feature_mapping = all_mappings["features"]
     return label_mapping, feature_mapping
 def try_submit(m_id, d_id, config, split, inference, inference_token, uid):
     all_mappings = read_column_mapping(uid)
     check_column_mapping_keys_validity(all_mappings)
     # get ds labels and features again for alignment
+    ds = datasets.load_dataset(d_id, config, split=split, trust_remote_code=True)
     ds_labels, ds_features = get_labels_and_features_from_dataset(ds)
     label_mapping, feature_mapping = construct_label_and_feature_mapping(all_mappings, ds_labels, ds_features)
     return (
         gr.update(interactive=False),  # Submit button
+        gr.update(value=f"{CHECK_LOG_SECTION_RAW}Your job id is: {uid}. ", lines=5, visible=True, interactive=False),
         uuid.uuid4(),  # Allocate a new uuid
     )

wordings.py CHANGED Viewed

@@ -2,7 +2,7 @@ INTRODUCTION_MD = """
                 <h1 style="text-align: center;">
                 🐢Giskard Evaluator
                 </h1>
-                Welcome to Giskard Evaluator Space! Get your report immediately by simply input your model id and dataset id below. Follow our leads and improve your model in no time.
                 """
 CONFIRM_MAPPING_DETAILS_MD = """
                             <h1 style="text-align: center;">
@@ -18,13 +18,17 @@ CONFIRM_MAPPING_DETAILS_FAIL_MD = """
                             """
 CONFIRM_MAPPING_DETAILS_FAIL_RAW = """
-                            Sorry, we cannot align the input/output of your dataset with the model. Pleaser double check your model and dataset.
                             """
 CHECK_CONFIG_OR_SPLIT_RAW = """
                             Please check your dataset config or split.
                             """
 PREDICTION_SAMPLE_MD = """
                         <h1 style="text-align: center;">
                         Model Prediction Sample
@@ -33,11 +37,17 @@ PREDICTION_SAMPLE_MD = """
                         """
 MAPPING_STYLED_ERROR_WARNING = """
-                        <h3 style="text-align: center;color: coral; background-color: #fff0f3; border-radius: 8px; padding: 10px; ">
                         Sorry, we cannot auto-align the labels/features of your dataset and model. Please double check.
                         </h3>
                         """
 NOT_TEXT_CLASSIFICATION_MODEL_RAW = """
                         Your model does not fall under the category of text classification. This page is specifically designated for the evaluation of text classification models.
                       """
@@ -61,7 +71,16 @@ USE_INFERENCE_API_TIP = """
                 </b>
             """
 def get_styled_input(input):
     return f"""<h3 style="text-align: center;color: #4ca154; background-color: #e2fbe8; border-radius: 8px; padding: 10px; ">
-            Sample input: {input}
             </h3>"""

                 <h1 style="text-align: center;">
                 🐢Giskard Evaluator
                 </h1>
+                Welcome to Giskard Evaluator Space! Get your report immediately by simply input your model id and dataset id below. Follow our leads and improve your model.
                 """
 CONFIRM_MAPPING_DETAILS_MD = """
                             <h1 style="text-align: center;">
                             """
 CONFIRM_MAPPING_DETAILS_FAIL_RAW = """
+                            Sorry, we cannot auto-align the input/output of your dataset with the model.
                             """
 CHECK_CONFIG_OR_SPLIT_RAW = """
                             Please check your dataset config or split.
                             """
# Confirmation text for a submitted evaluation; the surrounding whitespace is
# kept as-is because the constant is embedded verbatim in other strings.
CHECK_LOG_SECTION_RAW = """
                          You have successfully submitted a Giskard evaluation. Further details are available in the Logs tab. Your report will be posted to your model's community discussion.
                        """
 PREDICTION_SAMPLE_MD = """
                         <h1 style="text-align: center;">
                         Model Prediction Sample
                         """
 MAPPING_STYLED_ERROR_WARNING = """
+                        <h3 style="text-align: center;color: orange; background-color: #fff0f3; border-radius: 8px; padding: 10px; ">
                         Sorry, we cannot auto-align the labels/features of your dataset and model. Please double check.
                         </h3>
                         """
+UNMATCHED_MODEL_DATASET_STYLED_ERROR = """
+                        <h3 style="text-align: center;color: #fa5f5f; background-color: #fbe2e2; border-radius: 8px; padding: 10px; ">
+                        Your model and dataset have different numbers of labels. Please double check your model and dataset.
+                        </h3>
+                        """
 NOT_TEXT_CLASSIFICATION_MODEL_RAW = """
                         Your model does not fall under the category of text classification. This page is specifically designated for the evaluation of text classification models.
                       """
                 </b>
             """
+HF_TOKEN_INVALID_STYLED= """
+                <p style="text-align: left;color: red; ">
+                Your Hugging Face token is invalid. Please double check your token.
+                </p>
+                """
def get_dataset_fetch_error_raw(error):
    """Build the plain-text message saying that a dataset cannot be used.

    ``error`` is formatted into the message as-is, between the fixed lead-in
    and the closing hint to contact the HF team.
    """
    reason = f"{error}"
    return "Sorry you cannot use this dataset because " + reason + " Contact HF team to support this dataset."
 def get_styled_input(input):
     """Return the green "validated" HTML banner that shows the sample input.

     The sample is HTML-escaped before interpolation: it is arbitrary text
     (presumably taken from the dataset; confirm against the caller), and
     unescaped ``<`` or ``&`` would be interpreted as markup instead of being
     displayed.
     """
     import html  # local import so the block does not rely on file-level imports

     sample = html.escape(str(input))
     return f"""<h3 style="text-align: center;color: #4ca154; background-color: #e2fbe8; border-radius: 8px; padding: 10px; ">
             Your model and dataset have been validated! <br /> Sample input: {sample}
             </h3>"""