Spaces:

polinaeterna
/

text_quality_checker

Running on Zero

App Files Files Community

polinaeterna committed on Aug 29, 2024

Commit

d806dcd

1 Parent(s): e5960a0

add progress bar

Browse files

Files changed (1) hide show

app.py +52 -25

app.py CHANGED Viewed

@@ -2,11 +2,13 @@ import gradio as gr
 import polars as pl
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
 import torch
-import spaces
 from torch import nn
 from transformers import AutoModel, AutoTokenizer, AutoConfig
 from huggingface_hub import PyTorchModelHubMixin
 import pandas as pd
 class QualityModel(nn.Module, PyTorchModelHubMixin):
@@ -31,7 +33,7 @@ model = QualityModel.from_pretrained("nvidia/quality-classifier-deberta").to(dev
 model.eval()
-@spaces.GPU
 def predict(texts: list[str]):
     inputs = tokenizer(
         texts, return_tensors="pt", padding="longest", truncation=True
@@ -44,12 +46,23 @@ def predict(texts: list[str]):
     return predicted_domains
 def plot_and_df(texts, preds):
     texts_df = pd.DataFrame({"quality": preds, "text": texts})
-    counts = pd.DataFrame({"quality": preds}).value_counts().to_frame()
-    counts.reset_index(inplace=True)
     return (
-            gr.BarPlot(counts, x="quality", y="count"),
             texts_df[texts_df["quality"] == "Low"][:20],
             texts_df[texts_df["quality"] == "Medium"][:20],
             texts_df[texts_df["quality"] == "High"][:20],
@@ -62,42 +75,56 @@ def run_quality_check(dataset, column, batch_size, num_examples):
     texts = data[column].to_list()
     # batch_size = 100
     predictions, texts_processed = [], []
-    for i in range(0, min(len(texts), num_examples), batch_size):
         batch_texts = texts[i:i+batch_size]
         batch_predictions = predict(batch_texts)
         predictions.extend(batch_predictions)
         texts_processed.extend(batch_texts)
-        yield plot_and_df(texts_processed, predictions)
 with gr.Blocks() as demo:
-    gr.Markdown("# 💫 Dataset Quality Checker 💫")
     dataset_name = HuggingfaceHubSearch(
             label="Hub Dataset ID",
             placeholder="Search for dataset id on Huggingface",
             search_type="dataset",
-            value="fka/awesome-chatgpt-prompts",
         )
     # config_name = "default"  # TODO: user input
-    @gr.render(inputs=dataset_name)
-    def embed(name):
-        html_code = f"""
-        <iframe
-          src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
-          frameborder="0"
-          width="100%"
-          height="700px"
-        ></iframe>
-            """
-        return gr.HTML(value=html_code)
     text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
-    batch_size = gr.Number(100, label="Batch size")
-    num_examples = gr.Number(1000, label="Num examples to check")
     gr_check_btn = gr.Button("Check Dataset")
     plot = gr.BarPlot()
     with gr.Accordion("Explore some individual examples for each class", open=False):
-        df_low, df_medium, df_high = gr.DataFrame(), gr.DataFrame(), gr.DataFrame()
-    gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size, num_examples], outputs=[plot, df_low, df_medium, df_high])
 demo.launch()

 import polars as pl
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
 import torch
+from holoviews.ipython.widgets import progress
+# import spaces
 from torch import nn
 from transformers import AutoModel, AutoTokenizer, AutoConfig
 from huggingface_hub import PyTorchModelHubMixin
 import pandas as pd
+from collections import Counter
 class QualityModel(nn.Module, PyTorchModelHubMixin):
 model.eval()
+# @spaces.GPU
 def predict(texts: list[str]):
     inputs = tokenizer(
         texts, return_tensors="pt", padding="longest", truncation=True
     return predicted_domains
+# def progress():
+#     title = f"Scan finished" if num_rows == next_row_idx else "Scan in progress..."
 def plot_and_df(texts, preds):
     texts_df = pd.DataFrame({"quality": preds, "text": texts})
+    counts = Counter(preds)
+    counts_df = pd.DataFrame(
+        {
+            "quality": ["Low", "Medium", "High"],
+            "count": [counts.get("Low", 0), counts.get("Medium", 0), counts.get("High", 0)]
+        }
+    )
+    # counts.reset_index(inplace=True)
     return (
+            gr.BarPlot(counts_df, x="quality", y="count"),
             texts_df[texts_df["quality"] == "Low"][:20],
             texts_df[texts_df["quality"] == "Medium"][:20],
             texts_df[texts_df["quality"] == "High"][:20],
     texts = data[column].to_list()
     # batch_size = 100
     predictions, texts_processed = [], []
+    num_examples = min(len(texts), num_examples)
+    for i in range(0, num_examples, batch_size):
         batch_texts = texts[i:i+batch_size]
         batch_predictions = predict(batch_texts)
         predictions.extend(batch_predictions)
         texts_processed.extend(batch_texts)
+        yield {"scan in progress...": (i+batch_size) / num_examples}, *plot_and_df(texts_processed, predictions)
+    yield {"finished": 1.}, *plot_and_df(texts_processed, predictions)
 # UI layout: dataset picker, optional dataset preview, run parameters,
 # and the outputs that run_quality_check streams into.
 with gr.Blocks() as demo:
     gr.Markdown(
         """
         # 💫 Dataset Quality Checker 💫
         Use [nvidia/quality-classifier-deberta](https://huggingface.co/nvidia/quality-classifier-deberta) on any text dataset on the Hub.
         """
     )
     dataset_name = HuggingfaceHubSearch(
             label="Hub Dataset ID",
             placeholder="Search for dataset id on Huggingface",
             search_type="dataset",
             # value="fka/awesome-chatgpt-prompts",
         )
     # config_name = "default"  # TODO: user input
     with gr.Accordion("Dataset preview", open=False):
         # Re-rendered whenever the selected dataset id changes.
         @gr.render(inputs=dataset_name)
         def embed(name):
             """Embed the Hub dataset viewer for `name` (default config, train split)."""
             html_code = f"""
             <iframe
               src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
               frameborder="0"
               width="100%"
               height="700px"
             ></iframe>
                 """
             return gr.HTML(value=html_code)
     text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
     # Minimum is one step (8), not 0: batch_size is used as the step of
     # range() in run_quality_check, and a step of 0 raises ValueError.
     batch_size = gr.Slider(8, 128, 64, step=8, label="Batch size (set this to smaller value if this space crashes.)")
     num_examples = gr.Number(1000, label="Number of first examples to check")
     gr_check_btn = gr.Button("Check Dataset")
     # Receives the {message: fraction} dict yielded first by run_quality_check.
     progress_bar = gr.Label(show_label=False)
     plot = gr.BarPlot()
     with gr.Accordion("Explore some individual examples for each class", open=False):
         gr.Markdown("### Low")
         df_low = gr.DataFrame()
         gr.Markdown("### Medium")
         df_medium = gr.DataFrame()
         gr.Markdown("### High")
         df_high = gr.DataFrame()
     # Output order must match the tuple yielded by run_quality_check:
     # progress dict, bar plot, then the Low / Medium / High example frames.
     gr_check_btn.click(run_quality_check, inputs=[dataset_name, text_column, batch_size, num_examples], outputs=[progress_bar, plot, df_low, df_medium, df_high])
 demo.launch()