polinaeterna (HF staff) committed
Commit b1d4b4a · 1 Parent(s): 0a44dc6
Files changed (1)
  1. app.py +20 -12
app.py CHANGED
@@ -103,7 +103,7 @@ def run_quality_check(dataset, column, batch_size, num_examples):
         batch_predictions = predict(batch_texts)
         predictions.extend(batch_predictions)
         texts_processed.extend(batch_texts)
-        yield {"check in progress...": min(i+batch_size, num_examples) / num_examples}, *plot_and_df(texts_processed, predictions), pd.DataFrame()
+        yield {"check in progress...": i / num_examples}, *plot_and_df(texts_processed, predictions), pd.DataFrame()

     # with multiprocessing.Pool(processes=8) as pool:
     #     props = pool.map(proportion_non_ascii, texts)
@@ -130,22 +130,21 @@ def plot_toxicity(scores):
     fig, axs = plt.subplots(2, 3)#, figsize=(10, 6))
     for x, y, score_name in zip([0,0,0,1,1,1], [0,1,2,0,1,2], scores):
         axs[x,y].hist(scores[score_name], bins=20, range=(0., 1.))
-        # axs[x,y].set_title(f'Histogram of {score_name}')
-        axs[x,y].set_xlabel(f'{score_name}')
-        # axs[x,y].set_ylabel('Number of texts')
+        axs[x,y].set_xlabel(score_name)
     fig.supylabel("Number of texts")
     fig.suptitle("Histogram of toxicity scores")
     fig.tight_layout()

     return fig

-def call_perspective_api(texts_df, column_name):#, s):
+def call_perspective_api(texts_df, column_name, full_check=False):
     headers = {
         "content-type": "application/json",
     }
     req_att_scores = {attr: [] for attr in REQUESTED_ATTRIBUTES}

-    texts = texts_df[column_name].values
+    texts = texts_df.sample(100, random_state=16)[column_name].values if not full_check else texts_df[column_name].values
+
     n_samples = len(texts)
     for i, text in tqdm(enumerate(texts), desc="scanning with perspective"):
         data = {
@@ -184,7 +183,8 @@ def call_perspective_api(texts_df, column_name):#, s):
             return req_att_scores
         if i % 10 == 0:
             plot_toxicity(req_att_scores)
-            yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts[:i], **req_att_scores})
+            print(len(texts[:i]), len(req_att_scores["TOXICITY"]))
+            yield {"toxicity check in progress...": i / n_samples}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts[:i+1], **req_att_scores})

     plot_toxicity(req_att_scores)
     yield {"toxicity check finished.": 1.}, plt.gcf(), pd.DataFrame.from_dict({column_name: texts, **req_att_scores})
@@ -224,6 +224,7 @@ with gr.Blocks() as demo:
         """
         # 💫 Dataset Quality Checker 💫
         Use [nvidia/quality-classifier-deberta](https://huggingface.co/nvidia/quality-classifier-deberta) on any text dataset on the Hub.
+        ## Select dataset and text column
         """
     )
     dataset_name = HuggingfaceHubSearch(
@@ -247,6 +248,8 @@ with gr.Blocks() as demo:
         return gr.HTML(value=html_code)

     text_column = gr.Textbox(placeholder="text", label="Text colum name to check (data must be non-nested, raw texts!)")
+
+    gr.Markdown("## Run nvidia quality classifier")
     batch_size = gr.Slider(0, 128, 32, step=8, label="Inference batch size (set this to smaller value if this space crashes.)")
     num_examples = gr.Number(500, label="Number of first examples to check")
     gr_check_btn = gr.Button("Check Dataset")
@@ -262,18 +265,23 @@ with gr.Blocks() as demo:
     gr.Markdown("### High")
     df_high = gr.DataFrame()

-    texts_sample_df = gr.DataFrame(visible=False)
+    texts_df = gr.DataFrame(visible=False)
     gr_check_btn.click(
         run_quality_check,
         inputs=[dataset_name, text_column, batch_size, num_examples],
-        outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_sample_df]
+        outputs=[progress_bar, plot, df_low, df_medium, df_high, texts_df]
     )

-    gr_ascii_btn = gr.Button("Non ascii chars.")
+    gr.Markdown("""## Compute text quality measures
+    * proportion of non-ascii characters
+    * #TODO""")
+    gr_ascii_btn = gr.Button("Data measures")
     non_ascii_hist = gr.Plot()

-    gr_ascii_btn.click(non_ascii_check, inputs=[texts_sample_df, text_column], outputs=[non_ascii_hist])
+    gr_ascii_btn.click(non_ascii_check, inputs=[texts_df, text_column], outputs=[non_ascii_hist])

+    gr.Markdown("## Explore toxicity")
+    checkbox = gr.Checkbox(value=False, label="Run on full first parquet data (better not)")
     gr_toxicity_btn = gr.Button("Run perpspective API to check toxicity of random samples.")
     toxicity_progress_bar = gr.Label(show_label=False)
     toxicity_hist = gr.Plot()
@@ -281,7 +289,7 @@ with gr.Blocks() as demo:
     toxicity_df = gr.DataFrame()
     gr_toxicity_btn.click(
         call_perspective_api,
-        inputs=[texts_sample_df, text_column],
+        inputs=[texts_df, text_column, checkbox],
         outputs=[toxicity_progress_bar, toxicity_hist, toxicity_df]
     )

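A note on the `call_perspective_api` change above: the new `full_check` flag switches between scoring a fixed random sample of 100 rows and scoring every row of the hidden texts dataframe. Below is a minimal sketch of that selection logic in isolation; the `select_texts` helper and the example dataframe are illustrative and not part of app.py:

```python
import pandas as pd

def select_texts(texts_df: pd.DataFrame, column_name: str, full_check: bool = False):
    # Mirrors the commit's selection logic for Perspective API scanning.
    if full_check:
        # Scan every row of the first parquet file (the "better not" option).
        return texts_df[column_name].values
    # Default: a reproducible 100-row sample (random_state=16 matches the commit).
    return texts_df.sample(100, random_state=16)[column_name].values

df = pd.DataFrame({"text": [f"example {i}" for i in range(500)]})
print(len(select_texts(df, "text")))        # 100 rows sent to the API
print(len(select_texts(df, "text", True)))  # all 500 rows
```

One caveat worth flagging in review: `DataFrame.sample(100, ...)` raises a `ValueError` when the dataframe has fewer than 100 rows, so datasets smaller than the sample size would need `full_check=True` or a `min(100, len(texts_df))` guard.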
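More generally, both adjusted `yield` statements depend on Gradio treating generator functions wired through `.click()` as streaming handlers: each yield pushes an intermediate update to the components listed in `outputs`, which is how the progress label, the histogram, and the dataframe refresh while a check is still running. A minimal, self-contained sketch of that pattern follows; the function and component names are illustrative rather than taken from app.py:

```python
import time
import gradio as gr
import pandas as pd

def scan(num_steps):
    scores = []
    for i in range(int(num_steps)):
        time.sleep(0.2)  # stand-in for batched inference or a Perspective API call
        scores.append((i + 1) / num_steps)
        # Each yield streams an intermediate update to every output component.
        # Slicing with i + 1 keeps the table in step with the scores collected so far,
        # which is what the texts[:i+1] fix in this commit does for the toxicity table.
        yield {"check in progress...": (i + 1) / num_steps}, pd.DataFrame({"score": scores[: i + 1]})
    yield {"check finished.": 1.0}, pd.DataFrame({"score": scores})

with gr.Blocks() as demo:
    num_steps = gr.Number(10, label="Steps")
    btn = gr.Button("Run")
    progress = gr.Label(show_label=False)
    table = gr.DataFrame()
    btn.click(scan, inputs=[num_steps], outputs=[progress, table])

demo.launch()
```

The sketch also shows why the final yield sits after the loop: the in-loop yields report a fraction below 1.0, and the last one reports the finished state, matching the "toxicity check finished." update in the commit.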