ncoop57 committed on
Commit
e0be252
1 Parent(s): 4671e61

Add ability to check examples that would be filtered

Browse files
Files changed (1) hide show
  1. app.py +31 -13
app.py CHANGED
@@ -1,10 +1,10 @@
1
  import os
 
2
  import gradio as gr
3
  import matplotlib.pyplot as plt
4
  import numpy as np
5
  from functools import partial
6
  from datasets import load_dataset
7
- from pathlib import Path
8
 
9
  dataset_names = [
10
  "AI4Code",
@@ -43,13 +43,13 @@ for name in dataset_names:
43
  dataset_data[name] = {
44
  "ds": ds,
45
  "word_rep_ratios": np.random.randn(len(ds)),
46
- "char_rep_ratios": np.array(ds["check_char_repetition_criteria"]),
47
- "flagged_word_ratios": np.array(ds["check_flagged_words_criteria"]),
48
  }
49
 
50
- def plt_plot(ratio, dataset, threshold):
51
  plt.close("all")
52
- x = dataset_data[dataset][ratio]
53
  # calculate percentage of data that will be removed given threshold
54
  perc = np.sum(x > threshold) / len(x)
55
  # create a figure
@@ -69,35 +69,53 @@ def plt_plot(ratio, dataset, threshold):
69
  plt.tight_layout()
70
  return fig
71
 
72
- def check_filtered():
73
- ...
 
 
 
 
 
 
 
 
74
 
75
  with gr.Blocks() as demo:
76
  dataset = gr.Radio(dataset_names, label="Dataset", value="arXiv")
77
- print(dataset.value)
78
 
79
- with gr.Tab("Character Repetition Ratio"):
80
  # plot some random data
81
  plot = gr.Plot()
82
  threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
83
  calculate = gr.Button("Calculate")
84
  check = gr.Button("Check Filtered Data")
85
- plot_fn = partial(plt_plot, "char_rep_ratios")
 
86
  calculate.click(plot_fn, [dataset, threshold], plot)
 
 
87
 
88
- with gr.Tab("Word Repetition Ratio"):# plot some random data
89
  plot = gr.Plot()
90
  threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
91
  calculate = gr.Button("Calculate")
 
 
92
  plot_fn = partial(plt_plot, "word_rep_ratios")
93
  calculate.click(plot_fn, [dataset, threshold], plot)
 
 
94
 
95
- with gr.Tab("Flagged Word Ratio"):# plot some random data
96
  plot = gr.Plot()
97
  threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
98
  calculate = gr.Button("Calculate")
99
- plot_fn = partial(plt_plot, "flagged_word_ratios")
 
 
100
  calculate.click(plot_fn, [dataset, threshold], plot)
 
 
101
 
102
  if __name__ == "__main__":
103
  demo.launch()
 
1
  import os
2
+ import random
3
  import gradio as gr
4
  import matplotlib.pyplot as plt
5
  import numpy as np
6
  from functools import partial
7
  from datasets import load_dataset
 
8
 
9
  dataset_names = [
10
  "AI4Code",
 
43
  dataset_data[name] = {
44
  "ds": ds,
45
  "word_rep_ratios": np.random.randn(len(ds)),
46
+ "check_char_repetition_criteria": np.array(ds["check_char_repetition_criteria"]),
47
+ "check_flagged_words_criteria": np.array(ds["check_flagged_words_criteria"]),
48
  }
49
 
50
+ def plt_plot(criteria, dataset, threshold):
51
  plt.close("all")
52
+ x = dataset_data[dataset][criteria]
53
  # calculate percentage of data that will be removed given threshold
54
  perc = np.sum(x > threshold) / len(x)
55
  # create a figure
 
69
  plt.tight_layout()
70
  return fig
71
 
72
def check_filtered(criteria, dataset, threshold):
    """Return the text of one random example that `threshold` would filter out.

    Args:
        criteria: Key into ``dataset_data[dataset]`` naming the per-example
            score array to test (the same key ``plt_plot`` receives).
        dataset: Name of the dataset, i.e. a key of ``dataset_data``.
        threshold: Examples whose score is strictly greater than this value
            count as filtered.

    Returns:
        The ``"text"`` field of a randomly chosen filtered example, or the
        string ``"No examples found"`` when no score exceeds the threshold.
    """
    entry = dataset_data[dataset]
    # Use the score arrays already cached in dataset_data instead of running
    # ds.filter with a Python lambda over every row on each button click.
    # This also keeps the function consistent with plt_plot, and lets it work
    # for any key stored in dataset_data rather than only keys that are also
    # row fields of the dataset.
    flagged = np.flatnonzero(entry[criteria] > threshold)
    if flagged.size == 0:
        return "No examples found"
    # Pick one flagged row uniformly at random; int() turns the numpy integer
    # into a plain Python index.
    row = int(flagged[random.randrange(flagged.size)])
    return entry["ds"][row]["text"]
82
 
83
with gr.Blocks() as demo:
    # A single dataset selector is shared by every tab below.
    dataset = gr.Radio(dataset_names, label="Dataset", value="arXiv")

    # (tab title, key passed to plt_plot / check_filtered) for each criterion.
    tab_specs = (
        ("Character Repetition Criteria", "check_char_repetition_criteria"),
        ("Word Repetition Criteria", "word_rep_ratios"),
        ("Flagged Word Criteria", "check_flagged_words_criteria"),
    )

    for tab_title, criteria_key in tab_specs:
        with gr.Tab(tab_title):
            # Each tab has the same layout: a plot, a threshold slider, two
            # buttons, and a textbox that shows one filtered example.
            plot = gr.Plot()
            threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
            calculate = gr.Button("Calculate")
            check = gr.Button("Check Filtered Data")
            filtered_data = gr.Textbox(lines=5, label="Filtered Data")

            # partial() binds the criterion immediately, so each tab's
            # callbacks keep their own key.
            plot_fn = partial(plt_plot, criteria_key)
            calculate.click(plot_fn, [dataset, threshold], plot)
            check_fn = partial(check_filtered, criteria_key)
            check.click(check_fn, [dataset, threshold], filtered_data)

if __name__ == "__main__":
    demo.launch()