ncoop57 committed on
Commit
cf5eed6
1 Parent(s): 4c20fbb

Made more readable and added plots

Browse files
Files changed (1) hide show
  1. app.py +20 -24
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  import matplotlib.pyplot as plt
3
  import numpy as np
 
4
 
5
  # ai4code_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AI4Code")
6
  # amps_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AMPS")
@@ -182,57 +183,52 @@ dataset_data = {
182
  },
183
  }
184
 
185
def plt_plot(threshold, x):
    """Draw a 50-bin histogram of ``x`` with a dashed red line at ``threshold``.

    Args:
        threshold: Position on the value axis where the vertical marker is drawn.
        x: The values to histogram.

    Returns:
        The matplotlib figure containing the histogram.
    """
    # Imported locally so this block does not rely on a new file-level import.
    from matplotlib.figure import Figure

    # Build the figure directly instead of through pyplot: pyplot keeps every
    # figure it creates alive until it is explicitly closed, so creating one
    # per button click would accumulate figures for the lifetime of the app.
    fig = Figure()
    ax = fig.add_subplot(111)

    ax.hist(x, bins=50)
    # Mark the chosen threshold with a red dashed vertical line.
    ax.axvline(threshold, color='r', linestyle='dashed', linewidth=2)

    # Label the axes object explicitly rather than pyplot's "current axes".
    ax.set_title("Histogram of random data")
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    return fig
200
- # x = ["Math", "Business", "Statistics", "IT", "Commerce"]
201
- # y = [68, 73, 82, 74, 85]
202
- # # create a new plot
203
- # plt.rcParams['figure.figsize'] = 6,4
204
- # fig = plt.figure()
205
- # ax = fig.add_axes([0,0,1,1])
206
- # ax.bar(x, y)
207
- # plot red dashed line at threshold
208
- # plt.axhline(y=threshold, color='r', linestyle='--')
209
- # plt.title("Marks per subject")
210
- # plt.xlabel("Subject")
211
- # plt.ylabel("Score")
212
-
213
- # return fig
214
 
def _ratio_plotter(ratio_key):
    """Return a click handler that plots ``ratio_key`` of the selected dataset."""

    def _handler(threshold, dataset_name):
        # Look the series up when the button is clicked: only then is the
        # Radio's selected value (a key of dataset_data) known. Indexing
        # dataset_data with the Radio component itself at build time cannot work.
        return plt_plot(threshold, dataset_data[dataset_name][ratio_key])

    return _handler


with gr.Blocks() as demo:
    # Preselect the first dataset so that clicking "Calculate" before making
    # a choice still hands the handler a valid key.
    dataset = gr.Radio(
        list(dataset_data.keys()),
        label="Dataset",
        value=next(iter(dataset_data), None),
    )

    # One tab per ratio: (tab title, key inside each dataset entry, slider maximum).
    for tab_title, ratio_key, slider_max in (
        ("Character Repetition Ratio", "char_rep_ratios", 100),
        ("Word Repetition Ratio", "word_rep_ratios", 1),
        ("Flagged Word Ratio", "flagged_word_ratios", 1),
    ):
        with gr.Tab(tab_title):
            plot = gr.Plot()
            threshold = gr.Slider(minimum=0, maximum=slider_max, label="Threshold")
            calculate = gr.Button("Calculate")
            # The Radio is passed as an input component so its current value
            # is delivered to the handler on every click.
            calculate.click(_ratio_plotter(ratio_key), [threshold, dataset], plot)

if __name__ == "__main__":
    demo.launch(share=True)
 
1
  import gradio as gr
2
  import matplotlib.pyplot as plt
3
  import numpy as np
4
+ from functools import partial
5
 
6
  # ai4code_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AI4Code")
7
  # amps_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AMPS")
 
183
  },
184
  }
185
 
186
def plt_plot(ratio, dataset, threshold):
    """Draw a histogram of one ratio series of one dataset, with the threshold marked.

    Args:
        ratio: Key of the series inside ``dataset_data[dataset]``
            (for example ``"char_rep_ratios"``).
        dataset: Key of the dataset inside ``dataset_data``.
        threshold: Position of the dashed red marker on the value axis.

    Returns:
        The matplotlib figure containing the histogram. Its title shows the
        dataset name and the share of values that lie below ``threshold``.
    """
    # Imported locally so this block does not rely on a new file-level import.
    from matplotlib.figure import Figure

    # asarray lets the element-wise comparison below work for plain lists too.
    values = np.asarray(dataset_data[dataset][ratio])

    # Share of the data reported as removed for this threshold.
    # NOTE(review): values strictly below the threshold are counted as
    # removed - confirm this is the intended filter direction.
    if values.size:
        perc = np.count_nonzero(values < threshold) / values.size
    else:
        # An empty series has nothing to remove; avoid dividing by zero.
        perc = 0.0

    # Build the figure directly instead of through pyplot: pyplot keeps every
    # figure it creates alive until it is explicitly closed, so creating one
    # per button click would accumulate figures for the lifetime of the app.
    fig = Figure()
    ax = fig.add_subplot(111)

    ax.hist(values, bins=50, color="black")
    # Mark the chosen threshold with a red dashed vertical line.
    ax.axvline(threshold, color='r', linestyle='dashed', linewidth=2)

    ax.set_title(f"{dataset} (removed {perc:.2%})")
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    # Keep the labels and title from being clipped.
    fig.tight_layout()
    return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
with gr.Blocks() as demo:
    dataset = gr.Radio(list(dataset_data.keys()), label="Dataset", value="arXiv")

    # One tab per ratio: (tab title, key inside each dataset entry, slider maximum).
    # Each tab title is paired with its own series; previously the character
    # tab plotted the word ratios and the word tab plotted the character ratios.
    # NOTE(review): the character tab's slider runs to 100 while the others
    # run to 1 - confirm the value range of char_rep_ratios.
    for tab_title, ratio_key, slider_max in (
        ("Character Repetition Ratio", "char_rep_ratios", 100),
        ("Word Repetition Ratio", "word_rep_ratios", 1),
        ("Flagged Word Ratio", "flagged_word_ratios", 1),
    ):
        with gr.Tab(tab_title):
            plot = gr.Plot()
            threshold = gr.Slider(minimum=0, maximum=slider_max, label="Threshold")
            calculate = gr.Button("Calculate")
            # partial fixes the ratio key; the dataset name and threshold
            # arrive from the two input components on each click.
            plot_fn = partial(plt_plot, ratio_key)
            calculate.click(plot_fn, [dataset, threshold], plot)

if __name__ == "__main__":
    demo.launch(share=True)