ncoop57 committed on
Commit
4b039b3
1 Parent(s): 3e6eddc

Add using real data

Browse files
Files changed (1) hide show
  1. app.py +45 -210
app.py CHANGED
@@ -2,219 +2,54 @@ import gradio as gr
2
  import matplotlib.pyplot as plt
3
  import numpy as np
4
  from functools import partial
5
- import datasets
6
  from datasets import load_dataset
7
 
8
- ai4code_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/AI4Code/data.json", use_auth_token=True)
9
- amps_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/AMPS/data.json", use_auth_token=True)
10
- apache_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/ASFPublicMail/data.json", use_auth_token=True)
11
- books3_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/Books3/data.json", use_auth_token=True)
12
- cp_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/CPDataset/data.json", use_auth_token=True)
13
- dmmath_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/DMMath/data.json", use_auth_token=True)
14
- discourse_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/Discourse/data.json", use_auth_token=True)
15
- wiki_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/Enwiki/data.json")
16
- euro_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/EuroParliamentProceedings/data.json", use_auth_token=True)
17
- freelaw_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/FreeLaw_Options/data.json", use_auth_token=True)
18
- ghdiffs_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/GitHubDiff/data.json", use_auth_token=True)
19
- ghissues_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/GitHubIssues/data.json", use_auth_token=True)
20
- gutenberg_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/Gutenberg/data.json", use_auth_token=True)
21
- leet_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/LeetCode/data.json", use_auth_token=True)
22
- pileoflaw_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/PileOfLaw/data.json", use_auth_token=True)
23
- pubmed_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/PubMed/data.json", use_auth_token=True)
24
- s2orc_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/S2ORC/data.json", use_auth_token=True)
25
- se_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/StackExchange/data.json", use_auth_token=True)
26
- usenet_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/USENET/data.json", use_auth_token=True)
27
- uspto_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/USPTO/data.json", use_auth_token=True)
28
- ubuntuirc_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/UbuntuIRC/data.json", use_auth_token=True)
29
- arxiv_ds = load_dataset("CarperAI/pile-v2-small", data_files="data/arXiv/data.json", use_auth_token=True)
 
30
 
31
-
32
- dataset_data = {
33
- "ai4code" : ai4code_ds["train"],
34
- "amps" : amps_ds["train"],
35
- "apache" : apache_ds["train"],
36
- "books3" : books3_ds["train"],
37
- "competitive_programming" : cp_ds["train"],
38
- "dmmath" : dmmath_ds["train"],
39
- "discourse" : discourse_ds["train"],
40
- "enwiki" : wiki_ds["train"],
41
- "euro" : euro_ds["train"],
42
- "freelaw" : freelaw_ds["train"],
43
- "ghdiffs" : ghdiffs_ds["train"],
44
- "ghissues" : ghissues_ds["train"],
45
- "gutenberg" : gutenberg_ds["train"],
46
- "leetcode" : leet_ds["train"],
47
- "pileoflaw" : pileoflaw_ds["train"],
48
- "pubmed" : pubmed_ds["train"],
49
- "s2orc" : s2orc_ds["train"],
50
- "se" : se_ds["train"],
51
- "usenet" : usenet_ds["train"],
52
- "uspto" : uspto_ds["train"],
53
- "ubuntuirc" : ubuntuirc_ds["train"],
54
- "arxiv" : arxiv_ds["train"]
55
  }
56
 
57
- # dataset_data = {
58
- # "AI4Code": {
59
- # # create fake data for the different ratios
60
- # "word_rep_ratios": np.random.randn(1000),
61
- # "char_rep_ratios": np.random.randn(1000),
62
- # "flagged_word_ratios": np.random.randn(1000),
63
- # "num_words": np.random.randint(0, 1000, 1000),
64
- # },
65
- # "AMPS": {
66
- # # create fake data for the different ratios
67
- # "word_rep_ratios": np.random.randn(1000),
68
- # "char_rep_ratios": np.random.randn(1000),
69
- # "flagged_word_ratios": np.random.randn(1000),
70
- # "num_words": np.random.randint(0, 1000, 1000),
71
- # },
72
- # "ASFPublicMail": {
73
- # # create fake data for the different ratios
74
- # "word_rep_ratios": np.random.randn(1000),
75
- # "char_rep_ratios": np.random.randn(1000),
76
- # "flagged_word_ratios": np.random.randn(1000),
77
- # "num_words": np.random.randint(0, 1000, 1000),
78
- # },
79
- # "Books3": {
80
- # # create fake data for the different ratios
81
- # "word_rep_ratios": np.random.randn(1000),
82
- # "char_rep_ratios": np.random.randn(1000),
83
- # "flagged_word_ratios": np.random.randn(1000),
84
- # "num_words": np.random.randint(0, 1000, 1000),
85
- # },
86
- # "CPDataset": {
87
- # # create fake data for the different ratios
88
- # "word_rep_ratios": np.random.randn(1000),
89
- # "char_rep_ratios": np.random.randn(1000),
90
- # "flagged_word_ratios": np.random.randn(1000),
91
- # "num_words": np.random.randint(0, 1000, 1000),
92
- # },
93
- # "DMMath": {
94
- # # create fake data for the different ratios
95
- # "word_rep_ratios": np.random.randn(1000),
96
- # "char_rep_ratios": np.random.randn(1000),
97
- # "flagged_word_ratios": np.random.randn(1000),
98
- # "num_words": np.random.randint(0, 1000, 1000),
99
- # },
100
- # "Discourse": {
101
- # # create fake data for the different ratios
102
- # "word_rep_ratios": np.random.randn(1000),
103
- # "char_rep_ratios": np.random.randn(1000),
104
- # "flagged_word_ratios": np.random.randn(1000),
105
- # "num_words": np.random.randint(0, 1000, 1000),
106
- # },
107
- # "Enwiki": {
108
- # # create fake data for the different ratios
109
- # "word_rep_ratios": np.random.randn(1000),
110
- # "char_rep_ratios": np.random.randn(1000),
111
- # "flagged_word_ratios": np.random.randn(1000),
112
- # "num_words": np.random.randint(0, 1000, 1000),
113
- # },
114
- # "EuroParliamentProceedings": {
115
- # # create fake data for the different ratios
116
- # "word_rep_ratios": np.random.randn(1000),
117
- # "char_rep_ratios": np.random.randn(1000),
118
- # "flagged_word_ratios": np.random.randn(1000),
119
- # "num_words": np.random.randint(0, 1000, 1000),
120
- # },
121
- # "FreeLaw_Options": {
122
- # # create fake data for the different ratios
123
- # "word_rep_ratios": np.random.randn(1000),
124
- # "char_rep_ratios": np.random.randn(1000),
125
- # "flagged_word_ratios": np.random.randn(1000),
126
- # "num_words": np.random.randint(0, 1000, 1000),
127
- # },
128
- # "GitHubDiff": {
129
- # # create fake data for the different ratios
130
- # "word_rep_ratios": np.random.randn(1000),
131
- # "char_rep_ratios": np.random.randn(1000),
132
- # "flagged_word_ratios": np.random.randn(1000),
133
- # "num_words": np.random.randint(0, 1000, 1000),
134
- # },
135
- # "GitHubIssues": {
136
- # # create fake data for the different ratios
137
- # "word_rep_ratios": np.random.randn(1000),
138
- # "char_rep_ratios": np.random.randn(1000),
139
- # "flagged_word_ratios": np.random.randn(1000),
140
- # "num_words": np.random.randint(0, 1000, 1000),
141
- # },
142
- # "Gutenberg": {
143
- # # create fake data for the different ratios
144
- # "word_rep_ratios": np.random.randn(1000),
145
- # "char_rep_ratios": np.random.randn(1000),
146
- # "flagged_word_ratios": np.random.randn(1000),
147
- # "num_words": np.random.randint(0, 1000, 1000),
148
- # },
149
- # "LeetCode": {
150
- # # create fake data for the different ratios
151
- # "word_rep_ratios": np.random.randn(1000),
152
- # "char_rep_ratios": np.random.randn(1000),
153
- # "flagged_word_ratios": np.random.randn(1000),
154
- # "num_words": np.random.randint(0, 1000, 1000),
155
- # },
156
- # "PileOfLaw": {
157
- # # create fake data for the different ratios
158
- # "word_rep_ratios": np.random.randn(1000),
159
- # "char_rep_ratios": np.random.randn(1000),
160
- # "flagged_word_ratios": np.random.randn(1000),
161
- # "num_words": np.random.randint(0, 1000, 1000),
162
- # },
163
- # "PubMed": {
164
- # # create fake data for the different ratios
165
- # "word_rep_ratios": np.random.randn(1000),
166
- # "char_rep_ratios": np.random.randn(1000),
167
- # "flagged_word_ratios": np.random.randn(1000),
168
- # "num_words": np.random.randint(0, 1000, 1000),
169
- # },
170
- # "S2ORC": {
171
- # # create fake data for the different ratios
172
- # "word_rep_ratios": np.random.randn(1000),
173
- # "char_rep_ratios": np.random.randn(1000),
174
- # "flagged_word_ratios": np.random.randn(1000),
175
- # "num_words": np.random.randint(0, 1000, 1000),
176
- # },
177
- # "StackExchange": {
178
- # # create fake data for the different ratios
179
- # "word_rep_ratios": np.random.randn(1000),
180
- # "char_rep_ratios": np.random.randn(1000),
181
- # "flagged_word_ratios": np.random.randn(1000),
182
- # "num_words": np.random.randint(0, 1000, 1000),
183
- # },
184
- # "USENET": {
185
- # # create fake data for the different ratios
186
- # "word_rep_ratios": np.random.randn(1000),
187
- # "char_rep_ratios": np.random.randn(1000),
188
- # "flagged_word_ratios": np.random.randn(1000),
189
- # "num_words": np.random.randint(0, 1000, 1000),
190
- # },
191
- # "USPTO": {
192
- # # create fake data for the different ratios
193
- # "word_rep_ratios": np.random.randn(1000),
194
- # "char_rep_ratios": np.random.randn(1000),
195
- # "flagged_word_ratios": np.random.randn(1000),
196
- # "num_words": np.random.randint(0, 1000, 1000),
197
- # },
198
- # "UbuntuIRC": {
199
- # # create fake data for the different ratios
200
- # "word_rep_ratios": np.random.randn(1000),
201
- # "char_rep_ratios": np.random.randn(1000),
202
- # "flagged_word_ratios": np.random.randn(1000),
203
- # "num_words": np.random.randint(0, 1000, 1000),
204
- # },
205
- # "arXiv": {
206
- # # create fake data for the different ratios
207
- # "word_rep_ratios": np.random.randn(1000),
208
- # "char_rep_ratios": np.random.randn(1000),
209
- # "flagged_word_ratios": np.random.randn(1000),
210
- # "num_words": np.random.randint(0, 1000, 1000),
211
- # },
212
- # }
213
-
214
  def plt_plot(ratio, dataset, threshold):
 
215
  x = dataset_data[dataset][ratio]
216
  # calculate percentage of data that will be removed given threshold
217
- perc = np.sum(x < threshold) / len(x)
218
  # create a figure
219
  fig = plt.figure()
220
  # add a subplot
@@ -233,22 +68,22 @@ def plt_plot(ratio, dataset, threshold):
233
  return fig
234
 
235
  with gr.Blocks() as demo:
236
- dataset = gr.Radio(list(dataset_data.keys()), label="Dataset", value="arXiv")
237
  print(dataset.value)
238
 
239
  with gr.Tab("Character Repetition Ratio"):
240
  # plot some random data
241
  plot = gr.Plot()
242
- threshold = gr.Slider(minimum=0, maximum=100, label="Threshold")
243
  calculate = gr.Button("Calculate")
244
- plot_fn = partial(plt_plot, "word_rep_ratios")
245
  calculate.click(plot_fn, [dataset, threshold], plot)
246
 
247
  with gr.Tab("Word Repetition Ratio"):# plot some random data
248
  plot = gr.Plot()
249
  threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
250
  calculate = gr.Button("Calculate")
251
- plot_fn = partial(plt_plot, "char_rep_ratios")
252
  calculate.click(plot_fn, [dataset, threshold], plot)
253
 
254
  with gr.Tab("Flagged Word Ratio"):# plot some random data
@@ -259,4 +94,4 @@ with gr.Blocks() as demo:
259
  calculate.click(plot_fn, [dataset, threshold], plot)
260
 
261
  if __name__ == "__main__":
262
- demo.launch(share=True)
 
2
  import matplotlib.pyplot as plt
3
  import numpy as np
4
  from functools import partial
 
5
  from datasets import load_dataset
6
 
7
+ dataset_names = [
8
+ "AI4Code",
9
+ "AMPS",
10
+ "ASFPublicMail",
11
+ "CPDataset",
12
+ "DMMath",
13
+ "Discourse",
14
+ "Enwiki",
15
+ "EuroParliamentProceedings",
16
+ "FreeLaw_Options",
17
+ "GithubDiff",
18
+ "GithubIssues",
19
+ "Gutenberg",
20
+ "LeetCode",
21
+ "PileOfLaw",
22
+ "PubMed",
23
+ "S2ORC",
24
+ "StackExchange",
25
+ "USENET",
26
+ "USPTO",
27
+ "UbuntuIRC",
28
+ "arXiv",
29
+ ]
30
 
31
+ dataset_data = {}
32
+ for name in dataset_names:
33
+ path = f"data/{name}/data.json"
34
+ ds = load_dataset(
35
+ "CarperAI/pilev2_smol_metadata",
36
+ data_files=path,
37
+ use_auth_token=True,
38
+ split="train",
39
+ # download_mode="force_redownload",
40
+ )
41
+ dataset_data[name] = {
42
+ "ds": ds,
43
+ "word_rep_ratios": np.random.randn(len(ds)),
44
+ "char_rep_ratios": np.array(ds["check_char_repetition_criteria"]),
45
+ "flagged_word_ratios": np.array(ds["check_flagged_words_criteria"]),
 
 
 
 
 
 
 
 
 
46
  }
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def plt_plot(ratio, dataset, threshold):
49
+ plt.close("all")
50
  x = dataset_data[dataset][ratio]
51
  # calculate percentage of data that will be removed given threshold
52
+ perc = np.sum(x > threshold) / len(x)
53
  # create a figure
54
  fig = plt.figure()
55
  # add a subplot
 
68
  return fig
69
 
70
  with gr.Blocks() as demo:
71
+ dataset = gr.Radio(dataset_names, label="Dataset", value="arXiv")
72
  print(dataset.value)
73
 
74
  with gr.Tab("Character Repetition Ratio"):
75
  # plot some random data
76
  plot = gr.Plot()
77
+ threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
78
  calculate = gr.Button("Calculate")
79
+ plot_fn = partial(plt_plot, "char_rep_ratios")
80
  calculate.click(plot_fn, [dataset, threshold], plot)
81
 
82
  with gr.Tab("Word Repetition Ratio"):# plot some random data
83
  plot = gr.Plot()
84
  threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
85
  calculate = gr.Button("Calculate")
86
+ plot_fn = partial(plt_plot, "word_rep_ratios")
87
  calculate.click(plot_fn, [dataset, threshold], plot)
88
 
89
  with gr.Tab("Flagged Word Ratio"):# plot some random data
 
94
  calculate.click(plot_fn, [dataset, threshold], plot)
95
 
96
  if __name__ == "__main__":
97
+ demo.launch()