ncoop57 committed
Commit 4c20fbb
1 Parent(s): 82935d8

Have initial setup of layout and fake data

Files changed (2)
  1. app.py +81 -20
  2. requirements.txt +4 -0
app.py CHANGED
@@ -1,27 +1,87 @@
  import gradio as gr

- def sentence_builder(quantity, animal, place, activity_list, morning):
-     return f"""The {quantity} {animal}s went to the {place} where they {" and ".join(activity_list)} until the {"morning" if morning else "night"}"""

- demo = gr.Interface(
-     sentence_builder,
-     [
-         gr.Slider(2, 20, value=4),
-         gr.Dropdown(["cat", "dog", "bird"]),
-         gr.Radio(["park", "zoo", "road"]),
-         gr.CheckboxGroup(["ran", "swam", "ate", "slept"]),
-         gr.Checkbox(label="Is it the morning?"),
-     ],
-     "text",
-     examples=[
-         [2, "cat", "park", ["ran", "swam"], True],
-         [4, "dog", "zoo", ["ate", "swam"], False],
-         [10, "bird", "road", ["ran"], False],
-         [8, "cat", "zoo", ["ate"], True],
-     ],
- )

  if __name__ == "__main__":
-     demo.launch()

  import gradio as gr
+ import matplotlib.pyplot as plt
+ import numpy as np

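+ # candidate pile-v2-small subsets (loading is stubbed out while the layout is prototyped):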
+ # from datasets import load_dataset
+ # ai4code_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AI4Code")
+ # amps_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/AMPS")
+ # apache_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/ASFPublicMail")
+ # books3_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Books3")
+ # cp_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/CPDataset")
+ # dmmath_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/DMMath")
+ # discourse_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Discourse")
+ # wiki_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Enwiki")
+ # euro_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/EuroParliamentProceedings")
+ # freelaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/FreeLaw_Options")
+ # ghdiffs_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubDiff")
+ # ghissues_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/GitHubIssues")
+ # gutenberg_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/Gutenberg")
+ # leet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/LeetCode")
+ # pileoflaw_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PileOfLaw")
+ # pubmed_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/PubMed")
+ # s2orc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/S2ORC")
+ # se_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/StackExchange")
+ # usenet_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USENET")
+ # uspto_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/USPTO")
+ # ubuntuirc_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/UbuntuIRC")
+ # arxiv_ds = load_dataset("CarperAI/pile-v2-small", data_dir="data/arXiv")

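+ # NOTE: real per-document metrics are not wired up yet; the histograms below are
+ # driven by fake data generated for each subset.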
+ dataset_names = [
+     "AI4Code", "AMPS", "ASFPublicMail", "Books3", "CPDataset", "DMMath",
+     "Discourse", "Enwiki", "EuroParliamentProceedings", "FreeLaw_Options",
+     "GitHubDiff", "GitHubIssues", "Gutenberg", "LeetCode", "PileOfLaw",
+     "PubMed", "S2ORC", "StackExchange", "USENET", "USPTO", "UbuntuIRC",
+     "arXiv",
+ ]
+
+ # create fake data for the different ratios, one entry per dataset
+ dataset_data = {
+     name: {
+         "word_rep_ratios": np.random.randn(1000),
+         "char_rep_ratios": np.random.randn(1000),
+         "flagged_word_ratios": np.random.randn(1000),
+         "num_words": np.random.randint(0, 1000, 1000),
+     }
+     for name in dataset_names
+ }

+ def plt_plot(threshold, x):
+     # histogram of the metric values for the selected dataset
+     fig = plt.figure()
+     ax = fig.add_subplot(111)
+     ax.hist(x, bins=50)
+     # red dashed line marking the chosen threshold
+     ax.axvline(threshold, color='r', linestyle='dashed', linewidth=2)
+     plt.title("Histogram of random data")
+     plt.xlabel("Value")
+     plt.ylabel("Frequency")
+     return fig
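+ # plt_plot returns a matplotlib Figure, which gr.Plot can render directly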
+
+ with gr.Blocks() as demo:
+     dataset = gr.Radio(list(dataset_data.keys()), label="Dataset")
+     # the Radio's current value (a dataset name) is passed into each callback below
+
+     with gr.Tab("Character Repetition Ratio"):
+         plot = gr.Plot()
+         threshold = gr.Slider(minimum=0, maximum=100, label="Threshold")
+         calculate = gr.Button("Calculate")
+         calculate.click(lambda t, d: plt_plot(t, dataset_data[d]["char_rep_ratios"]), [threshold, dataset], plot)
+
+     with gr.Tab("Word Repetition Ratio"):
+         plot = gr.Plot()
+         threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
+         calculate = gr.Button("Calculate")
+         calculate.click(lambda t, d: plt_plot(t, dataset_data[d]["word_rep_ratios"]), [threshold, dataset], plot)
+
+     with gr.Tab("Flagged Word Ratio"):
+         plot = gr.Plot()
+         threshold = gr.Slider(minimum=0, maximum=1, label="Threshold")
+         calculate = gr.Button("Calculate")
+         calculate.click(lambda t, d: plt_plot(t, dataset_data[d]["flagged_word_ratios"]), [threshold, dataset], plot)

  if __name__ == "__main__":
+     demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ scrubadub
+ squeakily
+ matplotlib
+ numpy