akhilhsingh committed on
Commit
cc8c661
1 Parent(s): ac00c33

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +0 -404
README.md CHANGED
@@ -8,413 +8,9 @@ sdk_version: 1.35.0
8
  app_file: app.py
9
  pinned: false
10
 
11
- license: odc-by
12
- task_categories:
13
- - text-generation
14
- language:
15
- - en
16
- pretty_name: FineWeb
17
- size_categories:
18
- - n>1T
19
  configs:
20
  - config_name: default
21
  data_files:
22
  - split: train
23
  path: data/*/*
24
- - config_name: sample-10BT
25
- data_files:
26
- - split: train
27
- path: sample/10BT/*
28
- - config_name: sample-100BT
29
- data_files:
30
- - split: train
31
- path: sample/100BT/*
32
- - config_name: sample-350BT
33
- data_files:
34
- - split: train
35
- path: sample/350BT/*
36
- - config_name: CC-MAIN-2024-18
37
- data_files:
38
- - split: train
39
- path: data/CC-MAIN-2024-18/*
40
- - config_name: CC-MAIN-2024-10
41
- data_files:
42
- - split: train
43
- path: data/CC-MAIN-2024-10/*
44
- - config_name: CC-MAIN-2023-50
45
- data_files:
46
- - split: train
47
- path: data/CC-MAIN-2023-50/*
48
- - config_name: CC-MAIN-2023-40
49
- data_files:
50
- - split: train
51
- path: data/CC-MAIN-2023-40/*
52
- - config_name: CC-MAIN-2023-23
53
- data_files:
54
- - split: train
55
- path: data/CC-MAIN-2023-23/*
56
- - config_name: CC-MAIN-2023-14
57
- data_files:
58
- - split: train
59
- path: data/CC-MAIN-2023-14/*
60
- - config_name: CC-MAIN-2023-06
61
- data_files:
62
- - split: train
63
- path: data/CC-MAIN-2023-06/*
64
- - config_name: CC-MAIN-2022-49
65
- data_files:
66
- - split: train
67
- path: data/CC-MAIN-2022-49/*
68
- - config_name: CC-MAIN-2022-40
69
- data_files:
70
- - split: train
71
- path: data/CC-MAIN-2022-40/*
72
- - config_name: CC-MAIN-2022-33
73
- data_files:
74
- - split: train
75
- path: data/CC-MAIN-2022-33/*
76
- - config_name: CC-MAIN-2022-27
77
- data_files:
78
- - split: train
79
- path: data/CC-MAIN-2022-27/*
80
- - config_name: CC-MAIN-2022-21
81
- data_files:
82
- - split: train
83
- path: data/CC-MAIN-2022-21/*
84
- - config_name: CC-MAIN-2022-05
85
- data_files:
86
- - split: train
87
- path: data/CC-MAIN-2022-05/*
88
- - config_name: CC-MAIN-2021-49
89
- data_files:
90
- - split: train
91
- path: data/CC-MAIN-2021-49/*
92
- - config_name: CC-MAIN-2021-43
93
- data_files:
94
- - split: train
95
- path: data/CC-MAIN-2021-43/*
96
- - config_name: CC-MAIN-2021-39
97
- data_files:
98
- - split: train
99
- path: data/CC-MAIN-2021-39/*
100
- - config_name: CC-MAIN-2021-31
101
- data_files:
102
- - split: train
103
- path: data/CC-MAIN-2021-31/*
104
- - config_name: CC-MAIN-2021-25
105
- data_files:
106
- - split: train
107
- path: data/CC-MAIN-2021-25/*
108
- - config_name: CC-MAIN-2021-21
109
- data_files:
110
- - split: train
111
- path: data/CC-MAIN-2021-21/*
112
- - config_name: CC-MAIN-2021-17
113
- data_files:
114
- - split: train
115
- path: data/CC-MAIN-2021-17/*
116
- - config_name: CC-MAIN-2021-10
117
- data_files:
118
- - split: train
119
- path: data/CC-MAIN-2021-10/*
120
- - config_name: CC-MAIN-2021-04
121
- data_files:
122
- - split: train
123
- path: data/CC-MAIN-2021-04/*
124
- - config_name: CC-MAIN-2020-50
125
- data_files:
126
- - split: train
127
- path: data/CC-MAIN-2020-50/*
128
- - config_name: CC-MAIN-2020-45
129
- data_files:
130
- - split: train
131
- path: data/CC-MAIN-2020-45/*
132
- - config_name: CC-MAIN-2020-40
133
- data_files:
134
- - split: train
135
- path: data/CC-MAIN-2020-40/*
136
- - config_name: CC-MAIN-2020-34
137
- data_files:
138
- - split: train
139
- path: data/CC-MAIN-2020-34/*
140
- - config_name: CC-MAIN-2020-29
141
- data_files:
142
- - split: train
143
- path: data/CC-MAIN-2020-29/*
144
- - config_name: CC-MAIN-2020-24
145
- data_files:
146
- - split: train
147
- path: data/CC-MAIN-2020-24/*
148
- - config_name: CC-MAIN-2020-16
149
- data_files:
150
- - split: train
151
- path: data/CC-MAIN-2020-16/*
152
- - config_name: CC-MAIN-2020-10
153
- data_files:
154
- - split: train
155
- path: data/CC-MAIN-2020-10/*
156
- - config_name: CC-MAIN-2020-05
157
- data_files:
158
- - split: train
159
- path: data/CC-MAIN-2020-05/*
160
- - config_name: CC-MAIN-2019-51
161
- data_files:
162
- - split: train
163
- path: data/CC-MAIN-2019-51/*
164
- - config_name: CC-MAIN-2019-47
165
- data_files:
166
- - split: train
167
- path: data/CC-MAIN-2019-47/*
168
- - config_name: CC-MAIN-2019-43
169
- data_files:
170
- - split: train
171
- path: data/CC-MAIN-2019-43/*
172
- - config_name: CC-MAIN-2019-39
173
- data_files:
174
- - split: train
175
- path: data/CC-MAIN-2019-39/*
176
- - config_name: CC-MAIN-2019-35
177
- data_files:
178
- - split: train
179
- path: data/CC-MAIN-2019-35/*
180
- - config_name: CC-MAIN-2019-30
181
- data_files:
182
- - split: train
183
- path: data/CC-MAIN-2019-30/*
184
- - config_name: CC-MAIN-2019-26
185
- data_files:
186
- - split: train
187
- path: data/CC-MAIN-2019-26/*
188
- - config_name: CC-MAIN-2019-22
189
- data_files:
190
- - split: train
191
- path: data/CC-MAIN-2019-22/*
192
- - config_name: CC-MAIN-2019-18
193
- data_files:
194
- - split: train
195
- path: data/CC-MAIN-2019-18/*
196
- - config_name: CC-MAIN-2019-13
197
- data_files:
198
- - split: train
199
- path: data/CC-MAIN-2019-13/*
200
- - config_name: CC-MAIN-2019-09
201
- data_files:
202
- - split: train
203
- path: data/CC-MAIN-2019-09/*
204
- - config_name: CC-MAIN-2019-04
205
- data_files:
206
- - split: train
207
- path: data/CC-MAIN-2019-04/*
208
- - config_name: CC-MAIN-2018-51
209
- data_files:
210
- - split: train
211
- path: data/CC-MAIN-2018-51/*
212
- - config_name: CC-MAIN-2018-47
213
- data_files:
214
- - split: train
215
- path: data/CC-MAIN-2018-47/*
216
- - config_name: CC-MAIN-2018-43
217
- data_files:
218
- - split: train
219
- path: data/CC-MAIN-2018-43/*
220
- - config_name: CC-MAIN-2018-39
221
- data_files:
222
- - split: train
223
- path: data/CC-MAIN-2018-39/*
224
- - config_name: CC-MAIN-2018-34
225
- data_files:
226
- - split: train
227
- path: data/CC-MAIN-2018-34/*
228
- - config_name: CC-MAIN-2018-30
229
- data_files:
230
- - split: train
231
- path: data/CC-MAIN-2018-30/*
232
- - config_name: CC-MAIN-2018-26
233
- data_files:
234
- - split: train
235
- path: data/CC-MAIN-2018-26/*
236
- - config_name: CC-MAIN-2018-22
237
- data_files:
238
- - split: train
239
- path: data/CC-MAIN-2018-22/*
240
- - config_name: CC-MAIN-2018-17
241
- data_files:
242
- - split: train
243
- path: data/CC-MAIN-2018-17/*
244
- - config_name: CC-MAIN-2018-13
245
- data_files:
246
- - split: train
247
- path: data/CC-MAIN-2018-13/*
248
- - config_name: CC-MAIN-2018-09
249
- data_files:
250
- - split: train
251
- path: data/CC-MAIN-2018-09/*
252
- - config_name: CC-MAIN-2018-05
253
- data_files:
254
- - split: train
255
- path: data/CC-MAIN-2018-05/*
256
- - config_name: CC-MAIN-2017-51
257
- data_files:
258
- - split: train
259
- path: data/CC-MAIN-2017-51/*
260
- - config_name: CC-MAIN-2017-47
261
- data_files:
262
- - split: train
263
- path: data/CC-MAIN-2017-47/*
264
- - config_name: CC-MAIN-2017-43
265
- data_files:
266
- - split: train
267
- path: data/CC-MAIN-2017-43/*
268
- - config_name: CC-MAIN-2017-39
269
- data_files:
270
- - split: train
271
- path: data/CC-MAIN-2017-39/*
272
- - config_name: CC-MAIN-2017-34
273
- data_files:
274
- - split: train
275
- path: data/CC-MAIN-2017-34/*
276
- - config_name: CC-MAIN-2017-30
277
- data_files:
278
- - split: train
279
- path: data/CC-MAIN-2017-30/*
280
- - config_name: CC-MAIN-2017-26
281
- data_files:
282
- - split: train
283
- path: data/CC-MAIN-2017-26/*
284
- - config_name: CC-MAIN-2017-22
285
- data_files:
286
- - split: train
287
- path: data/CC-MAIN-2017-22/*
288
- - config_name: CC-MAIN-2017-17
289
- data_files:
290
- - split: train
291
- path: data/CC-MAIN-2017-17/*
292
- - config_name: CC-MAIN-2017-13
293
- data_files:
294
- - split: train
295
- path: data/CC-MAIN-2017-13/*
296
- - config_name: CC-MAIN-2017-09
297
- data_files:
298
- - split: train
299
- path: data/CC-MAIN-2017-09/*
300
- - config_name: CC-MAIN-2017-04
301
- data_files:
302
- - split: train
303
- path: data/CC-MAIN-2017-04/*
304
- - config_name: CC-MAIN-2016-50
305
- data_files:
306
- - split: train
307
- path: data/CC-MAIN-2016-50/*
308
- - config_name: CC-MAIN-2016-44
309
- data_files:
310
- - split: train
311
- path: data/CC-MAIN-2016-44/*
312
- - config_name: CC-MAIN-2016-40
313
- data_files:
314
- - split: train
315
- path: data/CC-MAIN-2016-40/*
316
- - config_name: CC-MAIN-2016-36
317
- data_files:
318
- - split: train
319
- path: data/CC-MAIN-2016-36/*
320
- - config_name: CC-MAIN-2016-30
321
- data_files:
322
- - split: train
323
- path: data/CC-MAIN-2016-30/*
324
- - config_name: CC-MAIN-2016-26
325
- data_files:
326
- - split: train
327
- path: data/CC-MAIN-2016-26/*
328
- - config_name: CC-MAIN-2016-22
329
- data_files:
330
- - split: train
331
- path: data/CC-MAIN-2016-22/*
332
- - config_name: CC-MAIN-2016-18
333
- data_files:
334
- - split: train
335
- path: data/CC-MAIN-2016-18/*
336
- - config_name: CC-MAIN-2016-07
337
- data_files:
338
- - split: train
339
- path: data/CC-MAIN-2016-07/*
340
- - config_name: CC-MAIN-2015-48
341
- data_files:
342
- - split: train
343
- path: data/CC-MAIN-2015-48/*
344
- - config_name: CC-MAIN-2015-40
345
- data_files:
346
- - split: train
347
- path: data/CC-MAIN-2015-40/*
348
- - config_name: CC-MAIN-2015-35
349
- data_files:
350
- - split: train
351
- path: data/CC-MAIN-2015-35/*
352
- - config_name: CC-MAIN-2015-32
353
- data_files:
354
- - split: train
355
- path: data/CC-MAIN-2015-32/*
356
- - config_name: CC-MAIN-2015-27
357
- data_files:
358
- - split: train
359
- path: data/CC-MAIN-2015-27/*
360
- - config_name: CC-MAIN-2015-22
361
- data_files:
362
- - split: train
363
- path: data/CC-MAIN-2015-22/*
364
- - config_name: CC-MAIN-2015-18
365
- data_files:
366
- - split: train
367
- path: data/CC-MAIN-2015-18/*
368
- - config_name: CC-MAIN-2015-14
369
- data_files:
370
- - split: train
371
- path: data/CC-MAIN-2015-14/*
372
- - config_name: CC-MAIN-2015-11
373
- data_files:
374
- - split: train
375
- path: data/CC-MAIN-2015-11/*
376
- - config_name: CC-MAIN-2015-06
377
- data_files:
378
- - split: train
379
- path: data/CC-MAIN-2015-06/*
380
- - config_name: CC-MAIN-2014-52
381
- data_files:
382
- - split: train
383
- path: data/CC-MAIN-2014-52/*
384
- - config_name: CC-MAIN-2014-49
385
- data_files:
386
- - split: train
387
- path: data/CC-MAIN-2014-49/*
388
- - config_name: CC-MAIN-2014-42
389
- data_files:
390
- - split: train
391
- path: data/CC-MAIN-2014-42/*
392
- - config_name: CC-MAIN-2014-41
393
- data_files:
394
- - split: train
395
- path: data/CC-MAIN-2014-41/*
396
- - config_name: CC-MAIN-2014-35
397
- data_files:
398
- - split: train
399
- path: data/CC-MAIN-2014-35/*
400
- - config_name: CC-MAIN-2014-23
401
- data_files:
402
- - split: train
403
- path: data/CC-MAIN-2014-23/*
404
- - config_name: CC-MAIN-2014-15
405
- data_files:
406
- - split: train
407
- path: data/CC-MAIN-2014-15/*
408
- - config_name: CC-MAIN-2014-10
409
- data_files:
410
- - split: train
411
- path: data/CC-MAIN-2014-10/*
412
- - config_name: CC-MAIN-2013-48
413
- data_files:
414
- - split: train
415
- path: data/CC-MAIN-2013-48/*
416
- - config_name: CC-MAIN-2013-20
417
- data_files:
418
- - split: train
419
- path: data/CC-MAIN-2013-20/*
420
  ---
 
8
  app_file: app.py
9
  pinned: false
10
 
 
 
 
 
 
 
 
 
11
  configs:
12
  - config_name: default
13
  data_files:
14
  - split: train
15
  path: data/*/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  ---