victormiller
committed on
Commit
•
adcd5e6
1
Parent(s):
8061116
Update main.py
Browse files
main.py
CHANGED
@@ -178,7 +178,7 @@ def main():
|
|
178 |
new_dataset_comparison1 = pd.DataFrame(
|
179 |
{
|
180 |
"Data Source": [
|
181 |
-
"CommonCrawl",
|
182 |
"Papers",
|
183 |
"Wikipedia",
|
184 |
"FreeLaw",
|
@@ -193,7 +193,7 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
193 |
|
194 |
],
|
195 |
"TxT360": [
|
196 |
-
"99
|
197 |
"5 Sources",
|
198 |
"310+ Languages",
|
199 |
"Included",
|
@@ -207,7 +207,7 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
207 |
"**",
|
208 |
],
|
209 |
"FineWeb": [
|
210 |
-
"96
|
211 |
"-",
|
212 |
"-",
|
213 |
"-",
|
@@ -221,7 +221,7 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
221 |
"-",
|
222 |
],
|
223 |
"RefinedWeb": [
|
224 |
-
"90
|
225 |
"-",
|
226 |
"-",
|
227 |
"-",
|
@@ -234,8 +234,8 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
234 |
"-",
|
235 |
"-",
|
236 |
],
|
237 |
-
"
|
238 |
-
"84
|
239 |
"-",
|
240 |
"-",
|
241 |
"-",
|
@@ -249,7 +249,7 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
249 |
"-",
|
250 |
],
|
251 |
"C4": [
|
252 |
-
"1
|
253 |
"-",
|
254 |
"-",
|
255 |
"-",
|
@@ -263,7 +263,7 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
263 |
"-",
|
264 |
],
|
265 |
"Dolma": [
|
266 |
-
"24
|
267 |
"1 Source",
|
268 |
"checkmark",
|
269 |
"-",
|
@@ -276,8 +276,8 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
276 |
"-",
|
277 |
"Included",
|
278 |
],
|
279 |
-
"
|
280 |
-
"5
|
281 |
"1 Source",
|
282 |
"checkmark",
|
283 |
"",
|
@@ -291,7 +291,7 @@ new_dataset_comparison1 = pd.DataFrame(
|
|
291 |
"Included",
|
292 |
],
|
293 |
"The Pile": [
|
294 |
-
"0.6% of 74
|
295 |
"4 Sources",
|
296 |
"English Only",
|
297 |
"Included",
|
@@ -636,8 +636,8 @@ def intro():
|
|
636 |
"TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
|
637 |
),
|
638 |
new_table_div_1,
|
639 |
-
table_div_1,
|
640 |
-
table_div_2,
|
641 |
P(
|
642 |
"In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Combining both datasets plays a critical role for effective LLM pre-training. By integrating the reach of web data with the quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training. See Results section below."
|
643 |
),
|
|
|
178 |
new_dataset_comparison1 = pd.DataFrame(
|
179 |
{
|
180 |
"Data Source": [
|
181 |
+
"CommonCrawl Snapshots",
|
182 |
"Papers",
|
183 |
"Wikipedia",
|
184 |
"FreeLaw",
|
|
|
193 |
|
194 |
],
|
195 |
"TxT360": [
|
196 |
+
"99",
|
197 |
"5 Sources",
|
198 |
"310+ Languages",
|
199 |
"Included",
|
|
|
207 |
"**",
|
208 |
],
|
209 |
"FineWeb": [
|
210 |
+
"96",
|
211 |
"-",
|
212 |
"-",
|
213 |
"-",
|
|
|
221 |
"-",
|
222 |
],
|
223 |
"RefinedWeb": [
|
224 |
+
"90",
|
225 |
"-",
|
226 |
"-",
|
227 |
"-",
|
|
|
234 |
"-",
|
235 |
"-",
|
236 |
],
|
237 |
+
"RedPajamaV2": [
|
238 |
+
"84",
|
239 |
"-",
|
240 |
"-",
|
241 |
"-",
|
|
|
249 |
"-",
|
250 |
],
|
251 |
"C4": [
|
252 |
+
"1",
|
253 |
"-",
|
254 |
"-",
|
255 |
"-",
|
|
|
263 |
"-",
|
264 |
],
|
265 |
"Dolma": [
|
266 |
+
"24",
|
267 |
"1 Source",
|
268 |
"checkmark",
|
269 |
"-",
|
|
|
276 |
"-",
|
277 |
"Included",
|
278 |
],
|
279 |
+
"RedPajamaV1": [
|
280 |
+
"5",
|
281 |
"1 Source",
|
282 |
"checkmark",
|
283 |
"",
|
|
|
291 |
"Included",
|
292 |
],
|
293 |
"The Pile": [
|
294 |
+
"0.6% of 74",
|
295 |
"4 Sources",
|
296 |
"English Only",
|
297 |
"Included",
|
|
|
636 |
"TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
|
637 |
),
|
638 |
new_table_div_1,
|
639 |
+
#table_div_1,
|
640 |
+
#table_div_2,
|
641 |
P(
|
642 |
"In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Combining both datasets plays a critical role for effective LLM pre-training. By integrating the reach of web data with the quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training. See Results section below."
|
643 |
),
|