TxT360

Sleeping

victormiller committed on Sep 26

Commit

8bb7e3f

•

1 Parent(s): 5b83110

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -133,11 +133,11 @@ intro_text = P("Pretraining performant large language models (LLMs) requires tri
 intro_list = P("We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:")
 intro_list1 = Ol(
-                Li("Curates commonly used pretraining datasets, including all CommonCrawl"),
-                Li("Employs carefully selected filters designed for each data source"),
-                Li("Provides only unique data elements via globally deduplicated across all datasets"),
-                Li("Retains all deduplication metadata for custom upweighting"),
-                Li("Is Production ready! Download here [link to HF repo]")
 )
 previous_intro = P("""We are excited to introduce TxT360, a

 intro_list = P("We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:")
 intro_list1 = Ol(
+                Li("Curates commonly used pretraining datasets, including all CommonCrawl", style = "margin-bottom: 5px"),
+                Li("Employs carefully selected filters designed for each data source", style = "margin-bottom: 5px"),
+                Li("Provides only unique data elements via globally deduplicated across all datasets", style = "margin-bottom: 5px"),
+                Li("Retains all deduplication metadata for custom upweighting", style = "margin-bottom: 5px"),
+                Li("Is Production ready! Download here [link to HF repo]", style = "margin-bottom: 5px")
 )
 previous_intro = P("""We are excited to introduce TxT360, a