victormiller
committed on
Commit
•
8bb7e3f
1
Parent(s):
5b83110
Update main.py
Browse files
main.py
CHANGED
@@ -133,11 +133,11 @@ intro_text = P("Pretraining performant large language models (LLMs) requires tri
|
|
133 |
intro_list = P("We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:")
|
134 |
|
135 |
intro_list1 = Ol(
|
136 |
-
Li("Curates commonly used pretraining datasets, including all CommonCrawl"),
|
137 |
-
Li("Employs carefully selected filters designed for each data source"),
|
138 |
-
Li("Provides only unique data elements via globally deduplicated across all datasets"),
|
139 |
-
Li("Retains all deduplication metadata for custom upweighting"),
|
140 |
-
Li("Is Production ready! Download here [link to HF repo]")
|
141 |
)
|
142 |
|
143 |
previous_intro = P("""We are excited to introduce TxT360, a
|
|
|
133 |
intro_list = P("We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:")
|
134 |
|
135 |
intro_list1 = Ol(
|
136 |
+
Li("Curates commonly used pretraining datasets, including all CommonCrawl", style = "margin-bottom: 5px"),
|
137 |
+
Li("Employs carefully selected filters designed for each data source", style = "margin-bottom: 5px"),
|
138 |
+
Li("Provides only unique data elements via globally deduplicated across all datasets", style = "margin-bottom: 5px"),
|
139 |
+
Li("Retains all deduplication metadata for custom upweighting", style = "margin-bottom: 5px"),
|
140 |
+
Li("Is Production ready! Download here [link to HF repo]", style = "margin-bottom: 5px")
|
141 |
)
|
142 |
|
143 |
previous_intro = P("""We are excited to introduce TxT360, a
|