Spaces:
Running
Running
victormiller
committed on
Commit
•
a552bff
1
Parent(s):
506b4ce
Update main.py
Browse files
main.py
CHANGED
@@ -123,10 +123,15 @@ def main():
|
|
123 |
),
|
124 |
)
|
125 |
|
126 |
-
intro_text = P(
|
127 |
-
"
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
-
intro_list = P("
|
130 |
|
131 |
intro_list1 = Ol(
|
132 |
Li("Curates commonly used pretraining datasets, including all CommonCrawl"),
|
|
|
123 |
),
|
124 |
)
|
125 |
|
126 |
+
intro_text = P("Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects ",
|
127 |
+
A("Amber-7B", href = "https://huggingface.co/LLM360/Amber"),
|
128 |
+
", ",
|
129 |
+
A("Crystal-7B", href = "https://huggingface.co/LLM360/CrystalCoder"),
|
130 |
+
", ",
|
131 |
+
A("K2-65B", href = "https://huggingface.co/LLM360/K2"),
|
132 |
+
"have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.",)
|
133 |
|
134 |
+
intro_list = P("We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:")
|
135 |
|
136 |
intro_list1 = Ol(
|
137 |
Li("Curates commonly used pretraining datasets, including all CommonCrawl"),
|