victormiller
committed on
Commit
•
4d3b786
1
Parent(s):
7ab95df
Update main.py
Browse files
main.py
CHANGED
@@ -809,6 +809,7 @@ def intro():
|
|
809 |
P(
|
810 |
"In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Combining both datasets plays a critical role for effective LLM pre-training. By integrating the reach of web data with the quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training. See Results section below."
|
811 |
),
|
|
|
812 |
#P("Table 2: Basic TxT360 Statistics."),
|
813 |
#table_div_data,
|
814 |
id="section2",
|
|
|
809 |
P(
|
810 |
"In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Combining both datasets plays a critical role for effective LLM pre-training. By integrating the reach of web data with the quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training. See Results section below."
|
811 |
),
|
812 |
+
P("** TxT360 does not include code. This decision was made due to the perceived low duplication count of code. TxT360 can easily be combined with leading code dataset."),
|
813 |
#P("Table 2: Basic TxT360 Statistics."),
|
814 |
#table_div_data,
|
815 |
id="section2",
|