hunterhector committed
Commit: ac7d8cf
Parent(s): d098e08
text fix

Files changed:
- common.py (+3 -3)
- main.py (+5 -4)
- overview.py (+3 -3)
common.py
CHANGED
@@ -288,7 +288,7 @@ global_div = Div(
             "This section discusses all details related to deduplication and filterings steps that were uniformly applied to all data. The section is split into the following topic areas: "
         ),
         Ul(
-            Li("
+            Li("Why Global Deduplication", style="margin-bottom: 5px"),
             Li(
                 "TxT360 Deduplication Process and Implementation",
                 style="margin-bottom: 5px",
@@ -302,7 +302,7 @@ global_div = Div(
         id="section41",
     ),
     Section(
-        H2("
+        H2("Why Global Deduplication"),
         P(
             "Deduplication is beneficial for LM pretraining in several ways, with the most important being controllable upsampling. With unique data, teams gain fine-grained control over the training data. Other benefits of deduplication include avoiding train-test overlap which prevents evaluation contamination."
         ),
@@ -408,7 +408,7 @@ global_div = Div(
     ),
     Section(
         H2("Personally Identifiable Information Removal"),
-        H3("
+        H3("Why Personally Identifiable Information Removal"),
         P(
             "Personally Identifiable Information (PII) refers to any information that can be used to identify an individual, such as names, addresses, phone numbers, email addresses, and social security numbers. PII removal is essential for data privacy and security, as well as for compliance with global regulations. By removing PII from the training data, we can reduce the risk of data breaches and unauthorized access to sensitive information. Additionally, removing PII from training data prevents the models generating that specific PII during inference time."
         ),
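Note: the hunk above fills in the "Why Global Deduplication" heading and bullet, whose surrounding text argues that deduplication enables controllable upsampling because unique documents, together with their duplicate counts, give fine-grained control over the training mix. The deduplication pipeline itself is not part of this commit; the snippet below is only a minimal sketch of that idea, using exact content hashing and an illustrative UniqueDoc record. Both are assumptions for illustration, not the TxT360 implementation, which would also need near-duplicate detection at scale.

import hashlib
from dataclasses import dataclass, field

@dataclass
class UniqueDoc:
    text: str
    duplicate_count: int = 1          # how many times this exact text was seen globally
    sources: list = field(default_factory=list)

def global_exact_dedup(docs):
    """Collapse exact duplicates across all sources, keeping a count per unique doc.

    `docs` is an iterable of (source_name, text) pairs. Illustrative exact-hash
    dedup only; the count is kept as metadata so it can later drive upsampling.
    """
    unique = {}                       # sha256 digest -> UniqueDoc
    for source, text in docs:
        key = hashlib.sha256(text.encode("utf-8")).hexdigest()
        if key in unique:
            unique[key].duplicate_count += 1
            unique[key].sources.append(source)
        else:
            unique[key] = UniqueDoc(text=text, sources=[source])
    return list(unique.values())

if __name__ == "__main__":
    corpus = [
        ("commoncrawl-2023-14", "the quick brown fox"),
        ("commoncrawl-2023-23", "the quick brown fox"),   # exact duplicate
        ("freelaw", "some court opinion text"),
    ]
    for doc in global_exact_dedup(corpus):
        print(doc.duplicate_count, doc.sources)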
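Similarly, the "Why Personally Identifiable Information Removal" paragraph lists names, addresses, phone numbers, email addresses, and social security numbers as PII to be scrubbed before training. As a hypothetical illustration only (the patterns and placeholder tokens below are not taken from the TxT360 pipeline), a regex-based redaction pass might look like this:

import re

# Illustrative patterns only; robust PII detection needs far more than these.
PII_PATTERNS = {
    "EMAIL": re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+"),
    "SSN":   re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
    "PHONE": re.compile(r"\b(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"),
}

def redact_pii(text: str) -> str:
    """Replace matched PII spans with a typed placeholder token."""
    for label, pattern in PII_PATTERNS.items():
        text = pattern.sub(f"[{label}]", text)
    return text

print(redact_pii("Contact jane.doe@example.com or 555-123-4567, SSN 123-45-6789."))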
main.py
CHANGED
@@ -834,7 +834,7 @@ def intro():
         Section(
             H2("About TxT360"),
             P( "TL;DR ",
-                B("We introduce TxT360 (Trillion eXtracted Text), the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 high-quality data sources from diverse domains (e.g., FreeLaw, PG-19, etc.).
+                B("We introduce TxT360 (Trillion eXtracted Text), the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 high-quality data sources from diverse domains (e.g., FreeLaw, PG-19, etc.). The large-scale deduplication process and rich metadata stored enables precise control over data distribution. In addition to document selection, TxT360, along with its rich metadata, allows for the assignment of optimal data weights. We demonstrate a simple but effective upsampling recipe that creates a 15+ trillion-token corpus, outperforming FineWeb 15T. Furthermore, TxT360 empowers pre-trainers to explore more advanced weighting techniques, a capability not commonly available in previous pre-training datasets."
             )
         ),
         P(
@@ -843,22 +843,23 @@ def intro():
             D_cite(bibtex_key="fineweb"),
             D_cite(bibtex_key="c4"),
             D_cite(bibtex_key="muennighoff2023scaling"),
+            D_cite(bibtex_key="dolma"),
             ", TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps.",
         ),
         P(
-            "Metadata is stored
+            "Metadata is stored along the processing stpes, enabling fine-grained control to create data distributions and corpus of desired size. As an example, we present one simple upsampling scheme that takes into account the duplication counts, resulting in a 15~16 trillion token corpus, outperforming FineWeb and our non-upsampling baselines, on diverse evaluations. Unlike DCLM",
             D_cite(bibtex_key="dclm"),
             "and RedPajama V2,",
             D_cite(bibtex_key="redpajama-v2"),
             "we present the final deduplicated dataset that is ready to go.",
         ),
         P(
-            "
+            "In line with our 360° open-source initiative, we’ve documented all implementation details in this blog post and will be open-sourcing the code soon (stay tuned!). We also provide examples of each filter along with the rationale behind every decision, with the goal of informing and inspiring future work."
         ),
         id="section11",
     ),
     Section(
-        H2("
+        H2("Why TxT360"),
         H3(
             "TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
         ),
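Note: the new paragraph in the second main.py hunk mentions "one simple upsampling scheme that takes into account the duplication counts", but the scheme itself is not part of this commit. Below is a minimal sketch of what count-aware upsampling could look like; the log-scaled, capped repetition rule and the document fields are assumptions for illustration, not the recipe TxT360 actually uses.

import math

def upsample_weight(duplicate_count: int, cap: int = 8) -> int:
    """Map a document's global duplicate count to a repetition factor.

    Hypothetical rule: repeat roughly log2(count) + 1 times, capped, so widely
    duplicated documents are seen more often without dominating the mix.
    """
    return min(cap, int(math.log2(duplicate_count)) + 1)

def build_training_stream(unique_docs):
    """Yield (text, repetitions) pairs for a count-aware upsampled corpus."""
    for doc in unique_docs:
        yield doc["text"], upsample_weight(doc["duplicate_count"])

docs = [
    {"text": "rare page", "duplicate_count": 1},
    {"text": "widely duplicated article", "duplicate_count": 120},
]
for text, reps in build_training_stream(docs):
    print(reps, text)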
overview.py
CHANGED
@@ -272,12 +272,12 @@ overview_div = Div(
     H2("Overview"),
     H3("What This Section Contains"),
     Ul(
-        Li("
+        Li("Why TxT360", style = "margin-bottom: 5px"),
         Li("The Highlevel Data Process Approach", style = "margin-bottom: 5px"),
         Li("Introduction to Global Deduplication", style = "margin-bottom: 5px"),
     ),
-    H2("
-    H3("TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."),
+    H2("Why TxT360"),
+    H3("TxT360 is the first dataset to combine both crawled web pages and high quality curated data sources commonly used in pretraining."),
     P("The quality and size of a pre-training dataset play a crucial role in the performance of large language models (LLMs). Data is often referred as low quality if it has not been filtered to review unwanted text. The community has introduced a variety of filtered datasets including purely web-based datasets. Commonly used pretraining datasets are:"),
     Ul(
         Li("RefinedWeb - cite", style = "margin-bottom: 5px"),