hunterhector committed on
Commit
ac7d8cf
·
1 Parent(s): d098e08
Files changed (3) hide show
  1. common.py +3 -3
  2. main.py +5 -4
  3. overview.py +3 -3
common.py CHANGED
@@ -288,7 +288,7 @@ global_div = Div(
288
  "This section discusses all details related to deduplication and filterings steps that were uniformly applied to all data. The section is split into the following topic areas: "
289
  ),
290
  Ul(
291
- Li("Motivation Behind Global Deduplication", style="margin-bottom: 5px"),
292
  Li(
293
  "TxT360 Deduplication Process and Implementation",
294
  style="margin-bottom: 5px",
@@ -302,7 +302,7 @@ global_div = Div(
302
  id="section41",
303
  ),
304
  Section(
305
- H2("Motivation Behind Global Deduplication"),
306
  P(
307
  "Deduplication is beneficial for LM pretraining in several ways, with the most important being controllable upsampling. With unique data, teams gain fine-grained control over the training data. Other benefits of deduplication include avoiding train-test overlap which prevents evaluation contamination."
308
  ),
@@ -408,7 +408,7 @@ global_div = Div(
408
  ),
409
  Section(
410
  H2("Personally Identifiable Information Removal"),
411
- H3("Motivation Behind Personally Identifiable Information Removal"),
412
  P(
413
  "Personally Identifiable Information (PII) refers to any information that can be used to identify an individual, such as names, addresses, phone numbers, email addresses, and social security numbers. PII removal is essential for data privacy and security, as well as for compliance with global regulations. By removing PII from the training data, we can reduce the risk of data breaches and unauthorized access to sensitive information. Additionally, removing PII from training data prevents the models generating that specific PII during inference time."
414
  ),
 
288
  "This section discusses all details related to deduplication and filterings steps that were uniformly applied to all data. The section is split into the following topic areas: "
289
  ),
290
  Ul(
291
+ Li("Why Global Deduplication", style="margin-bottom: 5px"),
292
  Li(
293
  "TxT360 Deduplication Process and Implementation",
294
  style="margin-bottom: 5px",
 
302
  id="section41",
303
  ),
304
  Section(
305
+ H2("Why Global Deduplication"),
306
  P(
307
  "Deduplication is beneficial for LM pretraining in several ways, with the most important being controllable upsampling. With unique data, teams gain fine-grained control over the training data. Other benefits of deduplication include avoiding train-test overlap which prevents evaluation contamination."
308
  ),
 
408
  ),
409
  Section(
410
  H2("Personally Identifiable Information Removal"),
411
+ H3("Why Personally Identifiable Information Removal"),
412
  P(
413
  "Personally Identifiable Information (PII) refers to any information that can be used to identify an individual, such as names, addresses, phone numbers, email addresses, and social security numbers. PII removal is essential for data privacy and security, as well as for compliance with global regulations. By removing PII from the training data, we can reduce the risk of data breaches and unauthorized access to sensitive information. Additionally, removing PII from training data prevents the models generating that specific PII during inference time."
414
  ),
main.py CHANGED
@@ -834,7 +834,7 @@ def intro():
834
  Section(
835
  H2("About TxT360"),
836
  P( "TL;DR ",
837
- B("We introduce TxT360 (Trillion eXtracted Text), the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 high-quality data sources from diverse domains (e.g., FreeLaw, PG-19, etc.). Our large-scale deduplication process enables precise control over data weighting. In addition to document selection, TxT360, along with its rich metadata, allows for the assignment of optimal data weights. We demonstrate a simple but effective upsampling recipe that creates a 15+ trillion-token corpus, outperforming FineWeb 15T. Furthermore, TxT360 empowers pre-trainers to explore more advanced weighting techniques, a capability not commonly available in previous pre-training datasets."
838
  )
839
  ),
840
  P(
@@ -843,22 +843,23 @@ def intro():
843
  D_cite(bibtex_key="fineweb"),
844
  D_cite(bibtex_key="c4"),
845
  D_cite(bibtex_key="muennighoff2023scaling"),
 
846
  ", TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps.",
847
  ),
848
  P(
849
- "Metadata is stored to recover the raw distribution for each dataset, enabling fine-grained control to create data distributions and corpus of desired size. As an example, we present one simple upsampling scheme that takes into account the duplication counts, resulting in a 15~16 trillion token corpus, outperforming FineWeb and our non-upsampling baselines, on diverse evaluations. Unlike DCLM",
850
  D_cite(bibtex_key="dclm"),
851
  "and RedPajama V2,",
852
  D_cite(bibtex_key="redpajama-v2"),
853
  "we present the final deduplicated dataset that is ready to go.",
854
  ),
855
  P(
856
- "We documented all implementation details in this blog post and are open sourcing the code. Examples of each filter and rationale supporting each decision are included."
857
  ),
858
  id="section11",
859
  ),
860
  Section(
861
- H2("Motivation Behind TxT360"),
862
  H3(
863
  "TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
864
  ),
 
834
  Section(
835
  H2("About TxT360"),
836
  P( "TL;DR ",
837
+ B("We introduce TxT360 (Trillion eXtracted Text), the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 high-quality data sources from diverse domains (e.g., FreeLaw, PG-19, etc.). The large-scale deduplication process and rich metadata stored enable precise control over data distribution. In addition to document selection, TxT360, along with its rich metadata, allows for the assignment of optimal data weights. We demonstrate a simple but effective upsampling recipe that creates a 15+ trillion-token corpus, outperforming FineWeb 15T. Furthermore, TxT360 empowers pre-trainers to explore more advanced weighting techniques, a capability not commonly available in previous pre-training datasets."
838
  )
839
  ),
840
  P(
 
843
  D_cite(bibtex_key="fineweb"),
844
  D_cite(bibtex_key="c4"),
845
  D_cite(bibtex_key="muennighoff2023scaling"),
846
+ D_cite(bibtex_key="dolma"),
847
  ", TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps.",
848
  ),
849
  P(
850
+ "Metadata is stored along the processing steps, enabling fine-grained control to create data distributions and corpus of desired size. As an example, we present one simple upsampling scheme that takes into account the duplication counts, resulting in a 15~16 trillion token corpus, outperforming FineWeb and our non-upsampling baselines, on diverse evaluations. Unlike DCLM",
851
  D_cite(bibtex_key="dclm"),
852
  "and RedPajama V2,",
853
  D_cite(bibtex_key="redpajama-v2"),
854
  "we present the final deduplicated dataset that is ready to go.",
855
  ),
856
  P(
857
+ "In line with our 360° open-source initiative, we’ve documented all implementation details in this blog post and will be open-sourcing the code soon (stay tuned!). We also provide examples of each filter along with the rationale behind every decision, with the goal of informing and inspiring future work."
858
  ),
859
  id="section11",
860
  ),
861
  Section(
862
+ H2("Why TxT360"),
863
  H3(
864
  "TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
865
  ),
overview.py CHANGED
@@ -272,12 +272,12 @@ overview_div = Div(
272
  H2("Overview"),
273
  H3("What This Section Contains"),
274
  Ul(
275
- Li("Motivation Behind TxT360", style = "margin-bottom: 5px"),
276
  Li("The Highlevel Data Process Approach", style = "margin-bottom: 5px"),
277
  Li("Introduction to Global Deduplication", style = "margin-bottom: 5px"),
278
  ),
279
- H2("Motivation Behind TxT360"),
280
- H3("TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."),
281
  P("The quality and size of a pre-training dataset play a crucial role in the performance of large language models (LLMs). Data is often referred as low quality if it has not been filtered to review unwanted text. The community has introduced a variety of filtered datasets including purely web-based datasets. Commonly used pretraining datasets are:"),
282
  Ul(
283
  Li("RefinedWeb - cite", style = "margin-bottom: 5px"),
 
272
  H2("Overview"),
273
  H3("What This Section Contains"),
274
  Ul(
275
+ Li("Why TxT360", style = "margin-bottom: 5px"),
276
  Li("The Highlevel Data Process Approach", style = "margin-bottom: 5px"),
277
  Li("Introduction to Global Deduplication", style = "margin-bottom: 5px"),
278
  ),
279
+ H2("Why TxT360"),
280
+ H3("TxT360 is the first dataset to combine both crawled web pages and high quality curated data sources commonly used in pretraining."),
281
  P("The quality and size of a pre-training dataset play a crucial role in the performance of large language models (LLMs). Data is often referred as low quality if it has not been filtered to review unwanted text. The community has introduced a variety of filtered datasets including purely web-based datasets. Commonly used pretraining datasets are:"),
282
  Ul(
283
  Li("RefinedWeb - cite", style = "margin-bottom: 5px"),