omkarenator committed on
Commit
e384d00
1 Parent(s): ccfaf0a

add distill-style authors, front-matter

Browse files
Files changed (1) hide show
  1. main.py +128 -40
main.py CHANGED
@@ -39,28 +39,86 @@ app, rt = fast_app(
39
  )
40
 
41
 
42
- front_matter = """
43
- <d-front-matter>
44
- <script id='distill-front-matter' type="text/json">{
45
- "title": "",
46
- "description": "",
47
- "published": "",
48
- "affiliation": {},
49
  "authors": [
50
- {
51
- "author":"",
52
- "authorURL":""
53
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  ],
55
- "katex": {
56
- "delimiters": [
57
- {"left": "$$", "right": "$$", "display": false}
58
- ]
59
- }
60
- }
61
- </script>
62
- </d-front-matter>
63
- """
64
 
65
 
66
  def read_bibs():
@@ -78,6 +136,8 @@ def get():
78
 
79
  @app.get("/")
80
  def main():
 
 
81
  return Div(
82
  D_title(
83
  H1(
@@ -91,7 +151,14 @@ def main():
91
  cls="main-plot-container l-page",
92
  ),
93
  ),
94
- Div(D_byline(), NotStr(front_matter), style="display: none;"),
 
 
 
 
 
 
 
95
  D_article(
96
  D_contents(
97
  Nav(
@@ -358,7 +425,6 @@ new_dataset_comparison1 = pd.DataFrame(
358
  "EuroParl",
359
  "StackExchange",
360
  "Code",
361
-
362
  ],
363
  "TxT360": [
364
  "99",
@@ -451,7 +517,7 @@ new_dataset_comparison1 = pd.DataFrame(
451
  "",
452
  " ",
453
  "",
454
- "Included",
455
  "-",
456
  "-",
457
  "-",
@@ -473,16 +539,18 @@ new_dataset_comparison1 = pd.DataFrame(
473
  "Included",
474
  ],
475
  }
476
- )
477
 
478
  styled_table = (
479
  new_dataset_comparison1.style.applymap(
480
  lambda _: "background-color: #E1EEDB", # Green background for col 1
481
- subset=pd.IndexSlice[:, "TxT360"]
482
  )
483
  .applymap(
484
  lambda _: "background-color: white", # White background for all other columns
485
- subset=pd.IndexSlice[:, new_dataset_comparison1.columns.difference(["TxT360"])] # Apply to all columns except "TxT360"
 
 
486
  )
487
  .hide(axis="index") # Hide the row index
488
  )
@@ -762,7 +830,14 @@ styled_table = (
762
  .set_properties(**{"text-align": "center"}) # Center the text in all cells
763
  .set_table_styles(
764
  [
765
- {"selector": "table", "props": [("margin-left", "20%"), ("margin-right", "auto"), ("width", "100%")]}, # Center the table and adjust width
 
 
 
 
 
 
 
766
  ]
767
  )
768
  .hide(axis="index") # Hide the row index
@@ -770,7 +845,9 @@ styled_table = (
770
 
771
  table_html_data = styled_table._repr_html_()
772
  # table_html_data = dataset_sources.to_html(index=False, border=0)
773
- table_div_data = Div(NotStr(table_html_data), style="margin-left: auto; width: 80%; align: center;")
 
 
774
 
775
 
776
  @app.get("/intro")
@@ -779,15 +856,24 @@ def intro():
779
  Section(
780
  H2("About TxT360"),
781
  P(
782
- B("We introduce TxT360 (Trillion eXtracted Text) the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 commonly used non-web data sources (e.g. FreeLaw, PG-19, etc.) providing pretraining teams with a recipe to easily adjust data weighting and train the most performant models.")
 
 
783
  ),
784
  P(
785
  "Building on top of the prior studies on pre-training data,",
786
- D_cite(bibtex_key="refinedweb"), D_cite(bibtex_key="fineweb"), D_cite(bibtex_key="c4"), D_cite(bibtex_key="muennighoff2023scaling"),
787
- "TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps."
 
 
 
788
  ),
789
  P(
790
- "Metadata is stored to recover the raw distribution for each dataset, enabling fine-grained control to create data distributions and corpus of desired size. As an example, we present one simple upsampling scheme that takes into account the duplication counts, resulting in a 15~16 trillion token corpus, outperforming FineWeb and our non-upsampling baselines, on diverse evaluations. Unlike DCLM", D_cite(bibtex_key="dclm"), "and RedPajama V2,", D_cite(bibtex_key="redpajama-v2"), "we present the final deduplicated dataset that is ready to go."
 
 
 
 
791
  ),
792
  P(
793
  "We documented all implementation details in this blog post and are open sourcing the code. Examples of each filter and rationale supporting each decision are included."
@@ -800,14 +886,16 @@ def intro():
800
  "TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
801
  ),
802
  new_table_div_1,
803
- #table_div_1,
804
- #table_div_2,
805
  P(
806
  "In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Combining both datasets plays a critical role for effective LLM pre-training. By integrating the reach of web data with the quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training. See Results section below."
807
  ),
808
- P("** TxT360 does not include code. This decision was made due to the perceived low duplication code with other sources."),
809
- #P("Table 2: Basic TxT360 Statistics."),
810
- #table_div_data,
 
 
811
  id="section2",
812
  ),
813
  Section(
@@ -825,10 +913,10 @@ def intro():
825
  P(
826
  "We provide details and context for the choices behind TxT360 in the respective Web Data Processing and Curated Source Processing section. A deep dive describing the deduplication process can be found in the Commonly Applied Processing Steps section."
827
  ),
828
- #Img(src="images/pipeline.png", height="300", width="600"),
829
- #P(
830
  # "Figure 1: Data processing pipeline. All the steps are adopted for processing web data while the yellow blocks are adopted for processing curated sources."
831
- #),
832
  id="section3",
833
  ),
834
  id="inner-text",
 
39
  )
40
 
41
 
42
+ front_matter = {
43
+ "title": "TxT360",
44
+ "description": "A globally deduplicated dataset for LLM pretraining",
45
+ "published": "October 7, 2024",
 
 
 
46
  "authors": [
47
+ {
48
+ "author": "Liping Tang",
49
+ "authorURL": "https://huggingface.co/Liping",
50
+ "affiliation": "MBZUAI",
51
+ "affiliationURL": "LLM360.ai",
52
+ },
53
+ {
54
+ "author": "Nikhil Ranjan",
55
+ "authorURL": "https://huggingface.co/NikhilRanjan",
56
+ "affiliation": "MBZUAI",
57
+ "affiliationURL": "",
58
+ },
59
+ {
60
+ "author": "Omkar Pangarkar",
61
+ "authorURL": "https://huggingface.co/omkarenator",
62
+ "affiliation": "Petuum, Inc.",
63
+ "affiliationURL": "",
64
+ },
65
+ {
66
+ "author": "Zhen Wang",
67
+ "authorURL": "https://huggingface.co/ZhenWang",
68
+ "affiliation": "MBZUAI",
69
+ "affiliationURL": "",
70
+ },
71
+ {
72
+ "author": "An Li",
73
+ "authorURL": "https://huggingface.co/AnLi",
74
+ "affiliation": "",
75
+ "affiliationURL": "",
76
+ },
77
+ {
78
+ "author": "Zhoujun Cheng",
79
+ "authorURL": "https://huggingface.co/ZhoujunCheng",
80
+ "affiliation": "",
81
+ "affiliationURL": "",
82
+ },
83
+ {
84
+ "author": "Suqi Sun",
85
+ "authorURL": "https://huggingface.co/SuqiSun",
86
+ "affiliation": "Petuum, Inc.",
87
+ "affiliationURL": "",
88
+ },
89
+ {
90
+ "author": "Cun Mu",
91
+ "authorURL": "https://huggingface.co/CunMu",
92
+ "affiliation": "",
93
+ "affiliationURL": "",
94
+ },
95
+ {
96
+ "author": "Victor Miller",
97
+ "authorURL": "https://huggingface.co/VictorMiller",
98
+ "affiliation": "",
99
+ "affiliationURL": "",
100
+ },
101
+ {
102
+ "author": "Yue Peng",
103
+ "authorURL": "https://huggingface.co/YuePeng",
104
+ "affiliation": "",
105
+ "affiliationURL": "",
106
+ },
107
+ {
108
+ "author": "Eric P. Xing",
109
+ "authorURL": "https://huggingface.co/EricXing",
110
+ "affiliation": "MBZUAI & CMU",
111
+ "affiliationURL": "https://www.mbzuai.ac.ae/ & https://www.cs.cmu.edu/",
112
+ },
113
+ {
114
+ "author": "Zhengzhong Liu",
115
+ "authorURL": "https://huggingface.co/ZhengzhongLiu",
116
+ "affiliation": "",
117
+ "affiliationURL": "",
118
+ },
119
  ],
120
+ "katex": {"delimiters": [{"left": "$$", "right": "$$", "display": "false"}]},
121
+ }
 
 
 
 
 
 
 
122
 
123
 
124
  def read_bibs():
 
136
 
137
  @app.get("/")
138
  def main():
139
+ from fasthtml.xtend import Script
140
+
141
  return Div(
142
  D_title(
143
  H1(
 
151
  cls="main-plot-container l-page",
152
  ),
153
  ),
154
+ D_byline(),
155
+ D_front_matter(
156
+ Script(
157
+ json.dumps(front_matter),
158
+ id="distill-front-matter",
159
+ type="text/json",
160
+ )
161
+ ),
162
  D_article(
163
  D_contents(
164
  Nav(
 
425
  "EuroParl",
426
  "StackExchange",
427
  "Code",
 
428
  ],
429
  "TxT360": [
430
  "99",
 
517
  "",
518
  " ",
519
  "",
520
+ "Included",
521
  "-",
522
  "-",
523
  "-",
 
539
  "Included",
540
  ],
541
  }
542
+ )
543
 
544
  styled_table = (
545
  new_dataset_comparison1.style.applymap(
546
  lambda _: "background-color: #E1EEDB", # Green background for col 1
547
+ subset=pd.IndexSlice[:, "TxT360"],
548
  )
549
  .applymap(
550
  lambda _: "background-color: white", # White background for all other columns
551
+ subset=pd.IndexSlice[
552
+ :, new_dataset_comparison1.columns.difference(["TxT360"])
553
+ ], # Apply to all columns except "TxT360"
554
  )
555
  .hide(axis="index") # Hide the row index
556
  )
 
830
  .set_properties(**{"text-align": "center"}) # Center the text in all cells
831
  .set_table_styles(
832
  [
833
+ {
834
+ "selector": "table",
835
+ "props": [
836
+ ("margin-left", "20%"),
837
+ ("margin-right", "auto"),
838
+ ("width", "100%"),
839
+ ],
840
+ }, # Center the table and adjust width
841
  ]
842
  )
843
  .hide(axis="index") # Hide the row index
 
845
 
846
  table_html_data = styled_table._repr_html_()
847
  # table_html_data = dataset_sources.to_html(index=False, border=0)
848
+ table_div_data = Div(
849
+ NotStr(table_html_data), style="margin-left: auto; width: 80%; align: center;"
850
+ )
851
 
852
 
853
  @app.get("/intro")
 
856
  Section(
857
  H2("About TxT360"),
858
  P(
859
+ B(
860
+ "We introduce TxT360 (Trillion eXtracted Text) the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 commonly used non-web data sources (e.g. FreeLaw, PG-19, etc.) providing pretraining teams with a recipe to easily adjust data weighting and train the most performant models."
861
+ )
862
  ),
863
  P(
864
  "Building on top of the prior studies on pre-training data,",
865
+ D_cite(bibtex_key="refinedweb"),
866
+ D_cite(bibtex_key="fineweb"),
867
+ D_cite(bibtex_key="c4"),
868
+ D_cite(bibtex_key="muennighoff2023scaling"),
869
+ "TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps.",
870
  ),
871
  P(
872
+ "Metadata is stored to recover the raw distribution for each dataset, enabling fine-grained control to create data distributions and corpus of desired size. As an example, we present one simple upsampling scheme that takes into account the duplication counts, resulting in a 15~16 trillion token corpus, outperforming FineWeb and our non-upsampling baselines, on diverse evaluations. Unlike DCLM",
873
+ D_cite(bibtex_key="dclm"),
874
+ "and RedPajama V2,",
875
+ D_cite(bibtex_key="redpajama-v2"),
876
+ "we present the final deduplicated dataset that is ready to go.",
877
  ),
878
  P(
879
  "We documented all implementation details in this blog post and are open sourcing the code. Examples of each filter and rationale supporting each decision are included."
 
886
  "TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
887
  ),
888
  new_table_div_1,
889
+ # table_div_1,
890
+ # table_div_2,
891
  P(
892
  "In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Combining both datasets plays a critical role for effective LLM pre-training. By integrating the reach of web data with the quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training. See Results section below."
893
  ),
894
+ P(
895
+ "** TxT360 does not include code. This decision was made due to the perceived low duplication code with other sources."
896
+ ),
897
+ # P("Table 2: Basic TxT360 Statistics."),
898
+ # table_div_data,
899
  id="section2",
900
  ),
901
  Section(
 
913
  P(
914
  "We provide details and context for the choices behind TxT360 in the respective Web Data Processing and Curated Source Processing section. A deep dive describing the deduplication process can be found in the Commonly Applied Processing Steps section."
915
  ),
916
+ # Img(src="images/pipeline.png", height="300", width="600"),
917
+ # P(
918
  # "Figure 1: Data processing pipeline. All the steps are adopted for processing web data while the yellow blocks are adopted for processing curated sources."
919
+ # ),
920
  id="section3",
921
  ),
922
  id="inner-text",