victormiller
committed on
Commit
•
715785a
1
Parent(s):
0e26631
Update web.py
Browse files
web.py
CHANGED
@@ -72,6 +72,54 @@ data_filtering_table_data = pd.DataFrame(
|
|
72 |
"No",
|
73 |
"No",
|
74 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
"QF: ML-based": [
|
76 |
"No",
|
77 |
"No",
|
@@ -122,40 +170,11 @@ data_filtering_table_data = pd.DataFrame(
|
|
122 |
"No",
|
123 |
"No",
|
124 |
],
|
125 |
-
|
126 |
-
"Yes",
|
127 |
-
"Yes",
|
128 |
-
"No",
|
129 |
-
"No",
|
130 |
-
"No",
|
131 |
-
"Yes",
|
132 |
-
"No",
|
133 |
-
"No",
|
134 |
-
],
|
135 |
-
"Exact Deduplication": [
|
136 |
-
"Bloom Filter",
|
137 |
-
"n/a",
|
138 |
-
"ExactSubStr",
|
139 |
-
"Bloom Filter",
|
140 |
-
"n/a",
|
141 |
-
"Bloom Filter",
|
142 |
-
"n/a",
|
143 |
-
"n/a",
|
144 |
-
],
|
145 |
-
"Fuzzy Deduplication": [
|
146 |
-
"Global",
|
147 |
-
"Local",
|
148 |
-
"Local",
|
149 |
-
"Local",
|
150 |
-
"Local",
|
151 |
-
"Local",
|
152 |
-
"Local",
|
153 |
-
"Global",
|
154 |
-
],
|
155 |
-
}
|
156 |
)
|
157 |
-
|
158 |
-
|
|
|
159 |
|
160 |
def DVS(
|
161 |
left,
|
@@ -366,6 +385,8 @@ def web_data():
|
|
366 |
H2("Web Data Processing Summary"),
|
367 |
P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
|
368 |
table_div_filter_data,
|
|
|
|
|
369 |
P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
|
370 |
Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
|
371 |
P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
|
|
|
72 |
"No",
|
73 |
"No",
|
74 |
],
|
75 |
+
"PII Filtering": [
|
76 |
+
"Yes",
|
77 |
+
"Yes",
|
78 |
+
"No",
|
79 |
+
"No",
|
80 |
+
"No",
|
81 |
+
"Yes",
|
82 |
+
"No",
|
83 |
+
"No",
|
84 |
+
],
|
85 |
+
"Exact Deduplication": [
|
86 |
+
"Bloom Filter",
|
87 |
+
"n/a",
|
88 |
+
"ExactSubStr",
|
89 |
+
"Bloom Filter",
|
90 |
+
"n/a",
|
91 |
+
"Bloom Filter",
|
92 |
+
"n/a",
|
93 |
+
"n/a",
|
94 |
+
],
|
95 |
+
"Fuzzy Deduplication": [
|
96 |
+
"Global",
|
97 |
+
"Local",
|
98 |
+
"Local",
|
99 |
+
"Local",
|
100 |
+
"Local",
|
101 |
+
"Local",
|
102 |
+
"Local",
|
103 |
+
"Global",
|
104 |
+
],
|
105 |
+
}
|
106 |
+
)
|
107 |
+
table_html_filter_data = data_filtering_table_data.to_html(index=False, border=0)
|
108 |
+
table_div_filter_data = Div(NotStr(table_html_filter_data), style="margin: 40px;")
|
109 |
+
|
110 |
+
|
111 |
+
qf_filtering_table_data = pd.DataFrame(
|
112 |
+
{
|
113 |
+
"Dataset": [
|
114 |
+
"TxT360",
|
115 |
+
"FineWeb",
|
116 |
+
"RefinedWeb",
|
117 |
+
"RedPajamaV2",
|
118 |
+
"C4",
|
119 |
+
"Dolma",
|
120 |
+
"RedPajamaV1",
|
121 |
+
"The Pile",
|
122 |
+
],
|
123 |
"QF: ML-based": [
|
124 |
"No",
|
125 |
"No",
|
|
|
170 |
"No",
|
171 |
"No",
|
172 |
],
|
173 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
)
|
175 |
+
table_html_qf_filter_data = qf_filtering_table_data.to_html(index=False, border=0)
|
176 |
+
table_div_qf_filter_data = Div(NotStr(table_html_qf_filter_data), style="margin: 40px;")
|
177 |
+
|
178 |
|
179 |
def DVS(
|
180 |
left,
|
|
|
385 |
H2("Web Data Processing Summary"),
|
386 |
P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
|
387 |
table_div_filter_data,
|
388 |
+
P("ADD EXPLAINER TEXT ABOUT THE QUALITY FILTERS"),
|
389 |
+
table_div_qf_filter_data,
|
390 |
P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
|
391 |
Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
|
392 |
P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
|