Spaces:
Running
Running
omkarenator
committed on
Merge branch 'main' of hf.co:spaces/LLM360/TxT360-New
Browse files- curated.py +350 -8
- main.py +328 -47
- results.py +27 -30
curated.py
CHANGED
@@ -9,12 +9,355 @@ from rich import print
|
|
9 |
import uuid
|
10 |
import plotly.express as px
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
|
13 |
copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
|
14 |
|
15 |
local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
|
16 |
|
17 |
-
|
18 |
treemap_data = {
|
19 |
'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
|
20 |
'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
|
@@ -467,23 +810,22 @@ def curated(request):
|
|
467 |
table_html = preprocessing_steps.to_html(index=False, border=0)
|
468 |
table_div = Div(NotStr(table_html), style="margin: 40px;")
|
469 |
data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
|
470 |
-
|
471 |
return Div(
|
472 |
-
Section(
|
473 |
H2("Curated Sources: Overview"),
|
474 |
overview_text,
|
475 |
copyright_disclaimer,
|
476 |
plotly2fasthtml(treemap_chart),
|
477 |
table_desc,
|
478 |
-
H2("Curated Sources
|
479 |
-
|
480 |
data_preparation_div,
|
481 |
H3("Data Filtering"),
|
482 |
data_preprocessing_div,
|
483 |
plotly2fasthtml(get_chart_28168342()),
|
484 |
H2("Local Deduplication"),
|
485 |
local_dedup_text,
|
486 |
-
table_div_data_pipe,
|
487 |
id="inner-text",
|
488 |
-
|
489 |
-
|
|
|
9 |
import uuid
|
10 |
import plotly.express as px
|
11 |
|
12 |
+
filtering_process = Div(
|
13 |
+
Section(
|
14 |
+
P("This section contains the specific steps taken to filter all 14 curated source datasets.")
|
15 |
+
),
|
16 |
+
Section(
|
17 |
+
H3("Wikipedia"),
|
18 |
+
H4("Download and Extraction"),
|
19 |
+
Ol(
|
20 |
+
Li("Downloaded from Wikimedia official dump of wikipedia on huggingface https://huggingface.co/datasets/wikimedia/wikipedia/tree/main"),
|
21 |
+
Li("Data is originally in parquet format so we use huggingface dataset.to_json function to convert it to the jsonl format"),
|
22 |
+
),
|
23 |
+
H4("Filtering"),
|
24 |
+
Ol(
|
25 |
+
Li("As we expect the dataset to already be of high quality, only one filter is applied: removing all documents (articles) with fewer than 10 words (not inclusive)"),
|
26 |
+
),
|
27 |
+
H4("Local Deduplication Process"),
|
28 |
+
Ol(
|
29 |
+
Li("Whole wikipedia was deduped using minhash generation following Slim pajama code"),
|
30 |
+
),
|
31 |
+
H4("Global Deduplication Process"),
|
32 |
+
Ol(
|
33 |
+
Li("After local dedup, remaining wikipedia was deduped again with all the datasets combined"),
|
34 |
+
),
|
35 |
+
|
36 |
+
),
|
37 |
+
Section(
|
38 |
+
H3("ArXiv"),
|
39 |
+
H4("Download and Extraction"),
|
40 |
+
Ol(
|
41 |
+
Li("All the data was downloaded in original latex format from Arxiv official S3 dump s3://arxiv/src"),
|
42 |
+
Li("We try to encode the downloaded data into utf-8 or guess encoding using chardet library"),
|
43 |
+
Li("After that pandoc was used to extract information from the latex files and saved as markdown format - code: pandoc -s {tex} -o out/{out_name}.md --wrap=none"),
|
44 |
+
Li("All markdowns were combined to create jsonl files"),
|
45 |
+
),
|
46 |
+
H4("Filtering"),
|
47 |
+
P("Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
|
48 |
+
Ol(
|
49 |
+
Li("min_word: less than 500 words (not inclusive) are discarded"),
|
50 |
+
Li("Language: any language other than English are discarded"),
|
51 |
+
Li("Frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
52 |
+
Li("Unigram log probability: Must have higher than -20 average unigram log probability. To calculate the average log word probability, we use word frequencies extracted from the 1T Web Ngram corpus; specifically, we use the list created by Rachel Tatman. A copy is hosted here."),
|
53 |
+
Li("number 4 above had hyperlinks that need to be included"),
|
54 |
+
),
|
55 |
+
H4("Local Deduplication Process"),
|
56 |
+
Ol(
|
57 |
+
Li("Local dedup was done with all papers combined."),
|
58 |
+
),
|
59 |
+
H4("Global Deduplication Process"),
|
60 |
+
Ol(
|
61 |
+
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
|
62 |
+
),
|
63 |
+
|
64 |
+
),
|
65 |
+
Section(
|
66 |
+
H3("S2ORC"),
|
67 |
+
H4("Download and Extraction"),
|
68 |
+
Ol(
|
69 |
+
Li("This was downloaded directly in zip format using S2ORC api key and normal get request. code: response = urllib.request.urlopen(url)"),
|
70 |
+
Li("Two kinds of datasets were downloaded: S2ORC and S2ORC abstract"),
|
71 |
+
),
|
72 |
+
H4("Filtering - S2ORC"),
|
73 |
+
P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset"),
|
74 |
+
Ol(
|
75 |
+
Li("title_abstract: must have title and abstract"),
|
76 |
+
Li("The paper must be in English. To determine the language of each document, we use the pycld3 library. We run pycld3 on the first 2000 characters of each paragraph in the paper. The language of the paper is the most common language of the paragraphs."),
|
77 |
+
Li("word_count: less than 500 words (not inclusive) are discarded"),
|
78 |
+
Li("paragraph_count: The paper must have at least 5 paragraphs after removing paragraphs with less than -20 average log word probability"),
|
79 |
+
Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
80 |
+
),
|
81 |
+
H4("Filtering - S2ORC Abstract"),
|
82 |
+
P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset. The frequency filter was not used as suggested by peS2o because it was removing good samples as inspected manually"),
|
83 |
+
Ol(
|
84 |
+
Li("title_abstract: must have title and abstract"),
|
85 |
+
Li("language: abstract must be in English"),
|
86 |
+
Li("word_count: less than 20 (not inclusive) are discarded"),
|
87 |
+
Li("Unigram log probability"),
|
88 |
+
Li("frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
89 |
+
),
|
90 |
+
H4("Local Deduplication Process"),
|
91 |
+
Ol(
|
92 |
+
Li("Local dedup was done with all papers combined."),
|
93 |
+
),
|
94 |
+
H4("Global Deduplication Process"),
|
95 |
+
Ol(
|
96 |
+
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
|
97 |
+
),
|
98 |
+
|
99 |
+
),
|
100 |
+
Section(
|
101 |
+
H3("PubMed"),
|
102 |
+
H4("Download and Extraction"),
|
103 |
+
Ol(
|
104 |
+
Li("First all the urls of PMC and PMA files are parsed and stored as text file from FTP server https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"),
|
105 |
+
Li("All the urls are downloaded and the downloaded data is in xml.tar format"),
|
106 |
+
Li("For pubmed central First tar files are opened using tarfile library and then converted to markdown format using pandoc: pandoc -f jats {nxml} -o {pmcid}.md --wrap=none"),
|
107 |
+
Li("All the markdown files are combined to create jsonl files. In jsonl files, 1 line correspond to 1 markdown file."),
|
108 |
+
Li("For pubmed abstract, the XML files are in very simple format and beautiful soup is directly used to extract the abstract, title and pmid and stored in jsonl format"),
|
109 |
+
),
|
110 |
+
H4("Filtering"),
|
111 |
+
P("1. Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset."),
|
112 |
+
Ol(
|
113 |
+
Li("min_word: less than 100 words (not inclusive) are discarded, less than 20 words for pubmed abstract"),
|
114 |
+
Li("Language: any language other than English are discarded"),
|
115 |
+
Li("Frequency: The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace. This filter is not used for pubmed abstract"),
|
116 |
+
Li("Unigram log probability: Must have higher than -20 average unigram log probability. To calculate the average log word probability, we use word frequencies extracted from the 1T Web Ngram corpus; specifically, we use the list created by Rachel Tatman. A copy is hosted here."),
|
117 |
+
Li("need to add the hyperlinks for the section above"),
|
118 |
+
),
|
119 |
+
H4("Local Deduplication Process"),
|
120 |
+
Ol(
|
121 |
+
Li("Local dedup was done with all papers combined."),
|
122 |
+
),
|
123 |
+
H4("Global Deduplication Process"),
|
124 |
+
Ol(
|
125 |
+
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
|
126 |
+
),
|
127 |
+
|
128 |
+
),
|
129 |
+
Section(
|
130 |
+
H3("Phil Papers"),
|
131 |
+
H4("Download and Extraction"),
|
132 |
+
Ol(
|
133 |
+
Li("Original pdf files download location was downloaded from https://philarchive.org/oai.pl "),
|
134 |
+
Li("All pdf files were downloaded"),
|
135 |
+
Li("Pdf was converted to text using java -jar ../philpapers_resources/src/pdfbox-app-2.0.21.jar ExtractText {f0} {FOUT.name}"),
|
136 |
+
Li("Language was detected and added using langdetect library"),
|
137 |
+
),
|
138 |
+
H4("Filtering"),
|
139 |
+
Ol(
|
140 |
+
Li("Many filters were used to clean the phil papers like double whitespaces, new lines etc. All filter details are here: https://github.com/thoppe/The-Pile-PhilPapers/blob/master/pdf_filter.py"),
|
141 |
+
),
|
142 |
+
H4("Local Deduplication Process"),
|
143 |
+
Ol(
|
144 |
+
Li("Local dedup was done with all papers combined."),
|
145 |
+
),
|
146 |
+
H4("Global Deduplication Process"),
|
147 |
+
Ol(
|
148 |
+
Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
|
149 |
+
),
|
150 |
+
|
151 |
+
),
|
152 |
+
Section(
|
153 |
+
H3("Europarl"),
|
154 |
+
H4("Download and Extraction"),
|
155 |
+
Ol(
|
156 |
+
Li("Original data was downloaded from http://www.statmt.org/europarl/v7/europarl.tgz"),
|
157 |
+
Li("Finally the remaining files are converted to jsonl lines"),
|
158 |
+
),
|
159 |
+
H4("Filtering"),
|
160 |
+
Ol(
|
161 |
+
Li("Documents smaller than 200 characters are removed while downloading, so no other filters were run"),
|
162 |
+
Li("Tags were also removed while downloading"),
|
163 |
+
),
|
164 |
+
H4("Local Deduplication Process"),
|
165 |
+
Ol(
|
166 |
+
Li("Local dedup was done within europarl itself"),
|
167 |
+
),
|
168 |
+
H4("Global Deduplication Process"),
|
169 |
+
Ol(
|
170 |
+
Li("After local dedup, remaining europarl was deduped again with all the datasets combined"),
|
171 |
+
),
|
172 |
+
|
173 |
+
),
|
174 |
+
Section(
|
175 |
+
H3("HackerNews"),
|
176 |
+
H4("Download and Extraction"),
|
177 |
+
Ol(
|
178 |
+
Li("Data was parsed using hackernews story ids starting using https://hacker-news.firebaseio.com/v0/item/"),
|
179 |
+
Li("Story ids ranged from 1 to 37500000 (all stories that gave an error while pinging the url were removed). Each post is a story, with each reply another story"),
|
180 |
+
Li("As there were too many requests error, there was a wait(2 sec) statement included in the code"),
|
181 |
+
Li("As the number of stories were large and containing all the replies was time consuming and possibility of introducing too much error, only longest depth threads were included from 3rd level onwards. So we include the title then all the replies (2nd level) but replies to those replies (3rd level) were only the ones which has maximum depth."),
|
182 |
+
),
|
183 |
+
H4("Filtering"),
|
184 |
+
Ol(
|
185 |
+
Li("Min word: 10"),
|
186 |
+
Li("Language: Only english"),
|
187 |
+
Li("Unigram log probability"),
|
188 |
+
),
|
189 |
+
H4("Local Deduplication Process"),
|
190 |
+
Ol(
|
191 |
+
Li("Local dedup was done within hackernews itself"),
|
192 |
+
),
|
193 |
+
H4("Global Deduplication Process"),
|
194 |
+
Ol(
|
195 |
+
Li("After local dedup, remaining data was deduped again with all the datasets combined"),
|
196 |
+
),
|
197 |
+
|
198 |
+
),
|
199 |
+
Section(
|
200 |
+
H3("USPTO"),
|
201 |
+
H4("Download and Extraction"),
|
202 |
+
Ol(
|
203 |
+
Li("Data was downloaded and extracted using tags from https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"),
|
204 |
+
Li("There were three different formats that needed three different functions to download and extract the data based on year: Pre_2002, 2002_to_2004, post_2004"),
|
205 |
+
|
206 |
+
),
|
207 |
+
H4("Filtering"),
|
208 |
+
Ol(
|
209 |
+
Li("Min word: 50"),
|
210 |
+
Li("Language: Only english"),
|
211 |
+
Li("Unigram log probability"),
|
212 |
+
),
|
213 |
+
H4("Local Deduplication Process"),
|
214 |
+
Ol(
|
215 |
+
Li("Local dedup was done within USPTO itself"),
|
216 |
+
),
|
217 |
+
H4("Global Deduplication Process"),
|
218 |
+
Ol(
|
219 |
+
Li("After local dedup, remaining data was deduped again with all the datasets combined"),
|
220 |
+
),
|
221 |
+
|
222 |
+
),
|
223 |
+
Section(
|
224 |
+
H3("FreeLaw"),
|
225 |
+
H4("Download and Extraction"),
|
226 |
+
Ol(
|
227 |
+
Li("CSV format bulk data was downloaded from https://storage.courtlistener.com/bulk-data/"),
|
228 |
+
Li("They have multiple dumps as shown below with lot of duplicates (exact number is given in the table at the top)"),
|
229 |
+
Li("there is an image to show here!"),
|
230 |
+
Li("As these are csv files, they have multiple columns where text can be present, so we extracted text from the following columns using html2text function which just convert and extract tags from html tags"),
|
231 |
+
Li("image to show"),
|
232 |
+
Li("Text was also extracted from row named 'plain_text'"),
|
233 |
+
Li("Priority is always given to plain_text first then from 6 to 1 in the subsequent order following pile logic"),
|
234 |
+
),
|
235 |
+
H4("Filtering"),
|
236 |
+
Ol(
|
237 |
+
Li("Min word: 50"),
|
238 |
+
Li("Language: Only english"),
|
239 |
+
Li("Unigram log probability"),
|
240 |
+
),
|
241 |
+
H4("Local Deduplication Process"),
|
242 |
+
Ol(
|
243 |
+
Li("Local dedup was done within freelaw itself which removed 90%+ duplicates"),
|
244 |
+
),
|
245 |
+
H4("Global Deduplication Process"),
|
246 |
+
Ol(
|
247 |
+
Li("After local dedup, remaining data was deduped again with all the datasets combined"),
|
248 |
+
),
|
249 |
+
|
250 |
+
),
|
251 |
+
Section(
|
252 |
+
H3("StackExchange"),
|
253 |
+
H4("Download and Extraction"),
|
254 |
+
Ol(
|
255 |
+
Li("Archive dump was used to download data from all the stackexchange sub urls, eg., math.stackexchange etc."),
|
256 |
+
Li("Raw data is in XML format with lot of metadata. We only used two files Posts.xml and Comments.xml"),
|
257 |
+
Li("We parsed using post_id to connect each question to answer and then to comments so our data has same hierarchy as stackexchange UI"),
|
258 |
+
Li("""
|
259 |
+
1. Questions:
|
260 |
+
2. Comment1:
|
261 |
+
3. Comment2:
|
262 |
+
4. Answer1:
|
263 |
+
5. Comment1:
|
264 |
+
6. Comment2:
|
265 |
+
7. Answer2:
|
266 |
+
8. Comment1:
|
267 |
+
9. Comment2:
|
268 |
+
"""),
|
269 |
+
),
|
270 |
+
H4("Filtering"),
|
271 |
+
Ol(
|
272 |
+
Li("Min word: 10"),
|
273 |
+
),
|
274 |
+
H4("Local Deduplication Process"),
|
275 |
+
Ol(
|
276 |
+
Li("Local dedup was done within stackexchange itself"),
|
277 |
+
),
|
278 |
+
H4("Global Deduplication Process"),
|
279 |
+
Ol(
|
280 |
+
Li("After local dedup, remaining data was deduped again with all the datasets combined"),
|
281 |
+
),
|
282 |
+
|
283 |
+
),
|
284 |
+
Section(
|
285 |
+
H3("Ubuntu IRC"),
|
286 |
+
H4("Download and Extraction"),
|
287 |
+
Ol(
|
288 |
+
Li("All the data was downloaded from https://irclogs.ubuntu.com/{date.year}/{date.month:02d}/{date.day:02d}/ based on the year"),
|
289 |
+
Li("During extraction, we cleaned the logs using following functions"),
|
290 |
+
Li("image here"),
|
291 |
+
),
|
292 |
+
H4("Filtering"),
|
293 |
+
Ol(
|
294 |
+
Li("Min word: 10"),
|
295 |
+
Li("Language: Only english"),
|
296 |
+
Li("Unigram log probability"),
|
297 |
+
),
|
298 |
+
H4("Local Deduplication Process"),
|
299 |
+
Ol(
|
300 |
+
Li("Local dedup was done within Ubuntu IRC itself"),
|
301 |
+
),
|
302 |
+
H4("Global Deduplication Process"),
|
303 |
+
Ol(
|
304 |
+
Li("After local dedup, remaining data was deduped again with all the datasets combined"),
|
305 |
+
),
|
306 |
+
|
307 |
+
),
|
308 |
+
Section(
|
309 |
+
H3("DM Maths"),
|
310 |
+
H4("Download and Extraction"),
|
311 |
+
Ol(
|
312 |
+
Li("Directly downloaded from hugging-face dump dm_maths/"),
|
313 |
+
Li("Data was converted in jsonl format where each lines are : Question: TEXT Answer: TEXT"),
|
314 |
+
),
|
315 |
+
H4("Filtering"),
|
316 |
+
Ol(
|
317 |
+
Li("None"),
|
318 |
+
),
|
319 |
+
H4("Local Deduplication Process"),
|
320 |
+
Ol(
|
321 |
+
Li("None"),
|
322 |
+
),
|
323 |
+
H4("Global Deduplication Process"),
|
324 |
+
Ol(
|
325 |
+
Li("None"),
|
326 |
+
),
|
327 |
+
|
328 |
+
),
|
329 |
+
Section(
|
330 |
+
H3("PG19"),
|
331 |
+
H4("Download and Extraction"),
|
332 |
+
Ol(
|
333 |
+
Li("Directly downloaded from hugging-face dump pg19/"),
|
334 |
+
),
|
335 |
+
H4("Filtering"),
|
336 |
+
Ol(
|
337 |
+
Li("Min word: 20"),
|
338 |
+
Li("Language: ???"),
|
339 |
+
Li("Unigram log probability"),
|
340 |
+
),
|
341 |
+
H4("Local Deduplication Process"),
|
342 |
+
Ol(
|
343 |
+
Li("Local dedup was done within PG19 itself"),
|
344 |
+
),
|
345 |
+
H4("Global Deduplication Process"),
|
346 |
+
Ol(
|
347 |
+
Li("After local dedup, remaining data was deduped again with all the datasets combined"),
|
348 |
+
),
|
349 |
+
|
350 |
+
),
|
351 |
+
)
|
352 |
+
|
353 |
+
|
354 |
+
|
355 |
+
|
356 |
overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
|
357 |
copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
|
358 |
|
359 |
local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
|
360 |
|
|
|
361 |
treemap_data = {
|
362 |
'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
|
363 |
'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
|
|
|
810 |
table_html = preprocessing_steps.to_html(index=False, border=0)
|
811 |
table_div = Div(NotStr(table_html), style="margin: 40px;")
|
812 |
data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
|
813 |
+
|
814 |
return Div(
|
|
|
815 |
H2("Curated Sources: Overview"),
|
816 |
overview_text,
|
817 |
copyright_disclaimer,
|
818 |
plotly2fasthtml(treemap_chart),
|
819 |
table_desc,
|
820 |
+
H2("Curated Sources Processing"),
|
821 |
+
filtering_process,
|
822 |
data_preparation_div,
|
823 |
H3("Data Filtering"),
|
824 |
data_preprocessing_div,
|
825 |
plotly2fasthtml(get_chart_28168342()),
|
826 |
H2("Local Deduplication"),
|
827 |
local_dedup_text,
|
828 |
+
table_div_data_pipe,
|
829 |
id="inner-text",
|
830 |
+
)
|
831 |
+
|
main.py
CHANGED
@@ -112,7 +112,7 @@ def main():
|
|
112 |
),
|
113 |
Li(
|
114 |
A(
|
115 |
-
"
|
116 |
href="/intro#section2",
|
117 |
hx_get="/intro#section2",
|
118 |
hx_target="#inner-text",
|
@@ -120,7 +120,7 @@ def main():
|
|
120 |
),
|
121 |
Li(
|
122 |
A(
|
123 |
-
"
|
124 |
href="/intro#section3",
|
125 |
hx_get="/intro#section3",
|
126 |
hx_target="#inner-text",
|
@@ -128,7 +128,7 @@ def main():
|
|
128 |
),
|
129 |
Li(
|
130 |
A(
|
131 |
-
"
|
132 |
href="/intro#section4",
|
133 |
hx_get="/intro#section4",
|
134 |
hx_target="#inner-text",
|
@@ -136,11 +136,11 @@ def main():
|
|
136 |
),
|
137 |
),
|
138 |
),
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
Div(
|
145 |
A("Global Processing Steps", href="#inner-text"),
|
146 |
hx_get="/common",
|
@@ -210,56 +210,337 @@ intro_list1 = Ol(
|
|
210 |
)
|
211 |
|
212 |
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
Section(
|
224 |
-
|
225 |
-
|
226 |
-
|
|
|
|
|
|
|
227 |
),
|
228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
),
|
230 |
-
|
231 |
-
|
232 |
-
P(
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
),
|
242 |
-
|
243 |
),
|
|
|
|
|
|
|
|
|
|
|
244 |
Section(
|
245 |
-
|
246 |
-
P(
|
247 |
-
|
248 |
-
),
|
249 |
-
|
250 |
-
P(
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
"In the remainder of this blog post, we will walk you through the entire process and the rationale behind each decision. Enjoy!"
|
255 |
-
),
|
256 |
-
id="section4",
|
257 |
),
|
|
|
258 |
id="inner-text",
|
259 |
)
|
260 |
|
261 |
|
262 |
-
rt("/overview")(overview.overview)
|
263 |
rt("/curated")(curated.curated)
|
264 |
rt("/curated/{target}")(curated.update)
|
265 |
|
|
|
112 |
),
|
113 |
Li(
|
114 |
A(
|
115 |
+
"Motivation Behind Txt360",
|
116 |
href="/intro#section2",
|
117 |
hx_get="/intro#section2",
|
118 |
hx_target="#inner-text",
|
|
|
120 |
),
|
121 |
Li(
|
122 |
A(
|
123 |
+
"Generalizable Approach to Data Processing",
|
124 |
href="/intro#section3",
|
125 |
hx_get="/intro#section3",
|
126 |
hx_target="#inner-text",
|
|
|
128 |
),
|
129 |
Li(
|
130 |
A(
|
131 |
+
"Introducing Global Deduplication",
|
132 |
href="/intro#section4",
|
133 |
hx_get="/intro#section4",
|
134 |
hx_target="#inner-text",
|
|
|
136 |
),
|
137 |
),
|
138 |
),
|
139 |
+
# Div(
|
140 |
+
# A("Overview", href="#inner-text"),
|
141 |
+
# hx_get="/overview",
|
142 |
+
# hx_target="#inner-text",
|
143 |
+
# ),
|
144 |
Div(
|
145 |
A("Global Processing Steps", href="#inner-text"),
|
146 |
hx_get="/common",
|
|
|
210 |
)
|
211 |
|
212 |
|
213 |
+
dataset_comparison1 = pd.DataFrame(
|
214 |
+
{
|
215 |
+
"Dataset": [
|
216 |
+
"TxT360",
|
217 |
+
"FineWeb",
|
218 |
+
"RefinedWeb",
|
219 |
+
"RedPajama-v2",
|
220 |
+
"C4",
|
221 |
+
"Dolma",
|
222 |
+
"RedPajama-v1",
|
223 |
+
"The Pile",
|
224 |
+
],
|
225 |
+
"CommonCrawl": [
|
226 |
+
"99 Snapshots",
|
227 |
+
"96 Snapshots",
|
228 |
+
"90 Snapshots",
|
229 |
+
"84 Snapshots",
|
230 |
+
"1 Snapshots",
|
231 |
+
"24 Snapshots",
|
232 |
+
"5 Snapshots",
|
233 |
+
"0.6% of 74 Snapshots",
|
234 |
+
],
|
235 |
+
"Papers": [
|
236 |
+
"5 Sources",
|
237 |
+
"-",
|
238 |
+
"-",
|
239 |
+
"-",
|
240 |
+
"-",
|
241 |
+
"1 Source",
|
242 |
+
"1 Source",
|
243 |
+
"4 Sources",
|
244 |
+
],
|
245 |
+
"Wikipedia": [
|
246 |
+
"310+ Languages",
|
247 |
+
"-",
|
248 |
+
"-",
|
249 |
+
"-",
|
250 |
+
"-",
|
251 |
+
"what does a check mark mean?",
|
252 |
+
"what does a check mark mean?",
|
253 |
+
"English Only",
|
254 |
+
],
|
255 |
+
"FreeLaw": [
|
256 |
+
"Included",
|
257 |
+
"-",
|
258 |
+
"-",
|
259 |
+
"-",
|
260 |
+
"-",
|
261 |
+
"-",
|
262 |
+
"-",
|
263 |
+
"Included",
|
264 |
+
],
|
265 |
+
"DM Math": [
|
266 |
+
"Included",
|
267 |
+
"-",
|
268 |
+
"-",
|
269 |
+
"-",
|
270 |
+
"-",
|
271 |
+
"-",
|
272 |
+
"-",
|
273 |
+
"Included",
|
274 |
+
],
|
275 |
+
"USPTO": [
|
276 |
+
"Included",
|
277 |
+
"-",
|
278 |
+
"-",
|
279 |
+
"-",
|
280 |
+
"-",
|
281 |
+
"-",
|
282 |
+
"-",
|
283 |
+
"Included",
|
284 |
+
],
|
285 |
+
|
286 |
+
}
|
287 |
+
)
|
288 |
+
|
289 |
+
table_html = dataset_comparison1.to_html(index=False, border=0)
|
290 |
+
table_div_1 = Div(NotStr(table_html), style="margin: 40px;")
|
291 |
+
|
292 |
+
dataset_comparison2 = pd.DataFrame(
|
293 |
+
{
|
294 |
+
"Dataset": [
|
295 |
+
"TxT360",
|
296 |
+
"FineWeb",
|
297 |
+
"RefinedWeb",
|
298 |
+
"RedPajama-v2",
|
299 |
+
"C4",
|
300 |
+
"Dolma",
|
301 |
+
"RedPajama-v1",
|
302 |
+
"The Pile",
|
303 |
+
],
|
304 |
+
|
305 |
+
"PG-19": [
|
306 |
+
"Included",
|
307 |
+
"-",
|
308 |
+
"-",
|
309 |
+
"-",
|
310 |
+
"-",
|
311 |
+
"Included",
|
312 |
+
"Included",
|
313 |
+
"Included",
|
314 |
+
],
|
315 |
+
"HackerNews": [
|
316 |
+
"Included",
|
317 |
+
"-",
|
318 |
+
"-",
|
319 |
+
"-",
|
320 |
+
"-",
|
321 |
+
"-",
|
322 |
+
"-",
|
323 |
+
"Included",
|
324 |
+
],
|
325 |
+
"Ubuntu IRC": [
|
326 |
+
"Included",
|
327 |
+
"-",
|
328 |
+
"-",
|
329 |
+
"-",
|
330 |
+
"-",
|
331 |
+
"-",
|
332 |
+
"-",
|
333 |
+
"Included",
|
334 |
+
],
|
335 |
+
"EuroParl": [
|
336 |
+
"Included",
|
337 |
+
"-",
|
338 |
+
"-",
|
339 |
+
"-",
|
340 |
+
"-",
|
341 |
+
"-",
|
342 |
+
"-",
|
343 |
+
"Included",
|
344 |
+
],
|
345 |
+
"StackExchange": [
|
346 |
+
"Included",
|
347 |
+
"-",
|
348 |
+
"-",
|
349 |
+
"-",
|
350 |
+
"-",
|
351 |
+
"-",
|
352 |
+
"Included",
|
353 |
+
"Included",
|
354 |
+
],
|
355 |
+
"Code": [
|
356 |
+
"- what is this?",
|
357 |
+
"-",
|
358 |
+
"-",
|
359 |
+
"-",
|
360 |
+
"-",
|
361 |
+
"Included",
|
362 |
+
"Included",
|
363 |
+
"Included",
|
364 |
+
],
|
365 |
+
}
|
366 |
+
)
|
367 |
+
|
368 |
+
table_html2 = dataset_comparison2.to_html(index=False, border=0)
|
369 |
+
table_div_2 = Div(NotStr(table_html2), style="margin: 40px;")
|
370 |
+
|
371 |
+
dataset_sources = pd.DataFrame(
|
372 |
+
{
|
373 |
+
"Data Source": [
|
374 |
+
"CommonCrawl",
|
375 |
+
"Papers",
|
376 |
+
"Wikipedia",
|
377 |
+
"Freelaw",
|
378 |
+
"DM Math",
|
379 |
+
"USPTO",
|
380 |
+
"PG-19",
|
381 |
+
"HackerNews",
|
382 |
+
"Ubuntu IRC",
|
383 |
+
"Europarl",
|
384 |
+
"StackExchange",
|
385 |
+
],
|
386 |
+
"Raw Data Size": [
|
387 |
+
"11 TB",
|
388 |
+
"712 GB",
|
389 |
+
"210 GB",
|
390 |
+
"23 GB",
|
391 |
+
"22 GB",
|
392 |
+
"45 GB",
|
393 |
+
"11 GB",
|
394 |
+
"4.1 GB",
|
395 |
+
"4.7 GB",
|
396 |
+
"6.1 GB",
|
397 |
+
"45 GB",
|
398 |
+
],
|
399 |
+
"Token Count": [
|
400 |
+
"5.71T",
|
401 |
+
"154.96B",
|
402 |
+
"4.75B",
|
403 |
+
"7.34B",
|
404 |
+
"5.23B",
|
405 |
+
"4.95B",
|
406 |
+
"2.94B",
|
407 |
+
"1.08B",
|
408 |
+
"1.54B",
|
409 |
+
"1.96B",
|
410 |
+
"8.37B",
|
411 |
+
],
|
412 |
+
"Cut-Off Date": [
|
413 |
+
"2024-30",
|
414 |
+
"Q4 2023",
|
415 |
+
"-",
|
416 |
+
"Q1 2024",
|
417 |
+
"-",
|
418 |
+
"Q4 2023",
|
419 |
+
"-",
|
420 |
+
"Q4 2023",
|
421 |
+
"Q4 2023",
|
422 |
+
"-",
|
423 |
+
"Q4 2023",
|
424 |
+
],
|
425 |
+
}
|
426 |
+
)
|
427 |
+
|
428 |
+
table_html_data = dataset_sources.to_html(index=False, border=0)
|
429 |
+
table_div_data = Div(NotStr(table_html_data), style="margin: 40px;")
|
430 |
+
|
431 |
+
|
432 |
+
def get_curated_chart():
    """Build the treemap of curated data sources grouped by category.

    Each source is a leaf nested under its category; tile area is driven by
    ``Count``, and the hover box shows the source description together with
    its share of the total count.

    Returns:
        The Plotly treemap figure, so callers can embed it in a page.
        (Previously the function returned ``None``.)
    """
    # Parallel lists: index i of every list describes the same source.
    # NOTE(review): the counts are round numbers and look like placeholders --
    # confirm against the real per-source statistics.
    data = {
        'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
        'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
        'Count': [100, 200, 150, 120, 80, 90, 300, 250, 180, 150, 150, 250, 180, 120, 90],
        'Details': [
            'A repository of scientific papers in various disciplines, including computer science, physics, mathematics, and more.',
            'A database of biomedical and life sciences research articles.',
            'Abstracts of biomedical literature from various sources.',
            'Full-text articles from the Semantic Scholar Open Research Corpus.',
            'Abstracts of articles from the Semantic Scholar Open Research Corpus.',
            'Papers from the PhilPapers database, a comprehensive index and bibliography of philosophy research.',
            'A collaborative online encyclopedia that covers a wide range of topics.',
            'A network of question-and-answer websites on various subjects, including programming, science, mathematics, and more.',
            'A collection of multilingual parallel corpora of parliamentary debates from the European Parliament.',
            'Chat logs from the Ubuntu Internet Relay Chat (IRC) channels.',
            'Legal documents and court cases from various jurisdictions.',
            'A collection of books from Project Gutenberg, a digital library of public domain works.',
            'Patent documents from the United States Patent and Trademark Office.',
            'User-generated news and discussion platform focused on technology and startups.',
            'Deep Mind Maths dataset with generated questions.',
        ],
    }

    # Share of the total count contributed by each source, in percent.
    total_count = sum(data['Count'])
    data['Percentage'] = [count / total_count * 100 for count in data['Count']]

    # Two-level hierarchy: category -> source.
    fig = px.treemap(
        data,
        path=['Category', 'Source'],
        values='Count',
        hover_data=['Details', 'Percentage'],
        hover_name='Source',
    )

    # Fixed chart size.
    fig.update_layout(width=800, height=600)

    # NOTE(review): `st` (presumably streamlit) is not imported in the visible
    # part of this file -- confirm it is in scope, otherwise this line raises
    # NameError before the figure can be returned.
    st.plotly_chart(fig)

    return fig
|
468 |
+
|
469 |
+
# Overview page body, in three sections:
#   1. motivation for TxT360, followed by the comparison tables
#      (table_div_1, table_div_2) and the statistics table (table_div_data);
#   2. the data-processing approach, with the pipeline figure;
#   3. global deduplication highlights.
# Only spelling/grammar of the displayed copy was corrected; element order,
# styling, and the placeholder citation notes are unchanged.
overview_div = Div(
    Section(
        H2("Overview"),
        H3("What This Section Contains"),
        Ul(
            Li("Motivation Behind TxT360", style="margin-bottom: 5px"),
            Li("The High-Level Data Processing Approach", style="margin-bottom: 5px"),
            Li("Introduction to Global Deduplication", style="margin-bottom: 5px"),
        ),
        H2("Motivation Behind TxT360"),
        H3("TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."),
        P("The quality and size of a pre-training dataset play a crucial role in the performance of large language models (LLMs). Data is often referred to as low quality if it has not been filtered to remove unwanted text. The community has introduced a variety of filtered datasets including purely web-based datasets."),
        P("In pretraining, it is common to combine web data and curated sources (cite). Web data is included to provide a vast quantity of long tail and diverse data, while curated datasets are often information rich and provide the 'deep-dive' domain information. Both datasets play critical roles in effective LLM pre-training."),
        H4("The Gap TxT360 Fills"),
        P("Despite advancements in filtering and source material for both data types, each type of dataset has its limitations. RefinedWeb is known for its high quality content, but only about 10% of the entire dataset has been disclosed and the processing scripts have not been released. For datasets that have combined curated sources with web data, the web component is relatively small (NEED TO UPDATE - citation needed)."),
        P("By integrating the extensive reach of web data with the exceptional quality of curated sources, TxT360 meets and surpasses the rigorous standards required for state-of-the-art LLM pre-training as demonstrated in the Results section."),

        P("Table 1: The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
        table_div_1,
        table_div_2,
        P("Table 2: Basic TxT360 Statistics."),
        table_div_data,
    ),
    Section(
        H2("Our Generalizable Approach to Data Processing"),
        P("To produce TxT360, a comprehensive and transparent data processing pipeline was designed to account for the nuances of both web and curated datasets. The pipeline presents a unified framework for processing both data types, making it convenient and easily adaptive for users to revise and fine-tune the pipeline for their own use cases."),
        P("Web datasets are inherently noisy and varied. Thus, the pipeline includes sophisticated filtering and deduplication techniques to clean the data and remove redundancies or irrelevant information."),
        P("Curated datasets are typically already structured and consistently formatted. TxT360 filters these sources with selective steps to maintain their integrity while providing seamless integration into the larger dataset."),
        P("We will open-source the scripts for the whole pipeline, allowing the community to review, replicate, and build upon our processes."),
        Img(src="images/pipeline.png", height="300", width="600"),
        P("Figure 1: Data processing pipeline. All the steps are adopted for processing web data while the yellow blocks are adopted for processing curated sources."),
    ),
    Section(
        H2("Introducing Global Deduplication"),
        P("Deduplication is crucial in language model pre-training for several reasons (NEEDS CITATION). The main hypotheses on why deduplication is necessary and helpful include:"),
        Ul(
            Li("reducing data volume reduces training time (and cost)", style="margin-bottom: 5px"),
            Li("removing duplicative data can lead to better accuracy", style="margin-bottom: 5px"),
            Li("prevent train-test overlap", style="margin-bottom: 5px"),
            Li("minimizes the risk of memorization leading to test loss", style="margin-bottom: 5px"),
        ),
        P("Furthermore, controlling the pretraining data distribution through deduplication and selective upsampling avoids relying on the often inconsistent distribution of internet-sourced data."),
        H3("Highlights of the TxT360 Deduplication Process"),
        P("Our deduplication process began with 61.8 TB of filtered and compressed documents, totaling approximately 48.83 billion documents. We first performed exact deduplication using a Bloom filter, reducing the dataset by 17% to 40.21 billion documents. For global near-deduplication, we scaled methodologies from prior works like SlimPajama to handle the entire dataset, including 87 Common Crawl dumps and other curated data. This involved generating document signatures, matching them to identify near-duplicates, and clustering these to retain only one document per cluster. We prioritized retaining documents from curated sources and more recent CommonCrawl dumps. Throughout the deduplication stages, we extensively used Dask for distributed data processing."),
        P("After the global near-deduplication of all 87 CommonCrawl dumps and other curated data, we removed around 85% of the total documents. This leaves us with approximately 4.24 trillion deduplicated tokens, which aligns with what FineWeb has reported for their iterative global deduplication. Along with the list of duplicated documents to delete, our deduplication code also saves some metadata about the duplicate clusters that we find. We save statistics about every duplicate cluster we find, with the document ID of the document we retain from the cluster as the key and with a value capturing the distribution of the duplicates within the cluster over the CommonCrawl dumps (identified by the first 2 digits of every document ID). This way, we always have information about the duplicates we have deleted, allowing us to upsample any data distribution we want for training."),
        P("During deduplication, it is not feasible to store all the duplicate clusters we form, but we do save some samples at every size. Here are some observations we made by examining these sample duplicate clusters:"),
        Ul(
            Li("Smaller components tend to have more overlap in their MinHash bands. The smallest components, which are essentially pairs, consist of exact duplicate documents that local exact deduplication missed."),
            Li("When clusters contain three or more documents, incremental changes in the text become apparent. For example, there may be a growing list of personnel over the years."),
            Li("In sizable clusters comprising 1,000 or more documents, we observe a trend towards templatization. This involves the recurrent use of standardized language to convey general topics such as terms and conditions, warnings, and disclaimers. Such language is prevalent on commercial websites, offering a consistent and efficient way to communicate commonly encountered information."),
        ),
        P("All Deduplication details can be found in the Global Data Processing Steps section."),
    ),
)
|
523 |
+
|
524 |
+
@app.get("/intro")
def intro():
    """Return the introduction page content for the ``/intro`` route.

    The page is a container with id ``inner-text`` holding the
    "About TxT360" section (id ``section1``) followed by the shared
    module-level ``overview_div``.
    """
    about_section = Section(
        H2("About TxT360"),
        P("We introduce Trillion eXtracted Text (TxT360): the highest quality, most comprehensive, and production ready pretraining dataset released to date, an open source pre training textual dataset with the widest coverage to date, including all 99 Common Crawl snapshots released to date and 14 widely used curated sources such as Wikipedia, StackExchange, and ArXiv."),
        P("Building on top of the prior study on pre-training data, (cite C4, chinchilla, fineweb, refinedweb, 2305.16264, ) TxT360 carefully implements data processing steps such as extraction, filtering, deduplication, PII removal and so on. Notably, we design a schema to provide users with full control on the weights of each data point, instead of relying on unknown distribution from the raw dataset. Specifically, we strive to make each data point appear only once, via global deduplication across the whole corpus. Importantly, we store enough metadata to recover the raw distribution, and additionally allow one to have fine-grained control to create data distributions and corpus of desired size. As an example, we present one simple upsampling scheme that takes into account the duplication counts, resulting in a 15~16 trillion token corpus, outperforming FineWeb and our non-upsampling baselines, on diverse evaluations. Unlike DCLM and RedPajama V2, we present the final deduplicated dataset that is ready to go."),
        P("We have carefully implemented a large set of filters by carefully reviewing the input, output from data samples and tailored design for each data source. We learned from and compared our implementation with prior data pipelines, such as Chinchilla, C4, DataTrove, Dolma, and RedPajama Data V2 to make carefully considered decisions."),
        P("In light of the 360 open source spirit, we will open source the code and document all the implementation details in this blog post, along with the examples and why each decision is made, hoping to shed some light on this important but tedious step."),
        id="section1",
    )
    return Div(about_section, overview_div, id="inner-text")
|
541 |
|
542 |
|
543 |
+
# Route wiring: each call applies `rt(<path>)` to a handler from another module.
# NOTE(review): presumably `rt` is the app's route decorator, so this binds the
# handler to that URL path -- confirm against where `rt` is defined.
# The "/overview" route is currently disabled.
#rt("/overview")(overview.overview)
rt("/curated")(curated.curated)
# "{target}" is a path placeholder in the route string passed to `rt`.
rt("/curated/{target}")(curated.update)
|
546 |
|
results.py
CHANGED
@@ -125,47 +125,42 @@ fig.update_layout(
|
|
125 |
|
126 |
Perplexity_Across_Different_Buckets_global_graph = fig
|
127 |
|
128 |
-
|
129 |
-
|
130 |
-
# The data you provided
|
131 |
-
DATA = [
|
132 |
-
["2014", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [17.410227605477868, 16.11176217183986, 15.632757662414805, 15.446116676532212, 16.716943171826703, 18.156821563322765]]],
|
133 |
-
["2015", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [17.446573602753478, 16.14852530113782, 15.627408549576069, 15.0055028132117, 15.565430373421485, 17.314701050452452]]],
|
134 |
-
["2016", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [17.307221780905284, 16.297702171159543, 15.948641884223639, 14.799690714225637, 14.935989931859659, 16.09585768919658]]],
|
135 |
-
["2017", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [17.338525603992114, 15.960924352297502, 15.912187993988933, 14.822102470001267, 14.778913482337416, 15.428145290012955]]],
|
136 |
-
["2018", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [17.08551151136689, 16.187802102106698, 14.935072408852303, 14.832038213200583, 14.508674264491997, 14.800605964649103]]],
|
137 |
-
["2019", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [16.818363305107052, 16.474269837858706, 14.944741674400241, 14.568394784374943, 14.690158822673334, 15.990949424635108]]],
|
138 |
-
["2020", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [16.98821894111693, 15.936494557783181, 14.79960386342691, 14.435682562274105, 14.58651834886038, 15.869365567783806]]],
|
139 |
-
["2021", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [17.125795647512877, 15.780419457145868, 14.631430892394002, 14.276477514399625, 14.337146941773641, 15.872474774329305]]],
|
140 |
-
["2022", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [16.573462144306383, 15.283018703313582, 14.378277745163881, 14.0611924390084, 13.9886330091318, 15.769421394877273]]],
|
141 |
-
["2023", [["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"], [15.4293630385597, 14.608379914730168, 14.118271697056592, 13.880215644749589, 13.767106666731275, 15.05749135510839]]]
|
142 |
-
]
|
143 |
|
144 |
-
#
|
145 |
-
years = [
|
146 |
-
|
147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
|
149 |
-
# Create
|
150 |
fig = go.Figure()
|
151 |
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
fig.add_trace(go.Scatter(x=years, y=values, mode='lines+markers', name=range_label))
|
156 |
|
157 |
# Update layout
|
158 |
fig.update_layout(
|
159 |
-
title="Perplexity
|
160 |
xaxis_title="Year",
|
161 |
-
yaxis_title="Perplexity",
|
162 |
legend_title="Buckets",
|
163 |
hovermode="x unified"
|
164 |
)
|
165 |
|
166 |
-
# Show
|
167 |
-
|
168 |
-
Perplexity_Across_Different_years_graph = fig
|
169 |
|
170 |
#graph 3 tbd
|
171 |
|
@@ -794,13 +789,15 @@ perp1_div = Div(
|
|
794 |
H3("Perplexity vs Years"),
|
795 |
P("Taking the same data, we can convert it into a graph indicating the yearly trend. For most buckets, the average perplexity of dumps from more recent years seem to be lower than that of former years."),
|
796 |
Img(src="images/prep-across-diff-year-global-dup-buckets.png", height = "300", width = "600" ),
|
797 |
-
plotly2fasthtml(
|
|
|
798 |
),
|
799 |
Section(
|
800 |
H3("Perplexity vs Document Duplication"),
|
801 |
P("We can also break each bucket into distinct document counts. The graph becomes a bit noisy at the end because of insufficient samples with larger duplication counts."),
|
802 |
Img(src="images/prep-across-diff-docs-dup-count-global.png", height = "300", width = "600" ),
|
803 |
plotly2fasthtml(graph3),
|
|
|
804 |
),
|
805 |
Section(
|
806 |
H3("Perplexity vs Dump Duplication"),
|
|
|
125 |
|
126 |
Perplexity_Across_Different_Buckets_global_graph = fig
|
127 |
|
128 |
+
##graph 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
|
130 |
+
# Data
|
131 |
+
# Year labels "2014" .. "2023", used as dict keys and x-axis values.
years = [str(year) for year in range(2014, 2024)]
# Duplication-count buckets; each becomes one line in the chart.
buckets = ["1-1", "2-5", "6-10", "11-100", "101-1000", "1001-30000000"]
# Average perplexity per year; every list is ordered to match `buckets`.
data = {
    "2014": [17.410227605477868, 16.11176217183986, 15.632757662414805, 15.446116676532212, 16.716943171826703, 18.156821563322765],
    "2015": [17.446573602753478, 16.14852530113782, 15.627408549576069, 15.0055028132117, 15.565430373421485, 17.314701050452452],
    "2016": [17.307221780905284, 16.297702171159543, 15.948641884223639, 14.799690714225637, 14.935989931859659, 16.09585768919658],
    "2017": [17.338525603992114, 15.960924352297502, 15.912187993988933, 14.822102470001267, 14.778913482337416, 15.428145290012955],
    "2018": [17.08551151136689, 16.187802102106698, 14.935072408852303, 14.832038213200583, 14.508674264491997, 14.800605964649103],
    "2019": [16.818363305107052, 16.474269837858706, 14.944741674400241, 14.568394784374943, 14.690158822673334, 15.990949424635108],
    "2020": [16.98821894111693, 15.936494557783181, 14.79960386342691, 14.435682562274105, 14.58651834886038, 15.869365567783806],
    "2021": [17.125795647512877, 15.780419457145868, 14.631430892394002, 14.276477514399625, 14.337146941773641, 15.872474774329305],
    "2022": [16.573462144306383, 15.283018703313582, 14.378277745163881, 14.0611924390084, 13.9886330091318, 15.769421394877273],
    "2023": [15.4293630385597, 14.608379914730168, 14.118271697056592, 13.880215644749589, 13.767106666731275, 15.05749135510839],
}

# One line per bucket: transpose the year -> values table so that each
# bucket gets its series of values across the years, in `years` order.
fig = go.Figure()

for bucket, bucket_values in zip(buckets, zip(*(data[year] for year in years))):
    fig.add_trace(
        go.Scatter(
            x=years,
            y=list(bucket_values),
            mode='lines+markers',
            name=bucket,
        )
    )

# Titles, legend label, and a single hover box per x position.
fig.update_layout(
    title="Average Perplexity Over Years by Bucket",
    xaxis_title="Year",
    yaxis_title="Average Perplexity",
    legend_title="Buckets",
    hovermode="x unified",
)

# Keep a stable handle for embedding in the page.
graph2 = fig
|
|
|
164 |
|
165 |
#graph 3 tbd
|
166 |
|
|
|
789 |
H3("Perplexity vs Years"),
|
790 |
P("Taking the same data, we can convert it into a graph indicating the yearly trend. For most buckets, the average perplexity of dumps from more recent years seem to be lower than that of former years."),
|
791 |
Img(src="images/prep-across-diff-year-global-dup-buckets.png", height = "300", width = "600" ),
|
792 |
+
plotly2fasthtml(graph2),
|
793 |
+
P("NEED TO UPDATE - THIS GRAPH SHOULD MATCH THE IMAGE ABOVE AND YEAR SHOULD NOT BE a LINE OPTION"),
|
794 |
),
|
795 |
Section(
|
796 |
H3("Perplexity vs Document Duplication"),
|
797 |
P("We can also break each bucket into distinct document counts. The graph becomes a bit noisy at the end because of insufficient samples with larger duplication counts."),
|
798 |
Img(src="images/prep-across-diff-docs-dup-count-global.png", height = "300", width = "600" ),
|
799 |
plotly2fasthtml(graph3),
|
800 |
+
P("NEED TO UPDATE - THIS GRAPH SHOULD MATCH THE IMAGE ABOVE AND BUCKET SHOULD NOT BE a LINE OPTION"),
|
801 |
),
|
802 |
Section(
|
803 |
H3("Perplexity vs Dump Duplication"),
|