victormiller committed
Commit 6a336ca
Parent(s): 66a1161
Update curated.py

curated.py CHANGED (+13 -13)
@@ -539,7 +539,7 @@ data_preprocessing_div = Div(
539          P(
540              "The ",
541              B("Minimum Word Count Filter"),
542 -            " sets a threshold for required words within a document. This step filters out low-quality or incomplete documents. However, this step may remove documents that contain valuable information so a proper analysis is important for each
542 +            " sets a threshold for the number of words required within a document. This step filters out low-quality or incomplete documents. However, it may also remove documents that contain valuable information, so a proper analysis is important for each data source.",
543          ),
544          P(
545              "The ",
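For context, the Minimum Word Count Filter amounts to a word-count threshold check. A minimal sketch, assuming whitespace tokenization; the function name and the threshold of 10 (which matches the Wikipedia filter later in this diff) are illustrative, not taken from curated.py:

def passes_min_word_count(text: str, min_words: int = 10) -> bool:
    # Whitespace tokenization; documents under the threshold are dropped.
    return len(text.split()) >= min_words

docs = ["too short", "this sample sentence clearly contains more than ten words in total"]
kept = [d for d in docs if passes_min_word_count(d)]  # keeps only the second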
@@ -570,7 +570,7 @@ data_preprocessing_div = Div(
570          P(
571              "The ",
572              B("Paragraph Count Filter"),
573 -            " counts the number of paragraphs in each document. This step helps to analyze the structure and length of documents which can be a useful
573 +            " counts the number of paragraphs in each document. This step helps to analyze the structure and length of documents, which can be a useful heuristic for document complexity.",
574          ),
575          P(
576              "The ",
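The Paragraph Count Filter can be sketched as a blank-line split; the separator convention is an assumption, since the diff does not show the implementation:

import re

def paragraph_count(text: str) -> int:
    # Treat blank lines as paragraph separators and ignore empty chunks.
    return len([p for p in re.split(r"\n\s*\n", text) if p.strip()])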
@@ -659,7 +659,7 @@ filtering_process = Div(
659          ),
660          P(
661              B("Filtering: "),
662 -            "Manual inspection of the dataset
662 +            "Manual inspection of the dataset demonstrated high-quality content, so only one filter was used: based on normal sentence constructs, an article was kept if it contained 10 or more words and removed otherwise.",
663          ),
664          table_div_wikipedia,
665          Details(
@@ -694,10 +694,10 @@ filtering_process = Div(
694              ),
695              ". All markdown files were combined to create jsonl files.",
696          ),
697 -        P(B("Unique Data
697 +        P(B("Unique Data Preparation Challenges: ")),
698          Ul(
699              Li(
700 -                "Due to large amounts of meaningful data being contained in table formats,
700 +                "Because large amounts of meaningful data are contained in table formats, special consideration was taken to extract the data and its proper metadata.",
701                  style="margin-bottom: -3px",
702              ),
703          ),
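The hunk above mentions combining markdown files into jsonl files. A sketch of what that combination might look like; the directory layout and the id/text field names are assumptions:

import json
from pathlib import Path

def markdown_dir_to_jsonl(md_dir: str, out_path: str) -> None:
    # One markdown file becomes one JSON object per line.
    with open(out_path, "w", encoding="utf-8") as out:
        for md_file in sorted(Path(md_dir).glob("*.md")):
            record = {"id": md_file.stem, "text": md_file.read_text(encoding="utf-8")}
            out.write(json.dumps(record, ensure_ascii=False) + "\n")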
@@ -715,7 +715,7 @@ filtering_process = Div(
715                  style="margin-bottom: -3px",
716              ),
717              Li(
718 -                "Unigram Log
718 +                "Unigram Log Probability Filter Threshold: -20",
719                  style="margin-bottom: -3px",
720              ),
721              Li(
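The Unigram Log Probability Filter with threshold -20 presumably scores each document under a unigram language model. A sketch under that assumption; whether the score is a mean or a sum per document, and the log base, are not specified in this diff:

import math
from collections import Counter

def mean_unigram_logprob(tokens: list, counts: Counter, total: int) -> float:
    # Average natural-log probability of the document's tokens.
    logps = [math.log(counts[t] / total) for t in tokens if counts[t] > 0]
    return sum(logps) / max(len(logps), 1)

def passes_unigram_filter(tokens: list, counts: Counter, total: int) -> bool:
    return mean_unigram_logprob(tokens, counts, total) >= -20.0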
@@ -859,7 +859,7 @@ filtering_process = Div(
859              D_code("pandoc -f jats {nxml} -o {pmcid}.md", language="bash"),
860              ". The markdown files are combined to create jsonl files. PubMed Abstract (PMA) files were downloaded in XML. The BeautifulSoup library was used to extract the abstract, title, and PMID. All files were stored in jsonl format.",
861          ),
862 -        P(B("Unique Data
862 +        P(B("Unique Data Preparation Challenges: ")),
863          Ul(
864              Li(
865                  "Because large amounts of meaningful data are contained in table formats, special consideration was taken to extract the data and its proper metadata.",
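A sketch of the BeautifulSoup step described for PubMed Abstract files; the tag names follow the standard PubMed XML schema, which is an assumption as far as this diff goes:

from bs4 import BeautifulSoup

def extract_pma(xml_text: str) -> dict:
    soup = BeautifulSoup(xml_text, "xml")  # the "xml" parser requires lxml

    def text_of(tag: str):
        node = soup.find(tag)
        return node.get_text() if node else None

    # PMID, title, and abstract, as listed in the hunk above.
    return {"pmid": text_of("PMID"), "title": text_of("ArticleTitle"),
            "abstract": text_of("AbstractText")}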
@@ -1112,7 +1112,7 @@ filtering_process = Div(
1112         P(
1113             "The HackerNews dataset contains a vast number of stories and is known for lively discussions. Because of the number of replies a story may contain, only the longest comment thread for each story was sampled past level 3. All stories included the title (1st level) and all direct replies (2nd level)."
1114         ),
1115 -       P(B("Unique Data
1115 +       P(B("Unique Data Preparation Challenges: ")),
1116         Ul(
1117             Li(
1118                 "As discussed above, the comment hierarchies required a thoughtful approach to extracting meaningful data. ",
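The HackerNews sampling rule above can be read as: keep the title and all direct replies in full, then follow only the longest reply chain deeper down. A sketch under that reading, assuming each comment is a dict with a "children" list; both the structure and the exact cutoff are assumptions:

def thread_depth(comment: dict) -> int:
    # Depth of the deepest reply chain rooted at this comment.
    return 1 + max((thread_depth(c) for c in comment.get("children", [])), default=0)

def sample_story(node: dict, level: int = 1) -> dict:
    # Level 1 is the title, level 2 the direct replies; below that,
    # keep only the longest thread.
    children = node.get("children", [])
    if level >= 2 and children:
        children = [max(children, key=thread_depth)]
    return {**node, "children": [sample_story(c, level + 1) for c in children]}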
@@ -1190,7 +1190,7 @@ filtering_process = Div(
1190         P(
1191             "All content was downloaded, leading to a high number of documents being filtered during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."
1192         ),
1193 -       P(B("Unique Data
1193 +       P(B("Unique Data Preparation Challenges: ")),
1194         Ul(
1195             Li(
1196                 "Consecutive whitespaces and tabs were found; these were reduced to one single whitespace.",
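The whitespace normalization in the list item above is essentially a one-line regex; a minimal sketch:

import re

def collapse_whitespace(text: str) -> str:
    # Collapse runs of spaces and tabs into a single space.
    return re.sub(r"[ \t]+", " ", text)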
@@ -1261,7 +1261,7 @@ filtering_process = Div(
1261             block="block",
1262             language="python",
1263         ),
1264 -       P(B("Unique Data
1264 +       P(B("Unique Data Preparation Challenges: ")),
1265         Ul(
1266             Li(
1267                 "Handling code blocks required finding the specific blocks and extracting the details into one snippet.",
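For the code-block challenge noted above, a sketch of pulling fenced code blocks out of markdown-style text; the triple-backtick convention is an assumption:

import re

def extract_code_blocks(text: str) -> list:
    # Capture each fence body, skipping the language tag on the opening line.
    return re.findall(r"```[^\n]*\n(.*?)```", text, flags=re.DOTALL)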
@@ -1328,7 +1328,7 @@ filtering_process = Div(
1328             block="block",
1329             language="python",
1330         ),
1331 -       P(B("Unique Data
1331 +       P(B("Unique Data Preparation Challenges: ")),
1332         Ul(
1333             Li(
1334                 "Similar to the HackerNews challenges, we had to map comments and sub-comments to the original question.",
@@ -1366,7 +1366,7 @@ filtering_process = Div(
1366             block="block",
1367             language="python",
1368         ),
1369 -       P(B("Unique Data
1369 +       P(B("Unique Data Preparation Challenges: ")),
1370         Ul(
1371             Li(
1372                 "A byte string was included at the beginning of new lines",
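For the byte-string artifact noted above, a sketch that strips a leading b' or b" marker from a line; the exact shape of the artifact is an assumption:

import re

def strip_byte_prefix(line: str) -> str:
    # Remove a leading b' or b" left behind when bytes were stringified.
    return re.sub(r'^b[\'"]', "", line)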
@@ -1409,7 +1409,7 @@ filtering_process = Div(
1409         ),
1410         ".",
1411         ),
1412 -       P(B("Unique Data
1412 +       P(B("Unique Data Preparation Challenges: ")),
1413         Ul(
1414             Li(
1415                 "Consecutive whitespaces were found spanning 10+ whitespace entries. These were reduced to one single whitespace.",