victormiller
commited on
Update curated.py
Browse files- curated.py +45 -7
curated.py
CHANGED
@@ -551,6 +551,9 @@ filtering_process = Div(
|
|
551 |
H3("ArXiv"),
|
552 |
P("The ArXiv dataset is a vast collection of preprint research papers primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from latex and converted it into a text format."),
|
553 |
P(B("Download and Extraction: "),"All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
|
|
|
|
|
|
|
554 |
P(B(" Filters Applied: "), "multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset (citation needed)"),
|
555 |
Ul(
|
556 |
Li("Language Filter: any language other than English are discarded", style = "margin-bottom: -3px"),
|
@@ -639,6 +642,10 @@ filtering_process = Div(
|
|
639 |
Div(
|
640 |
H3("PubMed Central and PubMed Abstract"),
|
641 |
P(B("Download and Extraction: "), "All files were downloaded from", A("https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/",href="https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"),". PubMed Central (PMC) files are downloaded in an xml.tar format. The tar files are opened and converted to markdown format using pandoc", D_code("pandoc -f jats {nxml} -o {pmcid}.md", language="bash"),". The markdown files are combined to create jsonl files. PubMed Abstract (PMA) files were downloaded in xml. The BeautifulSoup library was used to extract the abstract, title, and PMID. All files were stored in jsonl format."),
|
|
|
|
|
|
|
|
|
642 |
P(B("Filters Applied: "), "Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset."),
|
643 |
Ul(
|
644 |
Li("Minimum Word Count Filter: PMC documents with less than 100 words (not inclusive) are discarded; PMA documents less than 20 words are discarded", style = "margin-bottom: -3px"),
|
@@ -699,7 +706,7 @@ filtering_process = Div(
|
|
699 |
),
|
700 |
Section(
|
701 |
Div(
|
702 |
-
H3("
|
703 |
P("A collection of multilingual parallel corpora of parliamentary debates from the European Parliament. This is a high-quality legacy dataset earlier used for translation tasks."),
|
704 |
P(B("Download and Extraction: "), "Original dataset was downloaded from", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"),". The files were converted to jsonl lines for filtering."),
|
705 |
P(B("Filters Applied: ") ,"EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained HTML tags which were removed."),
|
@@ -750,6 +757,11 @@ filtering_process = Div(
|
|
750 |
P("High-quality dialog-based dataset where user comments on the links as the head post aggregated by Y Combinator."),
|
751 |
P(B("Download and Extraction: "), "The dataset was downloaded from the HackerNews repo here:", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered subsequent story. Story IDs were considered between ID 1 to 37500000. The URL for all Story IDs was pinged. If that ID returned an error, the ID was removed. Each request was given a 2 second wait to account for network time."),
|
752 |
P("The HackerNews dataset contains a vast amount of stories and is known for lively discussions. Due to the number of replies a story may contain, only longest comment thread for each story was sampled past level 3. All stories included the title (1st level) and all direct replies (2nd level). Replies to the replies (3rd level) are only included for X STORIES."),
|
|
|
|
|
|
|
|
|
|
|
753 |
P(B("Filters Applied: ")),
|
754 |
Ul(
|
755 |
Li("Language Filter: English", style = "margin-bottom: -3px"),
|
@@ -779,15 +791,20 @@ filtering_process = Div(
|
|
779 |
P("Legal documents and court cases from various jurisdictions provided by US-registered non-profit firm Free Law Project. We have included data from CourtListener which included millions of legal opinions from federal and state courts."),
|
780 |
P(B("Download and Extraction"), "The dataset was downloaded from:", A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"), ". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function. The block below shows how each text type was extracted."),
|
781 |
D_code("""
|
782 |
-
("html", html2text),
|
783 |
-
("
|
784 |
-
("
|
785 |
-
("html_anon_2020", html2text),
|
786 |
-
("html_with_citations", html2text),
|
787 |
-
("xml_harvard", html2text),
|
788 |
plain_text
|
789 |
""", language ="SQL"),
|
790 |
P("All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
791 |
P(B("Filters Applied: ")),
|
792 |
Ul(
|
793 |
Li("Language Filter: English", style = "margin-bottom: -3px"),
|
@@ -828,6 +845,12 @@ filtering_process = Div(
|
|
828 |
8. Comment1:
|
829 |
9. Comment2:
|
830 |
"""),
|
|
|
|
|
|
|
|
|
|
|
|
|
831 |
P(B("Filters Applied: ")),
|
832 |
Ul(
|
833 |
Li("Minimum Word Count Filter: 10", style = "margin-bottom: -3px"),
|
@@ -866,6 +889,10 @@ filtering_process = Div(
|
|
866 |
def clean(x):
|
867 |
return '\n'.join('* ' + line[4:] if line.startswith('===') else line[8:] for line in x.split('\n'))
|
868 |
""", block="block", language="python" ),
|
|
|
|
|
|
|
|
|
869 |
P(B("Filters Applied: ")),
|
870 |
Ul(
|
871 |
Li("Language Filter: English", style = "margin-bottom: -3px"),
|
@@ -883,6 +910,11 @@ filtering_process = Div(
|
|
883 |
D_code("""
|
884 |
Question: TEXT
|
885 |
Answer: TEXT""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
886 |
P(B("Filters Applied: ")),
|
887 |
Ul(
|
888 |
Li("No filtering was applied to DM Math", style = "margin-bottom: -3px"),
|
@@ -908,6 +940,12 @@ filtering_process = Div(
|
|
908 |
H3("PG-19"),
|
909 |
P("A collection of books from Project Gutenberg, a digital library of public domain works. This contains all the books that were published before 1919."),
|
910 |
P(B("Download and Extraction: "), "The dataset was downloaded directly from Huggingface: ", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
|
|
|
|
|
|
|
|
|
|
|
|
|
911 |
P(B("Filters Applied:")),
|
912 |
Ul(
|
913 |
Li("Language Filter: English", style = "margin-bottom: -3px"),
|
|
|
551 |
H3("ArXiv"),
|
552 |
P("The ArXiv dataset is a vast collection of preprint research papers primarily in Mathematics, Computer Science, and Physics. Established in 1991, it offers high-quality text and mathematical knowledge, making it an invaluable resource for academic and scientific research. ArXiv papers are typically written in LaTeX, a popular typesetting system for these fields. We have extracted the information from latex and converted it into a text format."),
|
553 |
P(B("Download and Extraction: "),"All the data was downloaded in original latex format from Arxiv official S3 dump ", A("s3://arxiv/src", href="s3://arxiv/src"), ". We try to encode the downloaded data into utf-8 or guess encoding using chardet library. After that pandoc was used to extract information from the latex files and saved as markdown format", D_code("pandoc -s {tex} -o out/{out_name}.md --wrap=none", language="python"), ". All markdowns were combined to create jsonl files."),
|
554 |
+
Ul(
|
555 |
+
Li("Due to large amounts of meaningful data being contained in table formats, special consideration was taken to extract the data and proper metadata.", style = "margin-bottom: -3px"),
|
556 |
+
),
|
557 |
P(B(" Filters Applied: "), "multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset (citation needed)"),
|
558 |
Ul(
|
559 |
Li("Language Filter: any language other than English are discarded", style = "margin-bottom: -3px"),
|
|
|
642 |
Div(
|
643 |
H3("PubMed Central and PubMed Abstract"),
|
644 |
P(B("Download and Extraction: "), "All files were downloaded from", A("https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/",href="https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/"),". PubMed Central (PMC) files are downloaded in an xml.tar format. The tar files are opened and converted to markdown format using pandoc", D_code("pandoc -f jats {nxml} -o {pmcid}.md", language="bash"),". The markdown files are combined to create jsonl files. PubMed Abstract (PMA) files were downloaded in xml. The BeautifulSoup library was used to extract the abstract, title, and PMID. All files were stored in jsonl format."),
|
645 |
+
P(B("Unique Data Preparation Challenges: ")),
|
646 |
+
Ul(
|
647 |
+
Li("Due to large amounts of meaningful data being contained in table formats, special consideration was taken to extract the data and proper metadata.", style = "margin-bottom: -3px"),
|
648 |
+
),
|
649 |
P(B("Filters Applied: "), "Multiple filters are used here after manually verifying output of all the filters as suggested by peS2o dataset."),
|
650 |
Ul(
|
651 |
Li("Minimum Word Count Filter: PMC documents with less than 100 words (not inclusive) are discarded; PMA documents less than 20 words are discarded", style = "margin-bottom: -3px"),
|
|
|
706 |
),
|
707 |
Section(
|
708 |
Div(
|
709 |
+
H3("EuroParl"),
|
710 |
P("A collection of multilingual parallel corpora of parliamentary debates from the European Parliament. This is a high-quality legacy dataset earlier used for translation tasks."),
|
711 |
P(B("Download and Extraction: "), "Original dataset was downloaded from", A("http://www.statmt.org/europarl/v7/europarl.tgz", href="http://www.statmt.org/europarl/v7/europarl.tgz"),". The files were converted to jsonl lines for filtering."),
|
712 |
P(B("Filters Applied: ") ,"EuroParl was initially filtered during the download process. Documents with fewer than 200 characters were removed. The documents also contained HTML tags which were removed."),
|
|
|
757 |
P("High-quality dialog-based dataset where user comments on the links as the head post aggregated by Y Combinator."),
|
758 |
P(B("Download and Extraction: "), "The dataset was downloaded from the HackerNews repo here:", A("https://hacker-news.firebaseio.com/v0/item/", href="https://hacker-news.firebaseio.com/v0/item/"), ". The dataset was parsed using the Story ID. In this dataset each post is a story, and each reply is considered subsequent story. Story IDs were considered between ID 1 to 37500000. The URL for all Story IDs was pinged. If that ID returned an error, the ID was removed. Each request was given a 2 second wait to account for network time."),
|
759 |
P("The HackerNews dataset contains a vast amount of stories and is known for lively discussions. Due to the number of replies a story may contain, only longest comment thread for each story was sampled past level 3. All stories included the title (1st level) and all direct replies (2nd level). Replies to the replies (3rd level) are only included for X STORIES."),
|
760 |
+
P(B("Unique Data Preparation Challenges: ")),
|
761 |
+
Ul(
|
762 |
+
Li("As discussed above, the comment hierarchies required a thoughtful approach to extracting meaningful data. ", style = "margin-bottom: -3px"),
|
763 |
+
Li("In the comment thread hierarchy, relationships had to be assigned between the comments, sub-comments, and original story ID. ", style = "margin-bottom: -3px"),
|
764 |
+
),
|
765 |
P(B("Filters Applied: ")),
|
766 |
Ul(
|
767 |
Li("Language Filter: English", style = "margin-bottom: -3px"),
|
|
|
791 |
P("Legal documents and court cases from various jurisdictions provided by US-registered non-profit firm Free Law Project. We have included data from CourtListener which included millions of legal opinions from federal and state courts."),
|
792 |
P(B("Download and Extraction"), "The dataset was downloaded from:", A("https://storage.courtlistener.com/bulk-data/", href="https://storage.courtlistener.com/bulk-data/"), ". There are 19 CSV files which contain overlapping content. CSV files can contain content in multiple columns requiring a holistic extraction approach. Text was extracted from the following using html2text function. The block below shows how each text type was extracted."),
|
793 |
D_code("""
|
794 |
+
("html", html2text), ("html_lawbox", html2text),
|
795 |
+
("html_columbia", html2text), ("html_anon_2020", html2text),
|
796 |
+
("html_with_citations", html2text), ("xml_harvard", html2text),
|
|
|
|
|
|
|
797 |
plain_text
|
798 |
""", language ="SQL"),
|
799 |
P("All content was downloaded leading to high number of documents filtered during local deduplication. Following The Pile, priority was given to plain_text first, followed by the columns in the table in reverse order."),
|
800 |
+
P(B("Unique Data Preparation Challenges: ")),
|
801 |
+
Ul(
|
802 |
+
Li("Consecutive whitespaces and tabs were found. Consecutive whitespaces and tabs were reduced to a single whitespace.", style = "margin-bottom: -3px"),
|
803 |
+
Li("Whitespaces were found between new lines with no additional text. These whitespaces were removed.", style = "margin-bottom: -3px"),
|
804 |
+
Li("Consecutive new lines were found in some documents without leading to a new paragraph. All consecutive new lines were reduced to a single new line.", style = "margin-bottom: -3px"),
|
805 |
+
Li("Converted all single new lines to whitespace. If whitespace was found after a new line with no text, the whitespace was removed. All leading and trailing whitespace was removed.", style = "margin-bottom: -3px"),
|
806 |
+
Li("All \\f characters were removed.", style = "margin-bottom: -3px"),
|
807 |
+
),
|
808 |
P(B("Filters Applied: ")),
|
809 |
Ul(
|
810 |
Li("Language Filter: English", style = "margin-bottom: -3px"),
|
|
|
845 |
8. Comment1:
|
846 |
9. Comment2:
|
847 |
"""),
|
848 |
+
P(B("Unique Data Preparation Challenges: ")),
|
849 |
+
Ul(
|
850 |
+
Li("Handling code blocks required finding the specific blocks and extracting the details in one snippet.", style = "margin-bottom: -3px"),
|
851 |
+
Li("Question and Answer formatting had to be rewritten to match the question and the answer.", style = "margin-bottom: -3px"),
|
852 |
+
Li("Occasionally a title was not included at the beginning of a question. For consistent formatting, a title was added.", style = "margin-bottom: -3px"),
|
853 |
+
),
|
854 |
P(B("Filters Applied: ")),
|
855 |
Ul(
|
856 |
Li("Minimum Word Count Filter: 10", style = "margin-bottom: -3px"),
|
|
|
889 |
def clean(x):
|
890 |
return '\n'.join('* ' + line[4:] if line.startswith('===') else line[8:] for line in x.split('\n'))
|
891 |
""", block="block", language="python" ),
|
892 |
+
P(B("Unique Data Preparation Challenges: ")),
|
893 |
+
Ul(
|
894 |
+
Li("Similar to the HackerNews challenges, we had to map comments and sub-comments to the original question.", style = "margin-bottom: -3px"),
|
895 |
+
),
|
896 |
P(B("Filters Applied: ")),
|
897 |
Ul(
|
898 |
Li("Language Filter: English", style = "margin-bottom: -3px"),
|
|
|
910 |
D_code("""
|
911 |
Question: TEXT
|
912 |
Answer: TEXT""", block="block", language="python"),
|
913 |
+
P(B("Unique Data Preparation Challenges: ")),
|
914 |
+
Ul(
|
915 |
+
Li("A byte string was included at the beginning of new lines", style = "margin-bottom: -3px"),
|
916 |
+
Li('No space before keyword "Answer:"', style = "margin-bottom: -3px"),
|
917 |
+
),
|
918 |
P(B("Filters Applied: ")),
|
919 |
Ul(
|
920 |
Li("No filtering was applied to DM Math", style = "margin-bottom: -3px"),
|
|
|
940 |
H3("PG-19"),
|
941 |
P("A collection of books from Project Gutenberg, a digital library of public domain works. This contains all the books that were published before 1919."),
|
942 |
P(B("Download and Extraction: "), "The dataset was downloaded directly from Huggingface: ", A("https://huggingface.co/datasets/deepmind/pg19", href="https://huggingface.co/datasets/deepmind/pg19"), "."),
|
943 |
+
P(B("Unique Data Preparation Challenges: ")),
|
944 |
+
Ul(
|
945 |
+
Li("Consecutive whitespaces were found spanning 10+ whitespace entries. These whitespaces were reduced to a single whitespace.", style = "margin-bottom: -3px"),
|
946 |
+
Li("Consecutive new lines were found in some documents. All runs of more than two consecutive new lines were reduced to two new lines.", style = "margin-bottom: -3px"),
|
947 |
+
Li("Delimiters such as * * * * * * * * ? were found. They were removed and replaced with whitespace.", style = "margin-bottom: -3px"),
|
948 |
+
),
|
949 |
P(B("Filters Applied:")),
|
950 |
Ul(
|
951 |
Li("Language Filter: English", style = "margin-bottom: -3px"),
|