[ [ "wudaocorpora", { "languages": [ { "ln_code": "zh", "dataset_name": "lm_zh_wudaocorpora", "size": 332.029883935, "--filters": "", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_1024" } ], "total": 332.029883935, "hf_info": { "description": "WuDaoCorpora is a large-scale and high-quality data set constructed by Beijing Academy of Artificial Intelligence\n(BAAI), which is used to support the research of Wudao large pre-training model. Since the first release of\nWuDaoCorpora on 20th March , 2021, it has immediately attracted the attention of the industry. At present, 61 research\nteams from 41 enterprises, colleges and academies have applied for use.\nWuDaoCorpora2.0 contains the world\u2019s largest text data set, the world\u2019s largest multi-modal data set and the world\u2019s\nlargest Chinese dialogue data set. They are aiming at condensing Chinese language environment, establishing connection\nbetween text and image and concluding core rules of dialogue respectively. 
Our data set is multi-dimensional and\nworld-class, which is beneficial for the development of artificial general intelligence in China.\nWe open 200GB WDC-text data for academic research, while the full amount of WDC-text, WDC-ImageCaption and WDC-Dialogue\ndatasets only for WuDao team.\n", "citation": "@article{YUAN202165,\n title = {WuDaoCorpora: A super large-scale Chinese corpora for pre-training language models},\n journal = {AI Open},\n volume = {2},\n pages = {65-68},\n year = {2021},\n issn = {2666-6510},\n doi = {https://doi.org/10.1016/j.aiopen.2021.06.001},\n url = {https://www.sciencedirect.com/science/article/pii/S2666651021000152},\n author = {Sha Yuan and Hanyu Zhao and Zhengxiao Du and Ming Ding and Xiao Liu and Yukuo Cen and Xu Zou and Zhilin Yang and Jie Tang},\n keywords = {Pre-trained language models, Chinese corpus, Transformer-XL},\n}\n", "license": "For academic research", "homepage": "https://data.wudaoai.cn/", "hf_id": "wu_dao_corpora" }, "catalogue_info": { "uid": "wudaocorpora", "type": "processed", "description": { "name": "WuDaoCorpora", "description": "WuDaoCorpora is a super large-scale Chinese corpora for pre-training language models.\nThe base version of WuDaoCorpora contains about 200GB training data and 72 billion Chinese characters.", "homepage": "https://resource.wudaoai.cn/home", "validated": true }, "languages": { "language_names": [ "Chinese" ], "language_comments": "", "language_locations": [ "Eastern Asia", "China" ], "validated": false }, "custodian": { "name": "Beijing Academy of Artificial Intelligence", "in_catalogue": "", "type": "A university or research institution", "location": "China", "contact_name": "", "contact_email": "press@baai.ac.cn", "contact_submitter": false, "additional": "https://www.baai.ac.cn/", "validated": false }, "availability": { "procurement": { "for_download": "Yes - after signing a user agreement", "download_url": "https://resource.wudaoai.cn/home", "download_email": "" }, "licensing": { 
"has_licenses": "Yes", "license_text": "https://resource.wudaoai.cn/use-agreement", "license_properties": [ "non-commercial use" ], "license_list": [] }, "pii": { "has_pii": "Unclear", "generic_pii_likely": "", "generic_pii_list": [], "numeric_pii_likely": "", "numeric_pii_list": [], "sensitive_pii_likely": "", "sensitive_pii_list": [], "no_pii_justification_class": "other", "no_pii_justification_text": "In the paper WuDaoCorpora: A Super Large-scale Chinese Corporafor Pre-training Language Models (https://ks3-cn-beijing.ksyun.com/resources/WuDaoCorpora/WuDaoCorpora__A_Super_Large_scale_Chinese_Corporafor_Pre_training_Language_Models.pdf), the author claims that \"To protect everyone\u2019s privacy security to the greatest extent, we use Regular Expression to match private information (i.e., identity number, phone number, qq number, email address, etc.) and remove them from the dataset.\"" }, "validated": false }, "processed_from_primary": { "from_primary": "Taken from primary source", "primary_availability": "No - the dataset curators describe the primary sources but they are fully private", "primary_license": "", "primary_types": [], "validated": false, "from_primary_entries": [] }, "media": { "category": [ "text" ], "text_format": [ ".TXT" ], "audiovisual_format": [], "image_format": [], "database_format": [ ".JSON" ], "text_is_transcribed": "No", "instance_type": "post", "instance_count": "1M1000 characters \n - exclude GPL \n - filter_token_len_avg_std \n - filter_text_len \n - filter_special_character_ratio \n - filter_longest_line \n - filter_by_all \n" } ], [ "s2orc_ai2_pdf_parses", { "languages": [ { "ln_code": "en", "dataset_name": "lm_en_s2orc_ai2_pdf_parses", "size": 155.9089287497206, "--filters": "", "--dedups": "dedup_document filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_1024" } ], "total": 155.9089287497206, "catalogue_info": { "uid": 
"s2orc_the_semantic_scholar_open_research_corpus", "type": "primary", "description": { "name": "S2ORC: The Semantic Scholar Open Research Corpus", "description": "Largest collection of machine-readable English-language open-access scientific literature formatted to support NLP research. 136M papers with titles and abstracts, including 12.7M papers with full text. Unifies popular resources like PubMed Central (Biomedicine) and arXiv (Physics, Math, CS) with papers sourced across many different academic disciplines. Maintained by the Semantic Scholar Research team at AI2. https://aclanthology.org/2020.acl-main.447/", "homepage": "https://github.com/allenai/s2orc", "validated": true }, "languages": { "language_names": [ "English" ], "language_comments": "", "language_locations": [ "World-Wide" ], "validated": false }, "custodian": { "name": "Semantic Scholar / Allen Institute for AI", "in_catalogue": "", "type": "A nonprofit/NGO (other)", "location": "United States of America", "contact_name": "Kyle Lo", "contact_email": "kylel@allenai.org", "contact_submitter": true, "additional": "http://allenai.org/", "validated": false }, "availability": { "procurement": { "for_download": "Yes - after signing a user agreement", "download_url": "https://docs.google.com/forms/d/1fUqUw68dDMnzFt58WgMi-FI33MPcVFpflN2G3Yjfn9c/edit", "download_email": "" }, "licensing": { "has_licenses": "Yes", "license_text": "", "license_properties": [ "non-commercial use" ], "license_list": [ "cc-by-nc-2.0: Creative Commons Attribution Non Commercial 2.0 Generic" ] }, "pii": { "has_pii": "Yes", "generic_pii_likely": "very likely", "generic_pii_list": [ "names", "email addresses", "physical addresses", "URLs", "website account name or handle" ], "numeric_pii_likely": "somewhat likely", "numeric_pii_list": [ "telephone numbers" ], "sensitive_pii_likely": "unlikely", "sensitive_pii_list": [], "no_pii_justification_class": "", "no_pii_justification_text": "" }, "validated": false }, "source_category": { 
"category_type": "collection", "category_web": "", "category_media": "scientific articles/journal", "validated": false }, "media": { "category": [ "text" ], "text_format": [ ".XHTML", ".TXT", ".CSV", ".TEX", "other", ".JSON" ], "audiovisual_format": [], "image_format": [ "other", ".PDF" ], "database_format": [ ".TAR", ".JSON", ".GZIP", ".TGZ" ], "text_is_transcribed": "Yes - image", "instance_type": "article", "instance_count": "1M10,000", "validated": false }, "fname": "open_subtitles.json" }, "data_card": "# Open Subtitles\n\n- Dataset uid: `open_subtitles`\n\n### Description\n\nA community repository for subtitles, with a total of 3.36 million subtitle files covering more than 60 languages\n\n### Homepage\n\nhttps://www.opensubtitles.com/en/home\n\n### Licensing\n\n\n\n### Speaker Locations\n\n- World-Wide\n\n\n### Sizes\n\n- 3.0150 % of total\n- 5.0599 % of en\n- 6.5686 % of ar\n- 13.5783 % of es\n- 13.1277 % of pt\n- 3.3240 % of fr\n- 0.4580 % of zh\n- 20.9593 % of id\n- 1.9182 % of vi\n- 1.1647 % of indic-ml\n- 0.2794 % of indic-bn\n- 1.4829 % of eu\n- 0.1543 % of ca\n- 0.0633 % of indic-hi\n- 0.0342 % of indic-ta\n- 0.1286 % of indic-ur\n- 0.0671 % of indic-te\n\n### BigScience processing steps\n\n#### Filters applied to: en\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n\n#### Filters applied to: ar\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: es\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n\n#### Filters applied to: pt\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: fr\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n\n#### Filters applied to: zh\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n\n#### Filters applied to: id\n\n- dedup_document\n- filter_remove_empty_docs\n- 
filter_small_docs_bytes_300\n\n#### Filters applied to: vi\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-ml\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-bn\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: eu\n\n- dedup_document\n- filter_remove_empty_docs\n\n#### Filters applied to: ca\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n\n#### Filters applied to: indic-hi\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-ta\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-ur\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-te\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n" } ], [ "uncorpus", { "languages": [ { "ln_code": "ar", "dataset_name": "lm_ar_uncorpus", "size": 14.130924048, "--filters": "", "--dedups": "dedup_document filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "fr", "dataset_name": "lm_fr_uncorpus", "size": 5.966635998, "--filters": "", "--dedups": "dedup_document filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_1024" }, { "ln_code": "es", "dataset_name": "lm_es_uncorpus", "size": 4.96533127, "--filters": "", "--dedups": "dedup_document filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_1024" }, { "ln_code": "en", "dataset_name": "lm_en_uncorpus", "size": 4.569041804, "--filters": "", "--dedups": "dedup_document filter_remove_empty_docs", "--maps-and-filters argument": "", 
"--filter-short-documents": "filter_small_docs_bytes_1024" }, { "ln_code": "zh", "dataset_name": "lm_zh_uncorpus", "size": 4.347032748, "--filters": "", "--dedups": "dedup_document filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_1024" } ], "total": 33.978965868, "data_card": "# uncorpus\n\n- Dataset uid: `uncorpus`\n\n### Description\n\n\n\n### Homepage\n\n\n\n### Licensing\n\n\n\n### Speaker Locations\n\n\n\n### Sizes\n\n- 2.8023 % of total\n- 10.7390 % of ar\n- 5.7970 % of fr\n- 9.7477 % of es\n- 2.0417 % of en\n- 1.2540 % of zh\n\n### BigScience processing steps\n\n#### Filters applied to: ar\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: fr\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n\n#### Filters applied to: es\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n\n#### Filters applied to: en\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n\n#### Filters applied to: zh\n\n- dedup_document\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n\n" } ], [ "wikisource_filtered", { "languages": [ { "ln_code": "fr", "dataset_name": "lm_fr_wikisource_filtered", "size": 13.162524173, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_1024" }, { "ln_code": "indic-bn", "dataset_name": "lm_indic-bn_wikisource_filtered", "size": 5.244653795, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "indic-ta", "dataset_name": "lm_indic-ta_wikisource_filtered", "size": 3.291679614, 
"--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "ar", "dataset_name": "lm_ar_wikisource_filtered", "size": 3.089359264, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "indic-hi", "dataset_name": "lm_indic-hi_wikisource_filtered", "size": 1.300336649, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "indic-te", "dataset_name": "lm_indic-te_wikisource_filtered", "size": 1.058453083, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "es", "dataset_name": "lm_es_wikisource_filtered", "size": 0.873833464, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_1024" }, { "ln_code": "indic-kn", "dataset_name": "lm_indic-kn_wikisource_filtered", "size": 0.800927432, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "remove_wiki_mojibake", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "indic-ml", "dataset_name": "lm_indic-ml_wikisource_filtered", "size": 
0.662517523, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "indic-mr", "dataset_name": "lm_indic-mr_wikisource_filtered", "size": 0.594944504, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "indic-gu", "dataset_name": "lm_indic-gu_wikisource_filtered", "size": 0.400738545, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "indic-as", "dataset_name": "lm_indic-as_wikisource_filtered", "size": 0.391661802, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "" }, { "ln_code": "pt", "dataset_name": "lm_pt_wikisource_filtered", "size": 0.266379594, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "indic-pa", "dataset_name": "lm_indic-pa_wikisource_filtered", "size": 0.238759401, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "vi", "dataset_name": "lm_vi_wikisource_filtered", "size": 0.224057831, "--filters": 
"filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "indic-or", "dataset_name": "lm_indic-or_wikisource_filtered", "size": 0.113016594, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "" }, { "ln_code": "ca", "dataset_name": "lm_ca_wikisource_filtered", "size": 0.076319472, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_1024" }, { "ln_code": "id", "dataset_name": "lm_id_wikisource_filtered", "size": 0.075149951, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "filter_small_docs_bytes_300" }, { "ln_code": "eu", "dataset_name": "lm_eu_wikisource_filtered", "size": 0.031386521, "--filters": "filter_wiki_user_titles filter_wiki_non_text_type", "--dedups": "dedup_document dedup_template_soft filter_remove_empty_docs", "--maps-and-filters argument": "", "--filter-short-documents": "" } ], "total": 31.896699211999998, "data_card": "# wikisource_filtered\n\n- Dataset uid: `wikisource_filtered`\n\n### Description\n\n\n\n### Homepage\n\n\n\n### Licensing\n\n\n\n### Speaker Locations\n\n\n\n### Sizes\n\n- 2.6306 % of total\n- 12.7884 % of fr\n- 19.8886 % of indic-bn\n- 20.9966 % of indic-ta\n- 2.3478 % of ar\n- 4.7068 % of indic-hi\n- 18.0998 % of indic-te\n- 1.7155 % of es\n- 19.4800 % of indic-kn\n- 9.1737 % of indic-ml\n- 17.1771 % of indic-mr\n- 17.1870 % of indic-gu\n- 70.3687 % of indic-as\n- 1.0165 % of 
pt\n- 7.8642 % of indic-pa\n- 1.3501 % of vi\n- 4.9411 % of indic-or\n- 0.5307 % of ca\n- 2.3593 % of id\n- 1.5928 % of eu\n\n### BigScience processing steps\n\n#### Filters applied to: fr\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n\n#### Filters applied to: indic-bn\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-ta\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: ar\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-hi\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-te\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: es\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n\n#### Filters applied to: indic-kn\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- remove_wiki_mojibake\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-ml\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-mr\n\n- filter_wiki_user_titles\n- 
filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-gu\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-as\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n\n#### Filters applied to: pt\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-pa\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: vi\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: indic-or\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n\n#### Filters applied to: ca\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_1024\n\n#### Filters applied to: id\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- filter_small_docs_bytes_300\n\n#### Filters applied to: eu\n\n- filter_wiki_user_titles\n- filter_wiki_non_text_type\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n\n" } ], [ "stackexchange", { "languages": [ { "ln_code": "code", "dataset_name": "lm_code_stackexchange", "size": 27.54127769, "--filters": "", "--dedups": "dedup_document filter_remove_empty_docs", "--maps-and-filters argument": "", 
"--filter-short-documents": "filter_small_docs_bytes_1024" } ], "total": 27.54127769, "catalogue_info": { "uid": "stack_exchange_website", "type": "primary", "description": { "name": "Stack Exchange Website", "description": "Launched in 2010, the Stack Exchange network comprises 173 Q&A communities including Stack Overflow, the largest, most trusted online community for developers to learn, share their knowledge, and build their careers. ", "homepage": "https://stackexchange.com/", "validated": true }, "languages": { "language_names": [ "English" ], "language_comments": "", "language_locations": [ "Northern America" ], "validated": false }, "custodian": { "name": "Stack Exchange Inc.", "in_catalogue": "", "type": "A commercial entity", "location": "United States of America", "contact_name": "legal team", "contact_email": "legal@stackoverflow.com", "contact_submitter": false, "additional": "https://en.wikipedia.org/wiki/Stack_Exchange", "validated": false }, "availability": { "procurement": { "for_download": "No - we would need to spontaneously reach out to the current owners/custodians", "download_url": "", "download_email": "legal@stackoverflow.com" }, "licensing": { "has_licenses": "Yes", "license_text": "Subscriber Content\n\nYou agree that any and all content, including without limitation any and all text, graphics, logos, tools, photographs, images, illustrations, software or source code, audio and video, animations, and product feedback (collectively, \u201cContent\u201d) that you provide to the public Network (collectively, \u201cSubscriber Content\u201d), is perpetually and irrevocably licensed to Stack Overflow on a worldwide, royalty-free, non-exclusive basis pursuant to Creative Commons licensing terms (CC BY-SA 4.0), and you grant Stack Overflow the perpetual and irrevocable right and license to access, use, process, copy, distribute, export, display and to commercially exploit such Subscriber Content, even if such Subscriber Content has been 
contributed and subsequently removed by you as reasonably necessary to, for example (without limitation):\n\n Provide, maintain, and update the public Network\n Process lawful requests from law enforcement agencies and government agencies\n Prevent and address security incidents and data security features, support features, and to provide technical assistance as it may be required\n Aggregate data to provide product optimization\n\nThis means that you cannot revoke permission for Stack Overflow to publish, distribute, store and use such content and to allow others to have derivative rights to publish, distribute, store and use such content. The CC BY-SA 4.0 license terms are explained in further detail by Creative Commons, and the license terms applicable to content are explained in further detail here. You should be aware that all Public Content you contribute is available for public copy and redistribution, and all such Public Content must have appropriate attribution.\n\nAs stated above, by agreeing to these Public Network Terms you also agree to be bound by the terms and conditions of the Acceptable Use Policy incorporated herein, and hereby acknowledge and agree that any and all Public Content you provide to the public Network is governed by the Acceptable Use Policy.", "license_properties": [ "open license" ], "license_list": [ "cc-by-sa-4.0: Creative Commons Attribution Share Alike 4.0 International" ] }, "pii": { "has_pii": "Yes", "generic_pii_likely": "very likely", "generic_pii_list": [ "names", "website account name or handle", "email addresses" ], "numeric_pii_likely": "somewhat likely", "numeric_pii_list": [ "telephone numbers" ], "sensitive_pii_likely": "very likely", "sensitive_pii_list": [ "political opinions", "racial or ethnic origin", "religious or philosophical beliefs" ], "no_pii_justification_class": "", "no_pii_justification_text": "" }, "validated": false }, "source_category": { "category_type": "website", "category_web": "forum", 
"category_media": "", "validated": false }, "media": { "category": [ "text" ], "text_format": [ ".HTML" ], "audiovisual_format": [], "image_format": [], "database_format": [], "text_is_transcribed": "No", "instance_type": "post", "instance_count": "1M10,000", "validated": true }, "update_time": "10_10_2021__18_34_26", "fname": "ksucca_king_saud_university_corpus_of_classical_arabic-validated-10_10_2021__18_34_26.json" }, "data_card": "# KSUCCA King Saud University Corpus of Classical Arabic\n\n- Dataset uid: `ksucca`\n\n### Description\n\nKing Saud University Corpus of Classical Arabic (KSUCCA) is a pioneering 50 million tokens annotated corpus of Classical Arabic texts from the period of pre-Islamic era until the fourth Hijri century (equivalent to the period from the seventh until early eleventh century CE), which is the period of pure classical Arabic. The main aim of this corpus is to be used for studying the distributional lexical semantics of The Quran words. However, it can be used for other research purposes, such as:\n\u2022 Arabic linguistics, which includes: lexical, morphological, syntactic, semantic and pragmatic research.\n\u2022 Arabic computational linguistics, which includes: lexical, morphological, syntactic, semantic and pragmatic research including their various applications.\n\u2022 Arabic language teaching for both Arabs and non Arabs.\n\u2022 Artificial intelligence.\n\u2022 Natural language processing.\n\u2022 Information retrieval.\n\u2022 Question answering.\n\u2022 Machine translation.\n\n### Homepage\n\nhttps://sourceforge.net/projects/ksucca-corpus/\n\n### Licensing\n\n- open license\n- non-commercial use\n- cc-by-nc-2.0: Creative Commons Attribution Non Commercial 2.0 Generic\n\n\n### Speaker Locations\n\n- Middle East and North Africa\n\n\n### Sizes\n\n- 0.1033 % of total\n- 0.9524 % of ar\n\n### BigScience processing steps\n\n#### Filters applied to: ar\n\n- dedup_document\n- dedup_template_soft\n- filter_remove_empty_docs\n- 
filter_small_docs_bytes_300\n\n" } ], [ "pseudocrawl-filtered_63_www_lanacion_com_ar", { "languages": [ { "ln_code": "es", "dataset_name": "lm_es_pseudocrawl-filtered_63_www_lanacion_com_ar", "size": 1.247608816, "--filters": "", "--dedups": "dedup_document_on_url dedup_document dedup_pseudocrawl_newspapers filter_remove_empty_docs", "--maps-and-filters argument": "remove_lines_with_code", "--filter-short-documents": "filter_small_docs_bytes_1024" } ], "total": 1.247608816, "seed_info": { "domain": "http://www.lanacion.com.ar/", "title": "la nacion (argentina)", "license": "unknown", "location": "argentina", "subject": "general news", "owner": "la nacion (argentina)" }, "data_card": "# la nacion (argentina)\n\n- Dataset uid: `pseudocrawl-filtered_63_www_lanacion_com_ar`\n\n### Description\n\nwebsite: argentina -general news\n\n### Homepage\n\nhttp://www.lanacion.com.ar/\n\n### Licensing\n\n\n\n### Speaker Locations\n\nargentina\n\n### Sizes\n\n- 0.1029 % of total\n- 2.4492 % of es\n\n### BigScience processing steps\n\n#### Filters applied to: es\n\n- dedup_document_on_url\n- dedup_document\n- dedup_pseudocrawl_newspapers\n- filter_remove_empty_docs\n- remove_lines_with_code\n- filter_small_docs_bytes_1024\n\n" } ], [ "pseudocrawl-filtered_20_www_clarin_com", { "languages": [ { "ln_code": "es", "dataset_name": "lm_es_pseudocrawl-filtered_20_www_clarin_com", "size": 1.223161538, "--filters": "", "--dedups": "dedup_document_on_url dedup_document dedup_pseudocrawl_newspapers filter_remove_empty_docs", "--maps-and-filters argument": "remove_lines_with_code", "--filter-short-documents": "filter_small_docs_bytes_1024" } ], "total": 1.223161538, "seed_info": { "domain": "http://www.clarin.com/", "title": "Clar\u00edn - Argentina", "license": "unknown", "location": "Argentina", "subject": "General News", "owner": "Clar\u00edn - Argentina" }, "catalogue_info": { "uid": "clar\u00edn_argentina", "type": "primary", "description": { "name": "Clar\u00edn - Argentina", 
"description": "Is the largest newspaper in Argentina and the second most circulated in the Spanish-speaking world.", "homepage": "https://www.clarin.com/", "validated": true }, "languages": { "language_names": [ "Spanish" ], "language_comments": "", "language_locations": [ "Latin America and the Caribbean", "Argentina" ], "validated": false }, "custodian": { "name": "Grupo Clar\u00edn", "in_catalogue": "", "type": "A commercial entity", "location": "Argentina", "contact_name": "GRUPO CLAR\u00cdN S.A", "contact_email": "institucional@grupoclarin.com", "contact_submitter": false, "additional": "https://es.wikipedia.org/wiki/Clar%C3%ADn_(peri%C3%B3dico)", "validated": false }, "availability": { "procurement": { "for_download": "No - we would need to spontaneously reach out to the current owners/custodians", "download_url": "", "download_email": "institucional@grupoclarin.com" }, "licensing": { "has_licenses": "Yes", "license_text": "Se permite \u00fanicamente un uso personal e intransferible de las claves de acceso a los Sitios. Todos los contenidos de los Sitios pertenecen a AGEA o, en su caso, a terceras personas y est\u00e1n protegidos por la legislaci\u00f3n sobre propiedad intelectual. Ning\u00fan contenido de los Sitios, cualquiera que sea su naturaleza, podr\u00e1 ser bajado, publicado, emitido, retransmitido directa o indirectamente en ning\u00fan medio o soporte para uso distinto del estrictamente personal. Por tanto, queda terminantemente prohibida su utilizaci\u00f3n con fines comerciales, su distribuci\u00f3n, as\u00ed como su modificaci\u00f3n, alteraci\u00f3n o descompilaci\u00f3n. Al acceder a los Sitios el Usuario acepta no vender, no publicar, no distribuir, no retransmitir ni facilitar ning\u00fan acceso a los contenidos de los Sitios a terceros. 
El Usuario acepta no utilizar los Sitios para ning\u00fan prop\u00f3sito ilegal.AGEA se reserva el derecho de restringir o cancelar el acceso a los Sitios si, a su criterio y consideraci\u00f3n, el Usuario utiliza los Sitios para infringir alguna ley, violar derechos de terceros o incumplir las presentes condiciones de contrataci\u00f3n. En caso de producirse descargas masivas de contenidos por parte de un Usuario, AGEA se reserva el derecho de cancelar el acceso a los Sitios de dicho Usuario, anular su suscripci\u00f3n, y/o adoptar las acciones legales que estime oportunas.Asimismo, AGEA no se responsabiliza por los nombres de Usuarios que afectan a personas ajenas, est\u00e1n protegidos por marcas registradas u otras leyes o que resultaren vulgares u ofensivos. Los Usuarios aceptan y reconocen que AGEA no controla, ni supervisa, ni asume responsabilidad alguna por la calidad, seguridad, caracter\u00edsticas y dem\u00e1s elementos de los productos y/o servicios promocionados u ofrecidos en los Sitios.Asimismo, aceptan y reconocen que AGEA no controla, ni supervisa, ni asume responsabilidad acerca de la veracidad y exactitud de la descripci\u00f3n efectuada por aquellos que promocionan u ofrecen sus productos o servicios (en adelante los \"Oferentes\"), ni acerca del cumplimiento de los requisitos legales para ofrecer y vender los productos o servicios, ni sobre la capacidad y legitimaci\u00f3n de los Oferentes para promocionar, ofrecer y/o vender sus bienes o servicios. El material publicitario es propiedad de los solicitantes del espacio. 
AGEA no es responsable del contenido y al respecto rigen las mismas exclusiones que para el material editorial.", "license_properties": [ "copyright - all rights reserved" ], "license_list": [ "other: Other license" ] }, "pii": { "has_pii": "No", "generic_pii_likely": "", "generic_pii_list": [], "numeric_pii_likely": "", "numeric_pii_list": [], "sensitive_pii_likely": "", "sensitive_pii_list": [], "no_pii_justification_class": "general knowledge not written by or referring to private persons", "no_pii_justification_text": "" }, "validated": false }, "source_category": { "category_type": "website", "category_web": "news or magazine website", "category_media": "", "validated": false }, "media": { "category": [ "text", "image" ], "text_format": [ ".HTML" ], "audiovisual_format": [], "image_format": [ ".JPG" ], "database_format": [], "text_is_transcribed": "No", "instance_type": "article", "instance_count": "", "instance_size": "100-10,000", "validated": false }, "fname": "detik_com.json" }, "data_card": "# detik.com\n\n- Dataset uid: `pseudocrawl-filtered_545_www_detik_com`\n\n### Description\n\nDetikcom (stylized as detikcom) is an Indonesian digital media company owned by CT Corp subsidiary Trans Media. Detikcom is an online news portal and publishes breaking news. The portal is consistently ranked among Indonesia's 10 most-visited websites and is among the top 250 in the world. 
It receives approximately 180 million visits per day.\n\n### Homepage\n\nhttps://www.detik.com/\n\n### Licensing\n\n- copyright - all rights reserved\n- other: Other license\n\nhttps://www.detik.com/copyright\n\n\n### Speaker Locations\n\n- Asia\n- Indonesia\n\n\n### Sizes\n\n- 0.0051 % of total\n- 1.9450 % of id\n\n### BigScience processing steps\n\n#### Filters applied to: id\n\n- dedup_document_on_url\n- dedup_document\n- dedup_pseudocrawl_newspapers\n- filter_remove_empty_docs\n- remove_lines_with_code\n- filter_small_docs_bytes_300\n\n" } ], [ "pseudocrawl-filtered_499_www_today_com_news", { "languages": [ { "ln_code": "en", "dataset_name": "lm_en_pseudocrawl-filtered_499_www_today_com_news", "size": 0.061553826, "--filters": "", "--dedups": "dedup_document_on_url dedup_document dedup_pseudocrawl_newspapers filter_remove_empty_docs", "--maps-and-filters argument": "remove_lines_with_code", "--filter-short-documents": "filter_small_docs_bytes_1024" } ], "total": 0.061553826, "seed_info": { "domain": "https://www.today.com/news/", "title": "News outlet", "license": "unknown", "location": "singapore", "subject": "news", "owner": "" }, "data_card": "# News outlet\n\n- Dataset uid: `pseudocrawl-filtered_499_www_today_com_news`\n\n### Description\n\nwebsite: singapore -news\n\n### Homepage\n\nhttps://www.today.com/news/\n\n### Licensing\n\n\n\n### Speaker Locations\n\nsingapore\n\n### Sizes\n\n- 0.0051 % of total\n- 0.0275 % of en\n\n### BigScience processing steps\n\n#### Filters applied to: en\n\n- dedup_document_on_url\n- dedup_document\n- dedup_pseudocrawl_newspapers\n- filter_remove_empty_docs\n- remove_lines_with_code\n- filter_small_docs_bytes_1024\n\n" } ], [ "pseudocrawl-filtered_548_remezcla_com", { "languages": [ { "ln_code": "en", "dataset_name": "lm_en_pseudocrawl-filtered_548_remezcla_com", "size": 0.061434686, "--filters": "", "--dedups": "dedup_document_on_url dedup_document dedup_pseudocrawl_newspapers filter_remove_empty_docs", "--maps-and-filters 
argument": "remove_lines_with_code", "--filter-short-documents": "filter_small_docs_bytes_1024" } ], "total": 0.061434686, "catalogue_info": { "uid": "remezcla_com", "type": "primary", "description": { "name": "Remezcla.com", "description": "Remezcla is an American media company focusing on the Latin American cultural sphere. It serves the millennial market.\n\nRemezcla started as a grassroots project among writers and creatives that was led by co-founders Claire Frisbie, Nuria Net, and Andrew Herrera. We shared one common point of view: there were so many great stories about new Latin music, culture, and events that no one was covering. Traditional Latin media was not for us. We were called \u201calternative,\u201d but to us, what we were covering was our new mainstream. Along the way we met so many like-minded friends in other cities and countries that it sparked a movement. Answering \u201cWhat is Remezcla?\u201d is difficult for me because what started in living rooms and coffee shops among friends has grown to be so much more; today we reach millions of readers and have built a brand that goes beyond our publication.", "homepage": "https://remezcla.com/", "validated": true }, "languages": { "language_names": [ "English" ], "language_comments": "", "language_locations": [ "United States of America" ], "validated": false }, "custodian": { "name": "Remezcla", "in_catalogue": "", "type": "A commercial entity", "location": "United States of America", "contact_name": "Remezcla.com", "contact_email": "info@remezcla.com", "contact_submitter": true, "additional": "https://en.wikipedia.org/wiki/Remezcla", "validated": false }, "availability": { "procurement": { "for_download": "No - we would need to spontaneously reach out to the current owners/custodians", "download_url": "", "download_email": "info@remezcla.com" }, "licensing": { "has_licenses": "Yes", "license_text": "https://remezcla.com/terms-conditions/", "license_properties": [ "copyright - all rights reserved" 
], "license_list": [ "other: Other license" ] }, "pii": { "has_pii": "Yes", "generic_pii_likely": "very likely", "generic_pii_list": [ "names", "email addresses", "website account name or handle", "full-face photographs and comparable images", "dates (birth, death, etc.)" ], "numeric_pii_likely": "somewhat likely", "numeric_pii_list": [ "telephone numbers" ], "sensitive_pii_likely": "very likely", "sensitive_pii_list": [ "racial or ethnic origin", "political opinions", "religious or philosophical beliefs", "data concerning a person's sex life or sexual orientation" ], "no_pii_justification_class": "", "no_pii_justification_text": "" }, "validated": false }, "source_category": { "category_type": "website", "category_web": "news or magazine website", "category_media": "", "validated": false }, "media": { "category": [ "text" ], "text_format": [ ".HTML" ], "audiovisual_format": [], "image_format": [], "database_format": [], "text_is_transcribed": "No", "instance_type": "article", "instance_count": "1K