{"_id":"621ffdd236468d709f18200d","id":"Salesforce/wikitext","author":"Salesforce","sha":"b08601e04326c79dfdd32d625aee71d232d685c3","lastModified":"2024-01-04T16:49:18.000Z","private":false,"gated":false,"disabled":false,"tags":["task_categories:text-generation","task_categories:fill-mask","task_ids:language-modeling","task_ids:masked-language-modeling","annotations_creators:no-annotation","language_creators:crowdsourced","multilinguality:monolingual","source_datasets:original","language:en","license:cc-by-sa-3.0","license:gfdl","size_categories:1M<n<10M","format:parquet","modality:text","library:datasets","library:dask","library:polars","library:mlcroissant","arxiv:1609.07843","region:us"],"description":"\n\t\n\t\t\n\t\n\t\n\t\tDataset Card for \"wikitext\"\n\t\n\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Summary\n\t\n\n The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike License.\nCompared to the preprocessed version of Penn Treebank (PTB), WikiText-2 is over 2 times larger and WikiText-103 is over\n110 times larger. The WikiText dataset also features a far… See the full description on the dataset page: https://huggingface.co/datasets/Salesforce/wikitext.","paperswithcode_id":"wikitext-2","downloads":1342199,"likes":700,"cardData":{"annotations_creators":["no-annotation"],"language_creators":["crowdsourced"],"language":["en"],"license":["cc-by-sa-3.0","gfdl"],"multilinguality":["monolingual"],"size_categories":["1M<n<10M"],"source_datasets":["original"],"task_categories":["text-generation","fill-mask"],"task_ids":["language-modeling","masked-language-modeling"],"paperswithcode_id":"wikitext-2","pretty_name":"WikiText","dataset_info":[{"config_name":"wikitext-103-raw-v1","features":[{"name":"text","dtype":"string"}],"splits":[{"name":"test","num_bytes":1305088,"num_examples":4358},{"name":"train","num_bytes":546500949,"num_examples":1801350},{"name":"validation","num_bytes":1159288,"num_examples":3760}],"download_size":315466397,"dataset_size":548965325},{"config_name":"wikitext-103-v1","features":[{"name":"text","dtype":"string"}],"splits":[{"name":"test","num_bytes":1295575,"num_examples":4358},{"name":"train","num_bytes":545141915,"num_examples":1801350},{"name":"validation","num_bytes":1154751,"num_examples":3760}],"download_size":313093838,"dataset_size":547592241},{"config_name":"wikitext-2-raw-v1","features":[{"name":"text","dtype":"string"}],"splits":[{"name":"test","num_bytes":1305088,"num_examples":4358},{"name":"train","num_bytes":11061717,"num_examples":36718},{"name":"validation","num_bytes":1159288,"num_examples":3760}],"download_size":7747362,"dataset_size":13526093},{"config_name":"wikitext-2-v1","features":[{"name":"text","dtype":"string"}],"splits":[{"name":"test","num_bytes":1270947,"num_examples":4358},{"name":"train","num_bytes":10918118,"num_examples":36718},{"name":"validation","num_bytes":1134123,"num_examples":3760}],"download_size":7371282,"dataset_size":13323188}],"configs":[{"config_name":"wikitext-103-raw-v1","data_files":[{"split":"test","path":"wikitext-103-raw-v1/test-*"},{"split":"train","path":"wikitext-103-raw-v1/train-*"},{"split":"validation","path":"wikitext-103-raw-v1/validation-*"}]},{"config_name":"wikitext-103-v1","data_files":[{"split":"test","path":"wikitext-103-v1/test-*"},{"split":"train","path":"wikitext-103-v1/train-*"},{"split":"validation","path":"wikitext-103-v1/validation-*"}]},{"config_name":"wikitext-2-raw-v1","data_files":[{"split":"test","path":"wikitext-2-raw-v1/test-*"},{"split":"train","path":"wikitext-2-raw-v1/train-*"},{"split":"validation","path":"wikitext-2-raw-v1/validation-*"}]},{"config_name":"wikitext-2-v1","data_files":[{"split":"test","path":"wikitext-2-v1/test-*"},{"split":"train","path":"wikitext-2-v1/train-*"},{"split":"validation","path":"wikitext-2-v1/validation-*"}]}]},"siblings":[{"rfilename":".gitattributes"},{"rfilename":"README.md"},{"rfilename":"wikitext-103-raw-v1/test-00000-of-00001.parquet"},{"rfilename":"wikitext-103-raw-v1/train-00000-of-00002.parquet"},{"rfilename":"wikitext-103-raw-v1/train-00001-of-00002.parquet"},{"rfilename":"wikitext-103-raw-v1/validation-00000-of-00001.parquet"},{"rfilename":"wikitext-103-v1/test-00000-of-00001.parquet"},{"rfilename":"wikitext-103-v1/train-00000-of-00002.parquet"},{"rfilename":"wikitext-103-v1/train-00001-of-00002.parquet"},{"rfilename":"wikitext-103-v1/validation-00000-of-00001.parquet"},{"rfilename":"wikitext-2-raw-v1/test-00000-of-00001.parquet"},{"rfilename":"wikitext-2-raw-v1/train-00000-of-00001.parquet"},{"rfilename":"wikitext-2-raw-v1/validation-00000-of-00001.parquet"},{"rfilename":"wikitext-2-v1/test-00000-of-00001.parquet"},{"rfilename":"wikitext-2-v1/train-00000-of-00001.parquet"},{"rfilename":"wikitext-2-v1/validation-00000-of-00001.parquet"}],"createdAt":"2022-03-02T23:29:22.000Z","usedStorage":11667545537}