lakshmi97
/

bert-preprocessed-tokens

Model card Files Files and versions Community

lakshmi97 committed on Nov 21, 2023

Commit

b2659cd

•

1 Parent(s): 07c0de9

Delete wikiTokenisedValid

Browse files

Files changed (4) hide show

wikiTokenisedValid/data-00000-of-00001.arrow +0 -3
wikiTokenisedValid/dataset_info.json +0 -75
wikiTokenisedValid/readme.md +0 -0
wikiTokenisedValid/state.json +0 -13

wikiTokenisedValid/data-00000-of-00001.arrow DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1757fade9c380370679721c0adebfa70a535cfe7fcf73950b3b921616d6f97bd
-size 2458880

wikiTokenisedValid/dataset_info.json DELETED Viewed

@@ -1,75 +0,0 @@
-{
-  "builder_name": "wikitext",
-  "citation": "@misc{merity2016pointer,\n      title={Pointer Sentinel Mixture Models},\n      author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},\n      year={2016},\n      eprint={1609.07843},\n      archivePrefix={arXiv},\n      primaryClass={cs.CL}\n}\n",
-  "config_name": "wikitext-103-raw-v1",
-  "dataset_name": "wikitext",
-  "dataset_size": 548965325,
-  "description": " The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified\n Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike\n License.\n",
-  "download_checksums": {
-    "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip": {
-      "num_bytes": 191984949,
-      "checksum": null
-    }
-  },
-  "download_size": 191984949,
-  "features": {
-    "input_ids": {
-      "feature": {
-        "dtype": "int32",
-        "_type": "Value"
-      },
-      "_type": "Sequence"
-    },
-    "token_type_ids": {
-      "feature": {
-        "dtype": "int8",
-        "_type": "Value"
-      },
-      "_type": "Sequence"
-    },
-    "attention_mask": {
-      "feature": {
-        "dtype": "int8",
-        "_type": "Value"
-      },
-      "_type": "Sequence"
-    },
-    "next_sentence_label": {
-      "dtype": "int64",
-      "_type": "Value"
-    }
-  },
-  "homepage": "https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/",
-  "license": "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)",
-  "size_in_bytes": 740950274,
-  "splits": {
-    "test": {
-      "name": "test",
-      "num_bytes": 1305088,
-      "num_examples": 4358,
-      "dataset_name": "wikitext"
-    },
-    "train": {
-      "name": "train",
-      "num_bytes": 546500949,
-      "num_examples": 1801350,
-      "shard_lengths": [
-        1649000,
-        152350
-      ],
-      "dataset_name": "wikitext"
-    },
-    "validation": {
-      "name": "validation",
-      "num_bytes": 1159288,
-      "num_examples": 3760,
-      "dataset_name": "wikitext"
-    }
-  },
-  "version": {
-    "version_str": "1.0.0",
-    "major": 1,
-    "minor": 0,
-    "patch": 0
-  }
-}

wikiTokenisedValid/readme.md DELETED Viewed

File without changes

wikiTokenisedValid/state.json DELETED Viewed

@@ -1,13 +0,0 @@
-{
-  "_data_files": [
-    {
-      "filename": "data-00000-of-00001.arrow"
-    }
-  ],
-  "_fingerprint": "faf39447b7393dbe",
-  "_format_columns": null,
-  "_format_kwargs": {},
-  "_format_type": null,
-  "_output_all_columns": false,
-  "_split": "validation"
-}