koichi12 committed on
Commit
4c8cf60
·
verified ·
1 Parent(s): 097c29a

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. scripts/yans/eval/lm-evaluation-harness/.github/workflows/pull_request.yml +13 -0
  2. scripts/yans/eval/lm-evaluation-harness/.github/workflows/python-app.yml +50 -0
  3. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
  4. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__init__.py +0 -0
  5. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__pycache__/__init__.cpython-310.pyc +0 -0
  6. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__pycache__/asdiv.cpython-310.pyc +0 -0
  7. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/asdiv.py +111 -0
  8. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/dataset_infos.json +1 -0
  9. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__init__.py +0 -0
  10. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__pycache__/__init__.cpython-310.pyc +0 -0
  11. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__pycache__/coqa.cpython-310.pyc +0 -0
  12. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/coqa.py +245 -0
  13. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/dataset_infos.json +1 -0
  14. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__init__.py +0 -0
  15. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__pycache__/__init__.cpython-310.pyc +0 -0
  16. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__pycache__/headqa.cpython-310.pyc +0 -0
  17. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/dataset_infos.json +1 -0
  18. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/headqa.py +162 -0
  19. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__init__.py +0 -0
  20. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__pycache__/__init__.cpython-310.pyc +0 -0
  21. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__pycache__/hendrycks_ethics.cpython-310.pyc +0 -0
  22. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/dataset_infos.json +1 -0
  23. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/hendrycks_ethics.py +229 -0
  24. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__pycache__/__init__.cpython-310.pyc +0 -0
  25. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__init__.py +0 -0
  26. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__pycache__/__init__.cpython-310.pyc +0 -0
  27. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__pycache__/lambada_ja.cpython-310.pyc +0 -0
  28. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/lambada_ja.py +147 -0
  29. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__init__.py +0 -0
  30. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__pycache__/__init__.cpython-310.pyc +0 -0
  31. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__pycache__/logiqa.cpython-310.pyc +0 -0
  32. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/dataset_infos.json +1 -0
  33. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/logiqa.py +124 -0
  34. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__init__.py +0 -0
  35. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__pycache__/__init__.cpython-310.pyc +0 -0
  36. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__pycache__/mutual.cpython-310.pyc +0 -0
  37. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/dataset_infos.json +1 -0
  38. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/mutual.py +136 -0
  39. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__init__.py +0 -0
  40. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__pycache__/__init__.cpython-310.pyc +0 -0
  41. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__pycache__/pile.cpython-310.pyc +0 -0
  42. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/dataset_infos.json +1 -0
  43. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/pile.py +126 -0
  44. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/quac/__init__.py +0 -0
  45. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/quac/__pycache__/__init__.cpython-310.pyc +0 -0
  46. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/quac/__pycache__/quac.cpython-310.pyc +0 -0
  47. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/quac/dataset_infos.json +1 -0
  48. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/quac/quac.py +117 -0
  49. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/sat_analogies/__init__.py +0 -0
  50. scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/sat_analogies/__pycache__/__init__.cpython-310.pyc +0 -0
scripts/yans/eval/lm-evaluation-harness/.github/workflows/pull_request.yml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Runs the repository's pre-commit hooks on every pull request.
name: Pull Request

on: [pull_request]

jobs:
  pre-commit:
    # The ubuntu-20.04 hosted runner image has been retired by GitHub, so jobs
    # pinned to it no longer start; 22.04 still offers Python 3.8 builds.
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          # Quoted so YAML never reads the version as a float (3.10 -> 3.1).
          python-version: "3.8"
      # v2.0.3 bundles a cache client for the retired cache backend.
      # NOTE(review): confirm the v3.0.1 tag before merging.
      - uses: pre-commit/action@v3.0.1
scripts/yans/eval/lm-evaluation-harness/.github/workflows/python-app.yml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Build

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v3
    - name: Cache
      # actions/cache v1-v2 talk to a cache backend that GitHub has shut down,
      # so the v2.1.3 pin fails at runtime; v4 uses the current service.
      uses: actions/cache@v4
      with:
        # A list of files, directories, and wildcard patterns to cache and restore
        path: |
          ~/.cache
        # An explicit key for restoring and saving the cache
        key: evaldata-cache-4
    - name: Set up Python 3.9
      uses: actions/setup-python@v4
      with:
        # Quoted so YAML never reads the version as a float (3.10 -> 3.1).
        python-version: "3.9"
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest pytest-cov
        # Quoted so the extras spec is never treated as a shell glob pattern.
        pip install -e ".[dev,multilingual]"
        # Install optional git dependencies
        pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest -vv --cov=lm_eval/ tests/
    - name: Upload to codecov
      # Secrets are not exported to steps automatically; without this mapping
      # $CODECOV_TOKEN expands to an empty string.
      # NOTE(review): assumes the repository secret is named CODECOV_TOKEN.
      env:
        CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
      run: |
        # NOTE(review): the Codecov bash uploader is deprecated upstream;
        # consider migrating to codecov/codecov-action.
        bash <(curl -s https://codecov.io/bash) -t "$CODECOV_TOKEN"
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (170 Bytes). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__init__.py ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (176 Bytes). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/__pycache__/asdiv.cpython-310.pyc ADDED
Binary file (3.12 kB). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/asdiv.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """ASDIV dataset."""
15
+
16
+
17
+ import os
18
+ import xml.etree.ElementTree as ET
19
+
20
+ import datasets
21
+
22
+
23
+ _CITATION = """\
24
+ @misc{miao2021diverse,
25
+ title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
26
+ author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
27
+ year={2021},
28
+ eprint={2106.15772},
29
+ archivePrefix={arXiv},
30
+ primaryClass={cs.AI}
31
+ }
32
+ """
33
+
34
+ _DESCRIPTION = """\
35
+ ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language
36
+ patterns and problem types) English math word problem (MWP) corpus for evaluating
37
+ the capability of various MWP solvers. Existing MWP corpora for studying AI progress
38
+ remain limited either in language usage patterns or in problem types. We thus present
39
+ a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem
40
+ types taught in elementary school. Each MWP is annotated with its problem type and grade
41
+ level (for indicating the level of difficulty).
42
+ """
43
+
44
+ _HOMEPAGE = "https://github.com/chaochun/nlu-asdiv-dataset"
45
+
46
+ # TODO: Add the licence for the dataset here if you can find it
47
+ _LICENSE = ""
48
+
49
+ _URLS = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip"
50
+
51
+
52
class ASDiv(datasets.GeneratorBasedBuilder):
    """ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers"""

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="asdiv",
            version=VERSION,
            description="A diverse corpus for evaluating and developing english math word problem solvers",
        )
    ]

    # (feature name, XML child tag) pairs, in the order the features are declared.
    _FIELD_TAGS = (
        ("body", "Body"),
        ("question", "Question"),
        ("solution_type", "Solution-Type"),
        ("answer", "Answer"),
        ("formula", "Formula"),
    )

    def _info(self):
        """Return the dataset metadata; every feature is a plain string."""
        features = datasets.Features(
            {name: datasets.Value("string") for name, _ in self._FIELD_TAGS}
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive at `_URLS` and expose one validation split.

        Args:
            dl_manager: download manager whose `download_and_extract` result is
                used as the directory containing the extracted archive.

        Returns:
            A single-element list holding the validation `SplitGenerator`.
        """
        data_dir = dl_manager.download_and_extract(_URLS)
        # Top-level folder inside the archive, named after the pinned commit.
        base_filepath = "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50"
        return [
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(
                        data_dir, base_filepath, "dataset", "ASDiv.xml"
                    ),
                    "split": datasets.Split.VALIDATION,
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        """Yield `(index, example)` for every `Problem` element in the XML file.

        Args:
            filepath: path of the XML file to parse.
            split: split identifier supplied through `gen_kwargs`; not used here.

        Yields:
            The running index of the problem and a dict with one entry per
            feature. A value is `None` when the child element has no text or
            is absent altogether.
        """
        root = ET.parse(filepath).getroot()
        for key, problem in enumerate(root.iter("Problem")):
            example = {}
            for feature, tag in self._FIELD_TAGS:
                node = problem.find(tag)
                # `find` returns None for a missing child; treat that like an
                # empty element instead of failing with an AttributeError.
                example[feature] = None if node is None else node.text
            yield key, example
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/asdiv/dataset_infos.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"asdiv": {"description": "ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n", "citation": "@misc{miao2021diverse,\n title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n year={2021},\n eprint={2106.15772},\n archivePrefix={arXiv},\n primaryClass={cs.AI}\n}\n", "homepage": "https://github.com/chaochun/nlu-asdiv-dataset", "license": "", "features": {"body": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "solution_type": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "formula": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "as_div", "config_name": "asdiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 501489, "num_examples": 2305, "dataset_name": "as_div"}}, "download_checksums": {"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip": {"num_bytes": 440966, "checksum": "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}}, "download_size": 440966, "post_processing_size": null, "dataset_size": 501489, "size_in_bytes": 942455}}
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__init__.py ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (175 Bytes). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/__pycache__/coqa.cpython-310.pyc ADDED
Binary file (4.18 kB). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/coqa.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """CoQA dataset.
15
+
16
+ This `CoQA` adds the "additional_answers" feature that's missing in the original
17
+ datasets version:
18
+ https://github.com/huggingface/datasets/blob/master/datasets/coqa/coqa.py
19
+ """
20
+
21
+
22
+ import json
23
+
24
+ import datasets
25
+
26
+
27
+ _CITATION = """\
28
+ @misc{reddy2018coqa,
29
+ title={CoQA: A Conversational Question Answering Challenge},
30
+ author={Siva Reddy and Danqi Chen and Christopher D. Manning},
31
+ year={2018},
32
+ eprint={1808.07042},
33
+ archivePrefix={arXiv},
34
+ primaryClass={cs.CL}
35
+ }
36
+ """
37
+
38
+ _DESCRIPTION = """\
39
+ CoQA is a large-scale dataset for building Conversational Question Answering
40
+ systems. The goal of the CoQA challenge is to measure the ability of machines to
41
+ understand a text passage and answer a series of interconnected questions that
42
+ appear in a conversation.
43
+ """
44
+
45
+ _HOMEPAGE = "https://stanfordnlp.github.io/coqa/"
46
+
47
+ # TODO: Add the licence for the dataset here if you can find it
48
+ _LICENSE = ""
49
+
50
+ _URLS = {
51
+ "train": "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json",
52
+ "validation": "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json",
53
+ }
54
+
55
+ # `additional_answers` are not available in the train set so we fill them with
56
+ # empty dicts of the same form.
57
+ _EMPTY_ADDITIONAL_ANSWER = {
58
+ "0": [
59
+ {
60
+ "span_start": -1,
61
+ "span_end": -1,
62
+ "span_text": "",
63
+ "input_text": "",
64
+ "turn_id": -1,
65
+ }
66
+ ],
67
+ "1": [
68
+ {
69
+ "span_start": -1,
70
+ "span_end": -1,
71
+ "span_text": "",
72
+ "input_text": "",
73
+ "turn_id": -1,
74
+ }
75
+ ],
76
+ "2": [
77
+ {
78
+ "span_start": -1,
79
+ "span_end": -1,
80
+ "span_text": "",
81
+ "input_text": "",
82
+ "turn_id": -1,
83
+ }
84
+ ],
85
+ }
86
+
87
+
88
class Coqa(datasets.GeneratorBasedBuilder):
    """CoQA is a large-scale dataset for building Conversational Question Answering systems."""

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="coqa", version=VERSION, description="The CoQA dataset."
        ),
    ]

    # Keys of the `additional_answers` mapping, in declaration order.
    _ADDITIONAL_ANSWER_KEYS = ("0", "1", "2")

    @staticmethod
    def _answer_sequence_feature():
        """Build a fresh `Sequence` feature describing one list of answers."""
        return datasets.features.Sequence(
            {
                "span_start": datasets.Value("int32"),
                "span_end": datasets.Value("int32"),
                "span_text": datasets.Value("string"),
                "input_text": datasets.Value("string"),
                "turn_id": datasets.Value("int32"),
            }
        )

    @staticmethod
    def _project_answer(answer):
        """Copy the five answer fields of `answer` into a new dict."""
        return {
            "span_start": answer["span_start"],
            "span_end": answer["span_end"],
            "span_text": answer["span_text"],
            "input_text": answer["input_text"],
            "turn_id": answer["turn_id"],
        }

    def _info(self):
        """Return the dataset metadata, including the `additional_answers` feature."""
        features = datasets.Features(
            {
                "id": datasets.Value("string"),
                "source": datasets.Value("string"),
                "story": datasets.Value("string"),
                "questions": datasets.features.Sequence(
                    {
                        "input_text": datasets.Value("string"),
                        "turn_id": datasets.Value("int32"),
                    }
                ),
                "answers": self._answer_sequence_feature(),
                "additional_answers": {
                    key: self._answer_sequence_feature()
                    for key in self._ADDITIONAL_ANSWER_KEYS
                },
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the train and validation files and expose them as two splits.

        Args:
            dl_manager: download manager; `download_and_extract` is called with
                a dict keyed by "train" and "validation" and its result is
                indexed with the same keys.

        Returns:
            A list with the train and the validation `SplitGenerator`.
        """
        urls = {"train": _URLS["train"], "validation": _URLS["validation"]}
        data_dirs = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dirs["train"],
                    "split": datasets.Split.TRAIN,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dirs["validation"],
                    "split": datasets.Split.VALIDATION,
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        """Yield `(id, example)` for every row under the file's top-level "data" key.

        Args:
            filepath: path of the JSON file to read (opened as UTF-8).
            split: split identifier; for `datasets.Split.TRAIN` the
                `additional_answers` field is filled with placeholder entries,
                otherwise it is read from the row.

        Yields:
            The row's "id" and a dict with the id, story, source, questions,
            answers and additional_answers of that row.
        """
        # Load everything up front so the file is closed before the first yield.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)

        for row in data["data"]:
            questions = [
                {"input_text": q["input_text"], "turn_id": q["turn_id"]}
                for q in row["questions"]
            ]
            answers = [self._project_answer(a) for a in row["answers"]]
            if split == datasets.Split.TRAIN:
                # Hand out a fresh copy per row so a consumer mutating one
                # example cannot alter the module-level placeholder.
                additional_answers = {
                    key: [dict(entry) for entry in entries]
                    for key, entries in _EMPTY_ADDITIONAL_ANSWER.items()
                }
            else:
                additional_answers = {
                    key: [
                        self._project_answer(a)
                        for a in row["additional_answers"][key]
                    ]
                    for key in self._ADDITIONAL_ANSWER_KEYS
                }
            yield row["id"], {
                "id": row["id"],
                "story": row["story"],
                "source": row["source"],
                "questions": questions,
                "answers": answers,
                "additional_answers": additional_answers,
            }
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/coqa/dataset_infos.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"coqa": {"description": "CoQA is a large-scale dataset for building Conversational Question Answering\nsystems. The goal of the CoQA challenge is to measure the ability of machines to\nunderstand a text passage and answer a series of interconnected questions that\nappear in a conversation.\n", "citation": "@misc{reddy2018coqa,\n title={CoQA: A Conversational Question Answering Challenge},\n author={Siva Reddy and Danqi Chen and Christopher D. Manning},\n year={2018},\n eprint={1808.07042},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://stanfordnlp.github.io/coqa/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "story": {"dtype": "string", "id": null, "_type": "Value"}, "questions": {"feature": {"input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "additional_answers": {"0": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "1": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, 
"_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "2": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "coqa", "config_name": "coqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 26250528, "num_examples": 7199, "dataset_name": "coqa"}, "validation": {"name": "validation", "num_bytes": 3765933, "num_examples": 500, "dataset_name": "coqa"}}, "download_checksums": {"https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json": {"num_bytes": 49001836, "checksum": "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6"}, "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json": {"num_bytes": 9090845, "checksum": "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a"}}, "download_size": 58092681, "post_processing_size": null, "dataset_size": 30016461, "size_in_bytes": 88109142}}
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__init__.py ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (177 Bytes). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/__pycache__/headqa.cpython-310.pyc ADDED
Binary file (4.69 kB). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/dataset_infos.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"es": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. 
We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "es", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1196021, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1169819, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 556924, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2922764, "size_in_bytes": 82288266}, "en": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. 
They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. 
We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "en", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1123151, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1097349, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 523462, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2743962, "size_in_bytes": 82109464}}
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/headqa/headqa.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # NOTE: This is an exact copy of
16
+ # https://github.com/huggingface/datasets/blob/3804442bb7cfcb9d52044d92688115cfdc69c2da/datasets/head_qa/head_qa.py
17
+ # with the exception of the `image` feature. This is to avoid adding `Pillow`
18
+ # as a dependency.
19
+ """HEAD-QA: A Healthcare Dataset for Complex Reasoning."""
20
+
21
+
22
+ import json
23
+ import os
24
+
25
+ import datasets
26
+
27
+
28
+ _CITATION = """\
29
+ @inproceedings{vilares-gomez-rodriguez-2019-head,
30
+ title = "{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning",
31
+ author = "Vilares, David and
32
+ G{\'o}mez-Rodr{\'i}guez, Carlos",
33
+ booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
34
+ month = jul,
35
+ year = "2019",
36
+ address = "Florence, Italy",
37
+ publisher = "Association for Computational Linguistics",
38
+ url = "https://www.aclweb.org/anthology/P19-1092",
39
+ doi = "10.18653/v1/P19-1092",
40
+ pages = "960--966",
41
+ abstract = "We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.",
42
+ }
43
+ """
44
+
45
+ _DESCRIPTION = """\
46
+ HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the
47
+ Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio
48
+ de Sanidad, Consumo y Bienestar Social.
49
+ The dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.
50
+ """
51
+
52
+ _HOMEPAGE = "https://aghie.github.io/head-qa/"
53
+
54
+ _LICENSE = "MIT License"
55
+
56
+ _URL = "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t"
57
+
58
+ _DIRS = {"es": "HEAD", "en": "HEAD_EN"}
59
+
60
+
61
class HeadQA(datasets.GeneratorBasedBuilder):
    """Dataset builder for HEAD-QA, with a Spanish ("es") and an English ("en") config."""

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="es",
            version=VERSION,
            description="Spanish HEAD dataset",
        ),
        datasets.BuilderConfig(
            name="en",
            version=VERSION,
            description="English HEAD dataset",
        ),
    ]

    DEFAULT_CONFIG_NAME = "es"

    def _info(self):
        """Return the dataset metadata and the schema of a single example."""
        # Each answer option is a small record; a question carries a list of them.
        answer_schema = {
            "aid": datasets.Value("int32"),
            "atext": datasets.Value("string"),
        }
        example_schema = datasets.Features(
            {
                "name": datasets.Value("string"),
                "year": datasets.Value("string"),
                "category": datasets.Value("string"),
                "qid": datasets.Value("int32"),
                "qtext": datasets.Value("string"),
                "ra": datasets.Value("int32"),
                "answers": [answer_schema],
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=example_schema,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and describe the train/test/validation splits.

        Every split file lives in the language directory selected by the
        active config and is named ``<prefix>_<language directory>.json``.
        """
        data_dir = dl_manager.download_and_extract(_URL)

        lang_dir = _DIRS[self.config.name]
        lang_root = os.path.join(data_dir, lang_dir)

        # (split identifier, file-name prefix); the validation file is "dev".
        split_table = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "dev"),
        )
        return [
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "data_dir": data_dir,
                    "filepath": os.path.join(
                        lang_root, f"{file_prefix}_{lang_dir}.json"
                    ),
                },
            )
            for split_name, file_prefix in split_table
        ]

    def _generate_examples(self, data_dir, filepath):
        """Yield ``(key, example)`` pairs, one per question of every exam.

        Args:
            data_dir: Root of the extracted archive. Not read here; it is
                accepted because it is part of the ``gen_kwargs``.
            filepath: Path of the JSON file holding the split.
        """
        with open(filepath, encoding="utf-8") as handle:
            exams = json.load(handle)["exams"]

        for exam_id, content in enumerate(exams.values()):
            exam_fields = {
                "name": content["name"].strip(),
                "year": content["year"].strip(),
                "category": content["category"].strip(),
            }
            for question in content["data"]:
                qid = int(question["qid"].strip())
                # "aid" is passed through as stored; only the text is stripped.
                answers = [
                    {"aid": option["aid"], "atext": option["atext"].strip()}
                    for option in question["answers"]
                ]
                # The key joins the exam position and the question id.
                yield f"{exam_id}_{qid}", {
                    **exam_fields,
                    "qid": qid,
                    "qtext": question["qtext"].strip(),
                    "ra": int(question["ra"].strip()),
                    "answers": answers,
                }
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__init__.py ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (187 Bytes). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/__pycache__/hendrycks_ethics.cpython-310.pyc ADDED
Binary file (5.19 kB). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/dataset_infos.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"commonsense": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"label": {"dtype": "int32", "id": null, "_type": "Value"}, "input": {"dtype": "string", "id": null, "_type": "Value"}, "is_short": {"dtype": "bool", "id": null, "_type": "Value"}, "edited": {"dtype": "bool", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "commonsense", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 14435215, "num_examples": 13910, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 3150094, "num_examples": 3885, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 17585309, "size_in_bytes": 53170333}, "deontology": 
{"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "excuse": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "deontology", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1931475, "num_examples": 18164, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 384602, "num_examples": 3596, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2316077, "size_in_bytes": 37901101}, "justice": 
{"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Justice subset contains examples focusing on how a character treats another person", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "justice", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2516501, "num_examples": 21791, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 309427, "num_examples": 2704, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2825928, "size_in_bytes": 38410952}, "utilitarianism": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and 
commonsense morality. Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"activity": {"dtype": "string", "id": null, "_type": "Value"}, "baseline": {"dtype": "string", "id": null, "_type": "Value"}, "rating": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "utilitarianism", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2241770, "num_examples": 13738, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 749768, "num_examples": 4808, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 2991538, "size_in_bytes": 38576562}, "virtue": {"description": "The ETHICS dataset is a benchmark that spans concepts in justice, well-being,\nduties, virtues, and commonsense morality. 
Models predict widespread moral\njudgments about diverse text scenarios. This requires connecting physical and\nsocial world knowledge to value judgements, a capability that may enable us\nto steer chatbot outputs or eventually regularize open-ended reinforcement\nlearning agents.\n\nThe Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified", "citation": "@article{hendrycks2021ethics\n title={Aligning AI With Shared Human Values},\n author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},\n journal={Proceedings of the International Conference on Learning Representations (ICLR)},\n year={2021}\n}\n", "homepage": "https://github.com/hendrycks/ethics", "license": "", "features": {"group_id": {"dtype": "int32", "id": null, "_type": "Value"}, "label": {"dtype": "int32", "id": null, "_type": "Value"}, "scenario": {"dtype": "string", "id": null, "_type": "Value"}, "trait": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "hendrycks_ethics", "config_name": "virtue", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 2640328, "num_examples": 28245, "dataset_name": "hendrycks_ethics"}, "test": {"name": "test", "num_bytes": 473473, "num_examples": 4975, "dataset_name": "hendrycks_ethics"}}, "download_checksums": {"https://people.eecs.berkeley.edu/~hendrycks/ethics.tar": {"num_bytes": 35585024, "checksum": "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333"}}, "download_size": 35585024, "post_processing_size": null, "dataset_size": 3113801, "size_in_bytes": 38698825}}
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_ethics/hendrycks_ethics.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """ETHICS dataset."""
15
+ # TODO: Add the `hard` dataset splits.
16
+
17
+
18
+ import csv
19
+ import os
20
+
21
+ import datasets
22
+
23
+
24
+ _CITATION = """\
25
+ @article{hendrycks2021ethics
26
+ title={Aligning AI With Shared Human Values},
27
+ author={Dan Hendrycks and Collin Burns and Steven Basart and Andrew Critch and Jerry Li and Dawn Song and Jacob Steinhardt},
28
+ journal={Proceedings of the International Conference on Learning Representations (ICLR)},
29
+ year={2021}
30
+ }
31
+ """
32
+
33
+ _DESCRIPTION = """\
34
+ The ETHICS dataset is a benchmark that spans concepts in justice, well-being,
35
+ duties, virtues, and commonsense morality. Models predict widespread moral
36
+ judgments about diverse text scenarios. This requires connecting physical and
37
+ social world knowledge to value judgements, a capability that may enable us
38
+ to steer chatbot outputs or eventually regularize open-ended reinforcement
39
+ learning agents.
40
+ """
41
+
42
+ _HOMEPAGE = "https://github.com/hendrycks/ethics"
43
+
44
+ # TODO: Add the licence for the dataset here if you can find it
45
+ _LICENSE = ""
46
+
47
+ _URLS = "https://people.eecs.berkeley.edu/~hendrycks/ethics.tar"
48
+
49
+
50
class EthicsConfig(datasets.BuilderConfig):
    """Configuration of one Hendrycks ETHICS subset."""

    def __init__(self, prefix, features, **kwargs):
        """Create the configuration of a single ETHICS subset.

        Args:
            prefix: *string*, file-name prefix of the subset's CSV files.
            features: the feature specification reported for the subset.
            **kwargs: forwarded unchanged to ``datasets.BuilderConfig``.
        """
        # Every subset is pinned to the same dataset version.
        subset_version = datasets.Version("0.0.1")
        super().__init__(version=subset_version, **kwargs)
        self.prefix = prefix
        self.features = features
65
+
66
+
67
class HendrycksEthics(datasets.GeneratorBasedBuilder):
    """The ETHICS dataset is a benchmark that spans concepts in justice, well-being, duties, virtues, and commonsense morality."""

    # One config per subset; `prefix` is the file-name prefix of its CSV files.
    BUILDER_CONFIGS = [
        EthicsConfig(
            name="commonsense",
            prefix="cm",
            features=datasets.Features(
                {
                    "label": datasets.Value("int32"),
                    "input": datasets.Value("string"),
                    "is_short": datasets.Value("bool"),
                    "edited": datasets.Value("bool"),
                }
            ),
            description="The Commonsense subset contains examples focusing on moral standards and principles that most people intuitively accept.",
        ),
        EthicsConfig(
            name="deontology",
            prefix="deontology",
            features=datasets.Features(
                {
                    "group_id": datasets.Value("int32"),
                    "label": datasets.Value("int32"),
                    "scenario": datasets.Value("string"),
                    "excuse": datasets.Value("string"),
                }
            ),
            description="The Deontology subset contains examples focusing on whether an act is required, permitted, or forbidden according to a set of rules or constraints",
        ),
        EthicsConfig(
            name="justice",
            prefix="justice",
            features=datasets.Features(
                {
                    "group_id": datasets.Value("int32"),
                    "label": datasets.Value("int32"),
                    "scenario": datasets.Value("string"),
                }
            ),
            description="The Justice subset contains examples focusing on how a character treats another person",
        ),
        EthicsConfig(
            name="utilitarianism",
            prefix="util",
            features=datasets.Features(
                {
                    "activity": datasets.Value("string"),
                    "baseline": datasets.Value("string"),
                    "rating": datasets.Value("string"),  # Empty rating.
                }
            ),
            description="The Utilitarianism subset contains scenarios that should be ranked from most pleasant to least pleasant for the person in the scenario",
        ),
        EthicsConfig(
            name="virtue",
            prefix="virtue",
            features=datasets.Features(
                {
                    "group_id": datasets.Value("int32"),
                    "label": datasets.Value("int32"),
                    "scenario": datasets.Value("string"),
                    "trait": datasets.Value("string"),
                }
            ),
            description="The Virtue subset contains scenarios focusing on whether virtues or vices are being exemplified",
        ),
    ]

    def _info(self):
        """Return the metadata of the selected subset.

        The description is the shared ETHICS description followed by the
        subset's own description; the features come from the active config.
        """
        return datasets.DatasetInfo(
            description=f"{_DESCRIPTION}\n{self.config.description}",
            features=self.config.features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and describe the train and test splits.

        The CSV of a split is looked up at
        ``<archive>/ethics/<config name>/<prefix>_<split>.csv``.
        """
        urls = _URLS
        data_dir = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(
                        data_dir,
                        "ethics",
                        self.config.name,
                        f"{self.config.prefix}_train.csv",
                    ),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(
                        data_dir,
                        "ethics",
                        self.config.name,
                        f"{self.config.prefix}_test.csv",
                    ),
                    "split": "test",
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        """Yield ``(key, example)`` pairs read from one subset CSV file.

        Args:
            filepath: Path of the CSV file to read.
            split: Name of the split. Not read here; it is accepted because
                it is part of the ``gen_kwargs``.

        The key is the zero-based row index. Column values are yielded as the
        ``csv`` module returns them. A config name that matches none of the
        branches below yields nothing.
        """
        subset = self.config.name
        # Explicit field names make DictReader treat the first row as data
        # rather than as a header; only the utilitarianism subset is read so.
        fieldnames = ["activity", "baseline"] if subset == "utilitarianism" else None
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        with open(filepath, newline="", encoding="utf-8") as f:
            contents = csv.DictReader(f, fieldnames=fieldnames)
            # For subsets with grouped scenarios, the group id is derived from
            # the row index and the group size.
            for key, row in enumerate(contents):
                if subset == "deontology":
                    # Scenarios come in groups of 4.
                    yield key, {
                        "group_id": key // 4,
                        "label": row["label"],
                        "scenario": row["scenario"],
                        "excuse": row["excuse"],
                    }
                elif subset == "justice":
                    # Scenarios come in groups of 4.
                    yield key, {
                        "group_id": key // 4,
                        "label": row["label"],
                        "scenario": row["scenario"],
                    }
                elif subset == "commonsense":
                    yield key, {
                        "label": row["label"],
                        "input": row["input"],
                        "is_short": row["is_short"],
                        "edited": row["edited"],
                    }
                elif subset == "virtue":
                    # Scenarios come in groups of 5. The CSV column holds the
                    # scenario and the trait joined by " [SEP] ".
                    scenario, trait = row["scenario"].split(" [SEP] ")
                    yield key, {
                        "group_id": key // 5,
                        "label": row["label"],
                        "scenario": scenario,
                        "trait": trait,
                    }
                elif subset == "utilitarianism":
                    yield key, {
                        "activity": row["activity"],
                        "baseline": row["baseline"],
                        "rating": "",
                    }
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/hendrycks_math/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (185 Bytes). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__init__.py ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (181 Bytes). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/__pycache__/lambada_ja.cpython-310.pyc ADDED
Binary file (3.98 kB). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/lambada_ja/lambada_ja.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # TODO: Address all TODOs and remove all explanatory comments
15
+ """LAMBADA (OpenAI) dataset."""
16
+
17
+ import os
18
+ import json
19
+
20
+ import datasets
21
+
22
+
23
+ _CITATION = """\
24
+ @misc{
25
+ author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel},
26
+ title={The LAMBADA dataset},
27
+ DOI={10.5281/zenodo.2630551},
28
+ publisher={Zenodo},
29
+ year={2016},
30
+ month={Aug}
31
+ }
32
+ """
33
+
34
+ _DESCRIPTION = """\
35
+ The LAMBADA dataset as processed by OpenAI. It is used to evaluate the capabilities
36
+ of computational models for text understanding by means of a word prediction task.
37
+ LAMBADA is a collection of narrative texts sharing the characteristic that human subjects
38
+ are able to guess their last word if they are exposed to the whole text, but not
39
+ if they only see the last sentence preceding the target word. To succeed on LAMBADA,
40
+ computational models cannot simply rely on local context, but must be able to keep track
41
+ of information in the broader discourse.
42
+
43
+ Reference: https://github.com/openai/gpt-2/issues/131#issuecomment-497136199
44
+ """
45
+
46
+ _HOMEPAGE = "https://zenodo.org/record/2630551#.X4Xzn5NKjUI"
47
+
48
+ # License string reported for this dataset. TODO: confirm it against the upstream source.
49
+ _LICENSE = "Modified MIT"
50
+
51
+ _BASE_URL = (
52
+ "https://huggingface.co/datasets/EleutherAI/lambada_openai/resolve/main/data"
53
+ )
54
+
55
+ JA_PATH = os.path.join(
56
+ os.path.dirname(os.path.abspath(__file__)), "lambada_test_ja.jsonl"
57
+ )
58
+ _URLS = {
59
+ "default": f"{_BASE_URL}/lambada_test.jsonl",
60
+ "de": f"{_BASE_URL}/lambada_test_de.jsonl",
61
+ "en": f"{_BASE_URL}/lambada_test_en.jsonl",
62
+ "es": f"{_BASE_URL}/lambada_test_es.jsonl",
63
+ "fr": f"{_BASE_URL}/lambada_test_fr.jsonl",
64
+ "it": f"{_BASE_URL}/lambada_test_it.jsonl",
65
+ "ja": "https://gist.githubusercontent.com/mkshing/22b4623233940b2baa2f924e60f9b287/raw/c2c58325f5bc599818fe5f7d6f6b9af3e7699ed6/lambada_test_ja.jsonl",
66
+ }
67
+
68
+
69
class LambadaOpenAI(datasets.GeneratorBasedBuilder):
    """Dataset builder for the LAMBADA word-prediction test set and its translations."""

    VERSION = datasets.Version("1.0.0")

    # One config per entry of the module-level `_URLS` mapping.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="default",
            description="Pre-processed English LAMBADA dataset from OpenAI",
            version=VERSION,
        ),
        datasets.BuilderConfig(
            name="de",
            description="The German translated LAMBADA OpenAI dataset",
            version=VERSION,
        ),
        datasets.BuilderConfig(
            name="en",
            description="The English translated LAMBADA OpenAI dataset",
            version=VERSION,
        ),
        datasets.BuilderConfig(
            name="es",
            description="The Spanish translated LAMBADA OpenAI dataset",
            version=VERSION,
        ),
        datasets.BuilderConfig(
            name="fr",
            description="The French translated LAMBADA OpenAI dataset",
            version=VERSION,
        ),
        datasets.BuilderConfig(
            name="it",
            description="The Italian translated LAMBADA OpenAI dataset",
            version=VERSION,
        ),
        datasets.BuilderConfig(
            name="ja",
            description="The Japanese translated LAMBADA OpenAI dataset",
            version=VERSION,
        ),
    ]

    DEFAULT_CONFIG_NAME = "default"

    def _info(self):
        """Return the dataset metadata; every example has a single "text" field."""
        return datasets.DatasetInfo(
            description=f"{_DESCRIPTION}\n{self.config.description}",
            features=datasets.Features({"text": datasets.Value("string")}),
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the file of the active config and expose it as the test split."""
        source_url = _URLS[self.config.name]
        local_path = dl_manager.download_and_extract(source_url)
        test_split = datasets.SplitGenerator(
            name=datasets.Split.TEST,
            # These kwargs are unpacked into the parameters of _generate_examples.
            gen_kwargs={"filepath": local_path, "split": "test"},
        )
        return [test_split]

    def _generate_examples(self, filepath, split):
        """Yield ``(line index, {"text": ...})`` for every line of a JSON-lines file.

        Args:
            filepath: Path of the JSON-lines file to read.
            split: Name of the split. Not read here; it is accepted because
                it is part of the ``gen_kwargs``.
        """
        with open(filepath, encoding="utf-8") as handle:
            for index, line in enumerate(handle):
                record = json.loads(line)
                yield index, {"text": record["text"]}
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__init__.py ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (177 Bytes). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/__pycache__/logiqa.cpython-310.pyc ADDED
Binary file (3.34 kB). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/dataset_infos.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"logiqa": {"description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n", "citation": "@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/lgw863/LogiQA-dataset", "license": "", "features": {"label": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "logiqa", "config_name": "logiqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6419852, "num_examples": 7376, "dataset_name": "logiqa"}, "test": {"name": "test", "num_bytes": 571705, "num_examples": 651, "dataset_name": "logiqa"}, "validation": {"name": "validation", "num_bytes": 562437, "num_examples": 651, "dataset_name": "logiqa"}}, "download_checksums": {"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt": {"num_bytes": 6281272, "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt": {"num_bytes": 559060, "checksum": 
"359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt": {"num_bytes": 550021, "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}}, "download_size": 7390353, "post_processing_size": null, "dataset_size": 7553994, "size_in_bytes": 14944347}}
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/logiqa/logiqa.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """LogiQA dataset."""
15
+
16
+
17
+ import datasets
18
+
19
+
20
+ _CITATION = """\
21
+ @misc{liu2020logiqa,
22
+ title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning},
23
+ author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},
24
+ year={2020},
25
+ eprint={2007.08124},
26
+ archivePrefix={arXiv},
27
+ primaryClass={cs.CL}
28
+ }
29
+ """
30
+
31
+ _DESCRIPTION = """\
32
+ LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA
33
+ instances, covering multiple types of deductive reasoning. Results show that state-
34
+ of-the-art neural models perform by far worse than human ceiling. The dataset can
35
+ also serve as a benchmark for reinvestigating logical AI under the deep learning
36
+ NLP setting.
37
+ """
38
+
39
+ _HOMEPAGE = "https://github.com/lgw863/LogiQA-dataset"
40
+
41
+ # TODO: Add the licence for the dataset here if you can find it
42
+ _LICENSE = ""
43
+
44
+ _URLS = {
45
+ "train": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt",
46
+ "validation": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt",
47
+ "test": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt",
48
+ }
49
+
50
+
51
class Logiqa(datasets.GeneratorBasedBuilder):
    """Dataset builder for LogiQA, a logical-reasoning reading-comprehension set."""

    VERSION = datasets.Version("0.0.1")

    # The builder exposes exactly one configuration, named "logiqa".
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="logiqa",
            version=VERSION,
            description="The LogiQA dataset.",
        ),
    ]

    def _info(self):
        """Return the dataset metadata together with the per-example schema."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "label": datasets.Value("string"),
                    "context": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "options": datasets.features.Sequence(datasets.Value("string")),
                }
            ),
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Fetch the raw text file of every split and declare one generator each.

        The mapping handed to ``dl_manager.download_and_extract`` is keyed by
        split name; the object it returns is indexed with the same names to
        obtain the path passed on to ``_generate_examples``.
        """
        local_files = dl_manager.download_and_extract(
            {name: _URLS[name] for name in ("train", "test", "validation")}
        )
        plan = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "validation"),
        )
        # gen_kwargs are unpacked as keyword arguments of _generate_examples.
        return [
            datasets.SplitGenerator(
                name=split_id,
                gen_kwargs={"filepath": local_files[label], "split": label},
            )
            for split_id, label in plan
        ]

    def _generate_examples(self, filepath, split):
        """Yield ``(key, example)`` pairs parsed from one LogiQA text file.

        The whole file is read, stripped, and cut into records at blank lines.
        Inside a record the first line is the label, the second the context,
        the third the question, and every remaining line is one option. The
        key is the zero-based position of the record in the file.

        ``split`` is received from ``gen_kwargs`` but is not used here.
        """

        def tidy(fragment):
            # Insert a space after every period, then trim the ends.
            return fragment.replace(".", ". ").strip()

        with open(filepath, encoding="utf-8") as handle:
            records = handle.read().strip().split("\n\n")

        for index, record in enumerate(records):
            lines = record.split("\n")
            yield index, {
                "label": lines[0].strip(),
                "context": tidy(lines[1]),
                "question": tidy(lines[2]),
                # The first two characters of each option line are dropped
                # (presumably an "A."-style marker -- confirm against the data).
                "options": [tidy(choice[2:]) for choice in lines[3:]],
            }
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__init__.py ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (177 Bytes). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/__pycache__/mutual.cpython-310.pyc ADDED
Binary file (2.96 kB). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/dataset_infos.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus 
is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}}
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/mutual/mutual.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """MuTual dataset."""
15
+
16
+
17
+ import json
18
+ import os
19
+ from pathlib import Path
20
+
21
+ import datasets
22
+
23
+
24
+ _CITATION = """\
25
+ @inproceedings{mutual,
26
+ title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
27
+ author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" ,
28
+ booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics",
29
+ year = "2020",
30
+ publisher = "Association for Computational Linguistics",
31
+ }
32
+ """
33
+
34
+ _DESCRIPTION = """\
35
+ MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is
36
+ modified from Chinese high school English listening comprehension test data.
37
+ """
38
+
39
+ _HOMEPAGE = "https://github.com/Nealcly/MuTual"
40
+
41
+ # TODO: Add the licence for the dataset here if you can find it
42
+ _LICENSE = ""
43
+
44
+ _URLS = "https://github.com/Nealcly/MuTual/archive/master.zip"
45
+
46
+
47
class Mutual(datasets.GeneratorBasedBuilder):
    """Dataset builder for MuTual, a multi-turn dialogue reasoning dataset."""

    VERSION = datasets.Version("0.0.1")

    # Two configurations are offered; the configuration name is also used as
    # a directory component when locating the data (see _split_generators).
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="mutual",
            version=VERSION,
            description="The MuTual dataset.",
        ),
        datasets.BuilderConfig(
            name="mutual_plus",
            version=VERSION,
            description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.",
        ),
    ]

    def _info(self):
        """Return the dataset metadata together with the per-example schema.

        The description is the module-level text followed by the description
        of the selected configuration.
        """
        return datasets.DatasetInfo(
            description=f"{_DESCRIPTION}\n{self.config.description}",
            features=datasets.Features(
                {
                    "answers": datasets.Value("string"),
                    "options": datasets.features.Sequence(datasets.Value("string")),
                    "article": datasets.Value("string"),
                    "id": datasets.Value("string"),
                }
            ),
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and declare one generator per split.

        Each split is read from
        ``<extracted>/MuTual-master/data/<config name>/<folder>``; the
        validation split uses the folder (and ``split`` value) ``"dev"``.
        """
        archive_root = dl_manager.download_and_extract(_URLS)
        plan = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "dev"),
        )
        # gen_kwargs are unpacked as keyword arguments of _generate_examples.
        return [
            datasets.SplitGenerator(
                name=split_id,
                gen_kwargs={
                    "basepath": os.path.join(
                        archive_root, "MuTual-master", "data", self.config.name, folder
                    ),
                    "split": folder,
                },
            )
            for split_id, folder in plan
        ]

    def _generate_examples(self, basepath, split):
        """Yield ``(key, example)`` pairs, one per non-empty ``.txt`` file.

        Entries of ``basepath`` are visited in sorted order; anything whose
        suffix is not ``.txt`` is skipped, as is any file whose content is the
        empty string. Every remaining file is parsed as a single JSON object.
        Keys are consecutive integers counting only the emitted examples.

        ``split`` is received from ``gen_kwargs`` but is not used here.
        """
        wanted_fields = ("answers", "options", "article", "id")
        next_key = 0
        for entry in sorted(Path(basepath).iterdir()):
            if entry.suffix == ".txt":
                with open(entry, "r", encoding="utf-8") as handle:
                    raw = handle.read()
                # An empty file carries no example and does not consume a key.
                if raw:
                    record = json.loads(raw)
                    yield next_key, {name: record[name] for name in wanted_fields}
                    next_key += 1
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__init__.py ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (175 Bytes). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/__pycache__/pile.cpython-310.pyc ADDED
Binary file (3.78 kB). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/dataset_infos.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"pile_arxiv": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nArXiv", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_arxiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 113218251, "num_examples": 2407, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 115653720, "num_examples": 2434, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 228871971, "size_in_bytes": 1160030307}, "pile_books3": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined 
together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBooks3", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_books3", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 150095743, "num_examples": 269, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 177359876, "num_examples": 301, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 327455619, "size_in_bytes": 1258613955}, "pile_bookcorpus2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBookCorpus2", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_bookcorpus2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 9680652, "num_examples": 28, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9776271, "num_examples": 26, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 19456923, "size_in_bytes": 950615259}, "pile_dm-mathematics": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nDM Mathematics", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_dm-mathematics", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 15756556, "num_examples": 1922, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 16453386, "num_examples": 2007, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 32209942, "size_in_bytes": 963368278}, "pile_enron": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEnron Emails", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_enron", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 1638859, "num_examples": 1010, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 1556487, "num_examples": 947, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 3195346, "size_in_bytes": 934353682}, "pile_europarl": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEuroParl", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_europarl", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8789652, "num_examples": 157, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9111791, "num_examples": 133, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17901443, "size_in_bytes": 949059779}, "pile_freelaw": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nFreeLaw", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_freelaw", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 80808693, "num_examples": 5101, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 80363814, "num_examples": 5094, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 161172507, "size_in_bytes": 1092330843}, "pile_github": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGithub", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_github", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 95654706, "num_examples": 18195, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 97179576, "num_examples": 18337, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 192834282, "size_in_bytes": 1123992618}, "pile_gutenberg": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGutenberg (PG-19)", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_gutenberg", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 30243176, "num_examples": 80, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 24685980, "num_examples": 60, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 54929156, "size_in_bytes": 986087492}, "pile_hackernews": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nHackerNews", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_hackernews", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8124255, "num_examples": 1632, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9803822, "num_examples": 1619, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17928077, "size_in_bytes": 949086413}, "pile_nih-exporter": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nNIH ExPorter", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_nih-exporter", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 3928804, "num_examples": 1884, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 3927967, "num_examples": 1825, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 7856771, "size_in_bytes": 939015107}, "pile_opensubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenSubtitles", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_opensubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 21008996, "num_examples": 642, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 19622904, "num_examples": 621, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 40631900, "size_in_bytes": 971790236}, "pile_openwebtext2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenWebText2", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_openwebtext2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 128624303, "num_examples": 32925, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 131554302, "num_examples": 33400, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 260178605, "size_in_bytes": 1191336941}, "pile_philpapers": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPhilPapers", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_philpapers", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5090158, "num_examples": 68, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 6499078, "num_examples": 64, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 11589236, "size_in_bytes": 942747572}, "pile_pile-cc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPile-CC", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pile-cc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 235004043, "num_examples": 52790, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 233535650, "num_examples": 52792, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 468539693, "size_in_bytes": 1399698029}, "pile_pubmed-abstracts": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Abstracts", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-abstracts", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 39908950, "num_examples": 29895, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 40008336, "num_examples": 29871, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 79917286, "size_in_bytes": 1011075622}, "pile_pubmed-central": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Central", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-central", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 187251519, "num_examples": 5911, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 184791818, "num_examples": 5977, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 372043337, "size_in_bytes": 1303201673}, "pile_stackexchange": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nStackExchange", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_stackexchange", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 66441557, "num_examples": 30378, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 66011397, "num_examples": 29950, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 132452954, "size_in_bytes": 1063611290}, "pile_upsto": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUSPTO Backgrounds", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_upsto", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 47345405, "num_examples": 11415, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 48122320, "num_examples": 11387, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 95467725, "size_in_bytes": 1026626061}, "pile_ubuntu-irc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUbuntu IRC", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_ubuntu-irc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5694218, "num_examples": 22, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 7410104, "num_examples": 21, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 13104322, "size_in_bytes": 944262658}, "pile_wikipedia": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nWikipedia (en)", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_wikipedia", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 52166968, "num_examples": 17511, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 53186137, "num_examples": 17478, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 105353105, "size_in_bytes": 1036511441}, "pile_youtubesubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. 
To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nYoutubeSubtitles", "citation": "@article{pile,\n title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n journal={arXiv preprint arXiv:2101.00027},\n year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_youtubesubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 7377448, "num_examples": 342, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 8937546, "num_examples": 326, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 16314994, "size_in_bytes": 947473330}}
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/pile/pile.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Pile dataset."""
15
+
16
+
17
+ import json
18
+
19
+ import datasets
20
+
21
+
22
+ _CITATION = """\
23
+ @article{pile,
24
+ title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},
25
+ author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},
26
+ journal={arXiv preprint arXiv:2101.00027},
27
+ year={2020}
28
+ }
29
+ """
30
+
31
+ _DESCRIPTION = """\
32
+ The Pile is a 825 GiB diverse, open source language modeling data set that consists
33
+ of 22 smaller, high-quality datasets combined together. To score well on Pile
34
+ BPB (bits per byte), a model must be able to understand many disparate domains
35
+ including books, github repositories, webpages, chat logs, and medical, physics,
36
+ math, computer science, and philosophy papers.
37
+ """
38
+
39
+ _HOMEPAGE = "https://pile.eleuther.ai/"
40
+
41
+ # TODO: Add the licence for the dataset here if you can find it
42
+ _LICENSE = ""
43
+
44
+ _URLS = {
45
+ "validation": "https://the-eye.eu/public/AI/pile/val.jsonl.zst",
46
+ "test": "https://the-eye.eu/public/AI/pile/test.jsonl.zst",
47
+ }
48
+
49
+ _NAMES = {
50
+ "pile_arxiv": "ArXiv",
51
+ "pile_books3": "Books3",
52
+ "pile_bookcorpus2": "BookCorpus2",
53
+ "pile_dm-mathematics": "DM Mathematics",
54
+ "pile_enron": "Enron Emails",
55
+ "pile_europarl": "EuroParl",
56
+ "pile_freelaw": "FreeLaw",
57
+ "pile_github": "Github",
58
+ "pile_gutenberg": "Gutenberg (PG-19)",
59
+ "pile_hackernews": "HackerNews",
60
+ "pile_nih-exporter": "NIH ExPorter",
61
+ "pile_opensubtitles": "OpenSubtitles",
62
+ "pile_openwebtext2": "OpenWebText2",
63
+ "pile_philpapers": "PhilPapers",
64
+ "pile_pile-cc": "Pile-CC",
65
+ "pile_pubmed-abstracts": "PubMed Abstracts",
66
+ "pile_pubmed-central": "PubMed Central",
67
+ "pile_stackexchange": "StackExchange",
68
+ "pile_upsto": "USPTO Backgrounds",
69
+ "pile_ubuntu-irc": "Ubuntu IRC",
70
+ "pile_wikipedia": "Wikipedia (en)",
71
+ "pile_youtubesubtitles": "YoutubeSubtitles",
72
+ }
73
+
74
+
75
class Pile(datasets.GeneratorBasedBuilder):
    """Dataset builder that exposes each component of the Pile as its own config.

    One builder config is declared per key of ``_NAMES``. When examples are
    generated, a record is kept only if its ``meta.pile_set_name`` equals the
    ``_NAMES`` value registered for the active config.
    """

    VERSION = datasets.Version("0.0.1")

    # Names bound in a class body are only visible to the outermost iterable of
    # a comprehension written there, so VERSION is threaded in through ``zip``
    # rather than referenced from inside the comprehension body.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name=config_name,
            version=config_version,
            description=subset_label,
        )
        for (config_name, subset_label), config_version in zip(
            _NAMES.items(), [VERSION] * len(_NAMES)
        )
    ]

    def _info(self):
        """Return the dataset metadata; every example has one string field, ``text``."""
        text_only = datasets.Features({"text": datasets.Value("string")})
        # The config description (the subset label) is appended to the shared
        # description so that each config reports which subset it holds.
        full_description = f"{_DESCRIPTION}\n{self.config.description}"
        return datasets.DatasetInfo(
            description=full_description,
            features=text_only,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Fetch the validation and test files and declare the two splits.

        The ``gen_kwargs`` of each split are forwarded to
        ``_generate_examples`` as keyword arguments.
        """
        urls = {split_name: _URLS[split_name] for split_name in ("validation", "test")}
        local_paths = dl_manager.download_and_extract(urls)

        test_split = datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs={"filepath": local_paths["test"], "split": "test"},
        )
        validation_split = datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            gen_kwargs={
                "filepath": local_paths["validation"],
                "split": "validation",
            },
        )
        return [test_split, validation_split]

    def _generate_examples(self, filepath, split):
        """Yield ``(key, example)`` pairs for the records of the active subset.

        ``filepath`` is read as UTF-8 text with one JSON document per line.
        The key is the zero-based line index within the file, so the keys of a
        single config are generally not contiguous. ``split`` is accepted
        because it is part of ``gen_kwargs`` but is not used here.
        """
        with open(filepath, encoding="utf-8") as handle:
            for line_index, raw_line in enumerate(handle):
                record = json.loads(raw_line)
                subset = record["meta"]["pile_set_name"]
                if subset != _NAMES[self.config.name]:
                    # Record belongs to a different Pile component.
                    continue
                yield line_index, {"text": record["text"]}
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/quac/__init__.py ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/quac/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (175 Bytes). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/quac/__pycache__/quac.cpython-310.pyc ADDED
Binary file (3.15 kB). View file
 
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/quac/dataset_infos.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, 
"size_in_bytes": 310114427}}
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/quac/quac.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # TODO: Address all TODOs and remove all explanatory comments
15
+ """QuAC dataset."""
16
+
17
+
18
+ import json
19
+
20
+ import datasets
21
+
22
+
23
+ _CITATION = """\
24
+ @article{choi2018quac,
25
+ title={Quac: Question answering in context},
26
+ author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},
27
+ journal={arXiv preprint arXiv:1808.07036},
28
+ year={2018}
29
+ }
30
+ """
31
+
32
+ _DESCRIPTION = """\
33
+ Question Answering in Context (QuAC) is a dataset for modeling, understanding, and
34
+ participating in information seeking dialog. Data instances consist of an interactive
35
+ dialog between two crowd workers: (1) a student who poses a sequence of freeform
36
+ questions to learn as much as possible about a hidden Wikipedia text, and (2)
37
+ a teacher who answers the questions by providing short excerpts (spans) from the text.
38
+ """
39
+
40
+ _HOMEPAGE = "https://quac.ai/"
41
+
42
+ # TODO: Add the licence for the dataset here if you can find it
43
+ _LICENSE = ""
44
+
45
+ _URLS = {
46
+ "train": "https://s3.amazonaws.com/my89public/quac/train_v0.2.json",
47
+ "validation": "https://s3.amazonaws.com/my89public/quac/val_v0.2.json",
48
+ }
49
+
50
+
51
class Quac(datasets.GeneratorBasedBuilder):
    """Dataset builder for Question Answering in Context (QuAC).

    Each example pairs one question and one answer with the title, section
    title and paragraph text of the entry it was asked about.
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="quac", version=VERSION, description="The QuAC dataset"),
    ]

    def _info(self):
        """Return the dataset metadata; all five example fields are strings."""
        string_fields = datasets.Features(
            {
                "title": datasets.Value("string"),
                "section_title": datasets.Value("string"),
                "paragraph": datasets.Value("string"),
                "question": datasets.Value("string"),
                "answer": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=string_fields,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Fetch the train and validation files and declare the two splits.

        The ``gen_kwargs`` of each split are forwarded to
        ``_generate_examples`` as keyword arguments.
        """
        urls = {split_name: _URLS[split_name] for split_name in ("train", "validation")}
        local_paths = dl_manager.download_and_extract(urls)

        train_split = datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={
                "filepath": local_paths["train"],
                "split": "train",
            },
        )
        validation_split = datasets.SplitGenerator(
            name=datasets.Split.VALIDATION,
            gen_kwargs={"filepath": local_paths["validation"], "split": "validation"},
        )
        return [train_split, validation_split]

    def _generate_examples(self, filepath, split):
        """Yield ``(key, example)`` pairs, one per question in the file.

        ``filepath`` is read as a single UTF-8 JSON document whose ``"data"``
        entry is iterated. For every entry only the first paragraph is used,
        and for every question only the first listed answer. Keys are a
        running counter across the whole file. ``split`` is accepted because
        it is part of ``gen_kwargs`` but is not used here.
        """
        with open(filepath, encoding="utf-8") as handle:
            entries = json.load(handle)["data"]
            example_id = 0
            for entry in entries:
                first_paragraph = entry["paragraphs"][0]
                # The literal marker "CANNOTANSWER" is removed from the
                # paragraph text wherever it occurs.
                context = first_paragraph["context"].replace("CANNOTANSWER", "")
                # Built eagerly so that a malformed question fails before any
                # example of this entry is yielded.
                question_answer_pairs = [
                    (item["question"], item["answers"][0]["text"])
                    for item in first_paragraph["qas"]
                ]
                for question_text, answer_text in question_answer_pairs:
                    example = {
                        "title": entry["title"],
                        "section_title": entry["section_title"],
                        "paragraph": context,
                        "question": question_text,
                        "answer": answer_text,
                    }
                    yield example_id, example
                    example_id += 1
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/sat_analogies/__init__.py ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/lm_eval/datasets/sat_analogies/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (184 Bytes). View file