pminervini commited on
Commit
7e267bf
1 Parent(s): 88838f6
src/backend/envs.py CHANGED
@@ -37,6 +37,9 @@ class Tasks(Enum):
37
 
38
  task10 = Task("memo-trap", "acc", "memo-trap", 0)
39
 
 
 
 
40
  # NUM_FEWSHOT = 64 # Change with your few shot
41
 
42
 
 
37
 
38
  task10 = Task("memo-trap", "acc", "memo-trap", 0)
39
 
40
+ task11 = Task("nq8", "em", "NQ Open 8", 8)
41
+ task12 = Task("tqa8", "em", "TriviaQA 8", 8)
42
+
43
  # NUM_FEWSHOT = 64 # Change with your few shot
44
 
45
 
src/backend/tasks/nq8/README.md ADDED
File without changes
src/backend/tasks/nq8/nq8.yaml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: nq8
2
+ dataset_path: nq_open
3
+ output_type: generate_until
4
+ training_split: train
5
+ validation_split: validation
6
+ description: "Answer these questions:\n"
7
+ doc_to_text: "Q: {{question}}?\nA:"
8
+ doc_to_target: "{{answer}}" # TODO: should be multi-target
9
+ fewshot_delimiter: "\n"
10
+ generation_kwargs:
11
+ until:
12
+ - "\n"
13
+ - "."
14
+ - ","
15
+ do_sample: false
16
+ temperature: 0.0
17
+ filter_list:
18
+ - name: remove_whitespace
19
+ filter:
20
+ - function: remove_whitespace
21
+ - function: take_first
22
+ target_delimiter: " "
23
+ metric_list:
24
+ - metric: exact_match
25
+ aggregation: mean
26
+ higher_is_better: true
27
+ ignore_case: true
28
+ ignore_punctuation: true
29
+ regexes_to_ignore:
30
+ - "\\b(?:an|a|the)\\b"
31
+ metadata:
32
+   version: 0.0
src/backend/tasks/tqa8/README.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Trivia QA
2
+
3
+ ### Paper
4
+
5
+ Title: `TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension`
6
+ Abstract: https://arxiv.org/abs/1705.03551
7
+
8
+ TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence
9
+ triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts
10
+ and independently gathered evidence documents, six per question on average, that provide
11
+ high quality distant supervision for answering the questions.
12
+
13
+ Homepage: https://nlp.cs.washington.edu/triviaqa/
14
+
15
+
16
+ ### Citation
17
+
18
+ ```
19
+ @InProceedings{JoshiTriviaQA2017,
20
+ author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
21
+ title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
22
+ booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
23
+ month = {July},
24
+ year = {2017},
25
+ address = {Vancouver, Canada},
26
+ publisher = {Association for Computational Linguistics},
27
+ }
28
+ ```
29
+
30
+ ### Groups and Tasks
31
+
32
+ #### Groups
33
+
34
+ * Not part of a group yet.
35
+
36
+ #### Tasks
37
+
38
+ * `tqa8`: `Generate an answer based on the question.`
39
+
40
+ ### Checklist
41
+
42
+ For adding novel benchmarks/datasets to the library:
43
+ * [ ] Is the task an existing benchmark in the literature?
44
+ * [ ] Have you referenced the original paper that introduced the task?
45
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
46
+
47
+
48
+ If other tasks on this dataset are already supported:
49
+ * [ ] Is the "Main" variant of this task clearly denoted?
50
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
51
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
src/backend/tasks/tqa8/tqa8.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: tqa8
2
+ dataset_path: trivia_qa
3
+ dataset_name: rc.nocontext
4
+ output_type: generate_until
5
+ training_split: train
6
+ validation_split: validation
7
+ doc_to_text: "Question: {{question}}?\nAnswer:"
8
+ doc_to_target: "{{answer.aliases}}"
9
+ should_decontaminate: true
10
+ doc_to_decontamination_query: question
11
+ generation_kwargs:
12
+ until:
13
+ - "\n"
14
+ - "."
15
+ - ","
16
+ do_sample: false
17
+ temperature: 0.0
18
+ filter_list:
19
+ - name: remove_whitespace
20
+ filter:
21
+ - function: remove_whitespace
22
+ - function: take_first
23
+ target_delimiter: " "
24
+ metric_list:
25
+ - metric: exact_match
26
+ aggregation: mean
27
+ higher_is_better: true
28
+ ignore_case: true
29
+ ignore_punctuation: true
30
+ metadata:
31
+   version: 2.0