gardarjuto
committed on
Commit
•
fa8bb65
1
Parent(s):
7fdb5f5
Add WikiQA
Browse files- src/about.py +7 -0
src/about.py
CHANGED
@@ -17,6 +17,7 @@ class Tasks(Enum):
|
|
17 |
task2 = Task("icelandic_inflection_all", "exact_match,get-answer", "Inflection (1-shot)")
|
18 |
task5 = Task("icelandic_belebele", "exact_match,get-answer", "Belebele (IS)")
|
19 |
task6 = Task("icelandic_arc_challenge", "exact_match,get-answer", "ARC-Challenge-IS")
|
|
|
20 |
|
21 |
# ---------------------------------------------------
|
22 |
|
@@ -64,5 +65,11 @@ This is the Icelandic subset (900 examples) of the Belebele benchmark, a multipl
|
|
64 |
A machine-translated version of the ARC-Challenge multiple-choice question-answering dataset. For this benchmark, we use the test set which contains 1.23k examples.
|
65 |
- Link to dataset: https://huggingface.co/datasets/mideind/icelandic-arc-challenge
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
"""
|
68 |
|
|
|
17 |
task2 = Task("icelandic_inflection_all", "exact_match,get-answer", "Inflection (1-shot)")
|
18 |
task5 = Task("icelandic_belebele", "exact_match,get-answer", "Belebele (IS)")
|
19 |
task6 = Task("icelandic_arc_challenge", "exact_match,get-answer", "ARC-Challenge-IS")
|
20 |
+
task7 = Task("icelandic_wiki_qa", "lm_judge_score,get-answer", "WikiQA-IS")
|
21 |
|
22 |
# ---------------------------------------------------
|
23 |
|
|
|
65 |
A machine-translated version of the ARC-Challenge multiple-choice question-answering dataset. For this benchmark, we use the test set which contains 1.23k examples.
|
66 |
- Link to dataset: https://huggingface.co/datasets/mideind/icelandic-arc-challenge
|
67 |
|
68 |
+
### WikiQA-IS
|
69 |
+
The Icelandic WikiQA dataset is a collection of 1.1k question-answer pairs from the Icelandic Wikipedia, meant to evaluate models' knowledge of Icelandic culture and history.
|
70 |
+
They were collected by making GPT-4o generate questions and answers
|
71 |
+
given Icelandic Wikipedia articles as context. All examples were then manually verified and corrected where necessary. For evaluation, we prompt GPT-4o to
|
72 |
+
compare the generated answer to the original answer for semantic similarity and rate the answer on the following scale: (0, "poor"), (1, "fair"), (2, "excellent").
|
73 |
+
- Link to dataset: https://huggingface.co/datasets/mideind/icelandic_wiki_qa
|
74 |
"""
|
75 |
|