Spaces:

mideind
/

icelandic-llm-leaderboard

Running

gardarjuto committed on Jul 15

Commit

fa8bb65

•

1 Parent(s): 7fdb5f5

Add WikiQA

Files changed (1) hide show

src/about.py CHANGED Viewed

@@ -17,6 +17,7 @@ class Tasks(Enum):
     task2 = Task("icelandic_inflection_all", "exact_match,get-answer", "Inflection (1-shot)")
     task5 = Task("icelandic_belebele", "exact_match,get-answer", "Belebele (IS)")
     task6 = Task("icelandic_arc_challenge", "exact_match,get-answer", "ARC-Challenge-IS")
 # ---------------------------------------------------
@@ -64,5 +65,11 @@ This is the Icelandic subset (900 examples) of the Belebele benchmark, a multipl
 A machine-translated version of the ARC-Challenge multiple-choice question-answering dataset. For this benchmark, we use the test set which contains 1.23k examples.
 - Link to dataset: https://huggingface.co/datasets/mideind/icelandic-arc-challenge
 """

     task2 = Task("icelandic_inflection_all", "exact_match,get-answer", "Inflection (1-shot)")
     task5 = Task("icelandic_belebele", "exact_match,get-answer", "Belebele (IS)")
     task6 = Task("icelandic_arc_challenge", "exact_match,get-answer", "ARC-Challenge-IS")
+    task7 = Task("icelandic_wiki_qa", "lm_judge_score,get-answer", "WikiQA-IS")
 # ---------------------------------------------------
 A machine-translated version of the ARC-Challenge multiple-choice question-answering dataset. For this benchmark, we use the test set which contains 1.23k examples.
 - Link to dataset: https://huggingface.co/datasets/mideind/icelandic-arc-challenge
+### WikiQA-IS
+The Icelandic WikiQA dataset is a collection of 1.1k question-answer pairs from the Icelandic Wikipedia, meant to evaluate models' knowledge of Icelandic culture and history.
+They were collected by making GPT-4o generate questions and answers
+given Icelandic Wikipedia articles as context. All examples were then manually verified and corrected where necessary. For evaluation, we prompt GPT-4o to
+compare the generated answer to the original answer for semantic similarity and rate the answer on the following scale: (0, "poor"), (1, "fair"), (2, "excellent").
+- Link to dataset: https://huggingface.co/datasets/mideind/icelandic_wiki_qa
 """