File size: 2,333 Bytes
14e4843
 
 
 
 
 
 
 
d6d7ec6
14e4843
 
 
 
 
d6d7ec6
14e4843
d6d7ec6
14e4843
 
 
 
d6d7ec6
14e4843
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6d7ec6
14e4843
 
 
d6d7ec6
 
14e4843
 
d6d7ec6
14e4843
d6d7ec6
 
 
 
14e4843
d6d7ec6
14e4843
d6d7ec6
 
 
 
14e4843
d6d7ec6
14e4843
d6d7ec6
 
 
14e4843
 
 
 
 
d6d7ec6
14e4843
d6d7ec6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python3

import random
import requests

from datasets import load_dataset, Dataset, DatasetDict


# Hub repository that receives every configuration pushed by this script.
path = "pminervini/HaluEval"

# Query the datasets-server for the configurations the repository already exposes.
API_URL = f"https://datasets-server.huggingface.co/splits?dataset={path}"
# Bound the request: without a timeout an unresponsive endpoint would hang the script forever.
response = requests.get(API_URL, timeout=30)
res_json = response.json()

gold_splits = {"dialogue", "qa", "summarization", "general"}

# A payload without a "splits" key is treated as "no configurations available".
available_splits = {split["config"] for split in res_json["splits"]} if "splits" in res_json else set()

# Maps each gold split name to the DatasetDict loaded from its local JSON file.
name_to_ds = dict()

# Iterate in sorted order so the upload sequence is the same on every run
# (iteration order of a set of strings varies between interpreter runs).
for name in sorted(gold_splits):
    ds = load_dataset("json", data_files={"data": f"data/{name}_data.json"})
    name_to_ds[name] = ds
    # NOTE: available_splits is not consulted; every configuration is pushed unconditionally.
    # if name not in available_splits:
    ds.push_to_hub(path, config_name=name)


def list_to_dict(lst: list) -> dict:
    """Transpose a list of row mappings into a mapping of column lists.

    Every key seen in any row becomes a key of the result, mapped to the
    values found under that key, in row order. A row that lacks a key adds
    nothing to that key's list, so column lengths can differ when rows do
    not all share the same keys.
    """
    columns = {}
    for row in lst:
        for key, value in row.items():
            # setdefault creates the column on first sight of the key.
            columns.setdefault(key, []).append(value)
    return columns


def _build_sample(task, record, hallucinated):
    """Return the sample row for one source record, or None for an unknown task.

    The row copies the task's context fields from the record and takes its
    answer/response/summary from either the "hallucinated_*" or the "right_*"
    field, labelling the choice "yes" or "no" under "hallucination".
    """
    variant = "hallucinated" if hallucinated else "right"
    label = "yes" if hallucinated else "no"
    if task == "qa":
        return {
            "knowledge": record["knowledge"],
            "question": record["question"],
            "answer": record[f"{variant}_answer"],
            "hallucination": label,
        }
    if task == "dialogue":
        return {
            "knowledge": record["knowledge"],
            "dialogue_history": record["dialogue_history"],
            "response": record[f"{variant}_response"],
            "hallucination": label,
        }
    if task == "summarization":
        return {
            "document": record["document"],
            "summary": record[f"{variant}_summary"],
            "hallucination": label,
        }
    return None


# Build a "<name>_samples" configuration for every gold split except "general":
# each source record yields one row whose text is randomly drawn from the
# hallucinated or the right variant.
for name in gold_splits - {"general"}:
    # Re-seed per split so each split's draws do not depend on the others.
    random.seed(42)
    samples = []

    for entry in name_to_ds[name]["data"]:
        # One random draw per record decides which variant is kept.
        sample = _build_sample(name, entry, random.random() > 0.5)
        assert sample is not None
        samples.append(sample)

    sample_columns = list_to_dict(samples)
    samples_dsd = DatasetDict({"data": Dataset.from_dict(sample_columns)})

    samples_dsd.push_to_hub(path, config_name=f"{name}_samples")