Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| import random | |
| import requests | |
| from datasets import load_dataset, Dataset, DatasetDict | |
# Target dataset repository on the Hugging Face Hub.
path = 'pminervini/HaluEval'

# Query the datasets-server "splits" endpoint for `path`.
API_URL = f"https://datasets-server.huggingface.co/splits?dataset={path}"
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(API_URL, timeout=30)
res_json = response.json()

# Task names; each one is loaded from data/<name>_data.json below.
gold_splits = {'dialogue', 'qa', 'summarization', 'general'}
# Config names reported by the server; a reply without a 'splits' key
# yields an empty set.
available_splits = {split['config'] for split in res_json['splits']} if 'splits' in res_json else set()

# Loaded datasets keyed by task name, so later steps can reuse them.
name_to_ds = dict()

# Sorted so the upload order is identical on every run (set order is not).
for name in sorted(gold_splits):
    ds = load_dataset("json", data_files={'data': f"data/{name}_data.json"})
    name_to_ds[name] = ds
    # NOTE: the `name not in available_splits` guard is disabled, so every
    # config is pushed again on each run.
    ds.push_to_hub(path, config_name=name)
def list_to_dict(lst: list) -> dict:
    """Turn a list of row dicts into a dict of column lists.

    Args:
        lst: Iterable of mappings, one per row.

    Returns:
        A dict mapping every key seen in ``lst`` to the list of values found
        under that key, in row order. Keys keep the order in which they were
        first encountered. An empty ``lst`` yields an empty dict.

    Note:
        Rows are not padded: if a key is missing from some rows, its column
        is simply shorter than the others.
    """
    res = dict()
    for entry in lst:
        for k, v in entry.items():
            # setdefault creates the column on first sight of the key.
            res.setdefault(k, []).append(v)
    return res
# Per-task layout of a sampled record: the fields copied verbatim from the
# source entry, and the output field that receives either the right or the
# hallucinated text (read from `right_<field>` / `hallucinated_<field>`).
SAMPLE_LAYOUTS = {
    'qa': (('knowledge', 'question'), 'answer'),
    'dialogue': (('knowledge', 'dialogue_history'), 'response'),
    'summarization': (('document',), 'summary'),
}

# 'general' is excluded from sampling. Sorted so the upload order is
# identical on every run (set order is not).
for name in sorted(gold_splits - {'general'}):
    if name not in SAMPLE_LAYOUTS:
        # Explicit error rather than `assert`, which is stripped under -O.
        raise ValueError(f'no sample layout defined for {name!r}')
    copied_fields, target_field = SAMPLE_LAYOUTS[name]

    # Re-seed for every task so each task's labels are reproducible and do
    # not depend on which tasks were processed before it.
    random.seed(42)
    ds = name_to_ds[name]
    new_entry_lst = []

    for entry in ds['data']:
        # Exactly one draw per entry decides which variant is kept.
        is_hallucinated = random.random() > 0.5
        variant = 'hallucinated' if is_hallucinated else 'right'

        # Insertion order defines the column order of the pushed dataset:
        # copied fields, then the sampled field, then the label.
        new_entry = {field: entry[field] for field in copied_fields}
        new_entry[target_field] = entry[f'{variant}_{target_field}']
        new_entry['hallucination'] = 'yes' if is_hallucinated else 'no'
        new_entry_lst.append(new_entry)

    # Convert rows to columns and publish as the "<name>_samples" config
    # with a single 'data' split.
    new_ds_map = list_to_dict(new_entry_lst)
    new_ds = Dataset.from_dict(new_ds_map)
    new_dsd = DatasetDict({'data': new_ds})
    new_dsd.push_to_hub(path, config_name=f'{name}_samples')