Upload tanadata.py
Browse files- tanadata.py +57 -0
tanadata.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import datasets
|
| 3 |
+
|
| 4 |
+
# Dataset metadata placeholders: replace the description and citation below with more detailed information.
|
| 5 |
+
_DESCRIPTION = """
|
| 6 |
+
TanaData is a custom dataset for instruction-response tasks.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
_CITATION = """
|
| 10 |
+
@misc{tanadata2025,
|
| 11 |
+
title={TanaData Dataset},
|
| 12 |
+
year={2025},
|
| 13 |
+
note={Custom dataset hosted on Hugging Face}
|
| 14 |
+
}
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
class TanaData(datasets.GeneratorBasedBuilder):
    """Dataset builder for TanaData instruction/input/output records.

    Fetches one JSON file through the download manager and exposes it as a
    single ``train`` split. Every example carries the string fields listed
    in ``_FIELD_NAMES``.
    """

    VERSION = datasets.Version("1.0.0")

    # Single source of truth for the example schema; used both to declare the
    # features and to project each raw JSON record onto exactly these keys.
    _FIELD_NAMES = ("instruction", "input", "output")

    def _info(self):
        """Return the dataset metadata together with the feature schema."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {name: datasets.Value("string") for name in self._FIELD_NAMES}
            ),
            supervised_keys=None,
            homepage="https://huggingface.co/mdevoz/tanadata",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Fetch the JSON data file and declare the single train split.

        Args:
            dl_manager: Download manager supplied by the ``datasets`` library.

        Returns:
            A one-element list with the ``train`` split generator, whose
            ``filepath`` keyword argument is the local path returned by
            ``dl_manager.download_and_extract``.
        """
        # This URL points to the JSON file in the repository.
        file_path = dl_manager.download_and_extract(
            "https://huggingface.co/mdevoz/tanadata/resolve/main/tana_z.json"
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": file_path},
            )
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs read from the JSON file.

        The file must contain a JSON array of objects. Each object is
        projected onto ``_FIELD_NAMES``: a missing field becomes ``None`` and
        any extra key is dropped, so every yielded example has exactly the
        keys declared in ``_info``.

        Args:
            filepath: Local path of the JSON file to read (UTF-8).

        Yields:
            Tuples of the record's zero-based position in the array and a
            dict holding the declared fields.

        Raises:
            ValueError: If the top-level JSON value is not a list, or if a
                record is not a JSON object.
        """
        # Load everything first so the file handle is closed before the
        # generator starts handing control back to the caller.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)

        if not isinstance(data, list):
            raise ValueError(
                f"Expected a JSON array of examples in {filepath!r}, "
                f"got {type(data).__name__}"
            )

        for idx, example in enumerate(data):
            if not isinstance(example, dict):
                raise ValueError(
                    f"Example {idx} in {filepath!r} is not a JSON object "
                    f"(got {type(example).__name__})"
                )
            yield idx, {name: example.get(name) for name in self._FIELD_NAMES}
|
| 52 |
+
|
| 53 |
+
# For testing, you can uncomment the following lines locally:
|
| 54 |
+
# if __name__ == "__main__":
|
| 55 |
+
# from datasets import load_dataset
|
| 56 |
+
# dataset = load_dataset(__file__, name="tanadata")
|
| 57 |
+
# print(dataset)
|