Spaces:
Build error
Build error
# coding=utf-8 | |
import datasets | |
import requests | |
from promptsource.templates import INCLUDED_USERS | |
def removeHyphen(example):
    """Return a copy of *example* whose keys have every "-" replaced by "_".

    Values are carried over unchanged; keys without a hyphen are untouched
    (str.replace is the identity for them).
    """
    return {key.replace("-", "_"): value for key, value in example.items()}
def renameDatasetColumn(dataset):
    """Rename every hyphenated column of *dataset* to its underscore form.

    `rename_column` returns a new dataset, so we rebind on each rename and
    return the final dataset. Column names are read from the incoming
    dataset before any renaming happens.
    """
    for name in dataset.column_names:
        if "-" not in name:
            continue
        dataset = dataset.rename_column(name, name.replace("-", "_"))
    return dataset
# | |
# Helper functions for datasets library | |
# | |
def get_dataset_builder(path, conf=None):
    """Get a dataset builder from name and conf.

    Resolves the dataset script via ``datasets.load.prepare_module`` (which
    yields the module path at index 0 and its hash at index 1), imports the
    builder class, and instantiates it — passing ``name=conf`` only when a
    config was requested.
    """
    module_info = datasets.load.prepare_module(path, dataset=True)
    builder_cls = datasets.load.import_main_class(module_info[0], dataset=True)
    builder_kwargs = {"cache_dir": None, "hash": module_info[1]}
    if conf:
        builder_kwargs["name"] = conf
    return builder_cls(**builder_kwargs)
def get_dataset(path, conf=None):
    """Get a dataset from name and conf.

    Falls back to ``datasets.load_dataset`` when the dataset needs manual
    download instructions or its size is unknown; otherwise prepares the
    builder locally and returns its dataset.
    """
    builder = get_dataset_builder(path, conf)
    needs_manual_download = builder.manual_download_instructions is not None
    size_unknown = builder.info.size_in_bytes is None
    if needs_manual_download or size_unknown:
        return datasets.load_dataset(path, conf)
    builder.download_and_prepare()
    return builder.as_dataset()
def get_dataset_confs(path):
    """Get the list of confs for a dataset.

    Returns the builder's ``BUILDER_CONFIGS`` only when there is more than
    one config; a dataset with zero or one config yields an empty list.
    """
    script_path = datasets.load.prepare_module(path, dataset=True)[0]
    # Import the dataset builder class from the processing script.
    builder_cls = datasets.load.import_main_class(script_path, dataset=True)
    confs = builder_cls.BUILDER_CONFIGS
    return confs if confs and len(confs) > 1 else []
def render_features(features):
    """Recursively render the dataset schema (i.e. the fields) as plain objects."""
    if isinstance(features, dict):
        # A mapping of field name -> sub-schema: recurse into each field.
        return {field: render_features(schema) for field, schema in features.items()}
    elif isinstance(features, datasets.features.ClassLabel):
        # Categorical field: shown as its list of label names.
        return features.names
    elif isinstance(features, datasets.features.Value):
        # Scalar field: shown as its dtype string.
        return features.dtype
    elif isinstance(features, datasets.features.Sequence):
        # Sequence field: element schema rendered under the "[]" marker.
        return {"[]": render_features(features.feature)}
    # Anything unrecognized is rendered as-is.
    return features
# | |
# Loads dataset information | |
# | |
def filter_english_datasets():
    """
    Filter English datasets based on language tags in metadata.

    Also includes the datasets of any users listed in INCLUDED_USERS.

    Returns:
        Sorted list of dataset ids from the Hugging Face Hub.

    Raises:
        requests.HTTPError: if the Hub API responds with an error status.
        requests.Timeout: if the Hub API does not answer within the timeout.
    """
    english_datasets = []
    # A timeout guards against hanging forever on a stalled connection;
    # raise_for_status surfaces HTTP errors instead of a confusing
    # JSON-decode failure on an error page.
    response = requests.get("https://huggingface.co/api/datasets?full=true", timeout=60)
    response.raise_for_status()
    for dataset in response.json():
        dataset_name = dataset["id"]

        # Community datasets are namespaced "<user>/<name>"; keep them only
        # when the user is whitelisted, and never apply the language filter.
        is_community_dataset = "/" in dataset_name
        if is_community_dataset:
            user = dataset_name.split("/")[0]
            if user in INCLUDED_USERS:
                english_datasets.append(dataset_name)
            continue

        # Canonical datasets: keep only those tagged as English.
        # `or {}` also covers a present-but-null card_data, which the
        # previous `in` checks would have crashed on.
        metadata = dataset.get("card_data") or {}
        languages = metadata.get("languages", [])
        if "en" in languages or "en-US" in languages:
            english_datasets.append(dataset_name)
    return sorted(english_datasets)
def list_datasets(template_collection, _state):
    """Get all the datasets to work with, sorted case-insensitively.

    The two parameters are accepted for interface compatibility and unused.
    filter_english_datasets() already returns a lexicographically sorted
    list, so one stable case-insensitive sort yields the same ordering the
    original two-step sort produced.
    """
    return sorted(filter_english_datasets(), key=str.lower)