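"""Fetch dataset metadata from the Hugging Face Hub and cache a daily snapshot.

Lists every Hub dataset that declares `dataset_info` in its card data,
flattens each config into one record, and writes the result to dated
Parquet and JSON Lines files.
"""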
import os
from datetime import datetime
from typing import Any, Dict, List, Optional

import pandas as pd
from dotenv import load_dotenv
from httpx import AsyncClient, Client
from huggingface_hub import HfApi
from huggingface_hub.utils import logging
from tqdm.auto import tqdm

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
USER_AGENT = os.getenv("USER_AGENT")
assert (
    USER_AGENT is not None
), "You need to set USER_AGENT in your environment variables"

logger = logging.get_logger(__name__)

headers = {
    "authorization": f"Bearer {HF_TOKEN}",
    "user-agent": USER_AGENT,
}
client = Client(headers=headers)
async_client = AsyncClient(headers=headers)
api = HfApi(token=HF_TOKEN)


def has_card_data(dataset) -> bool:
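    """Return True if the dataset object exposes parsed card data."""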
    return hasattr(dataset, "card_data")


def check_dataset_has_dataset_info(dataset) -> bool:
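    """Return True if the dataset card declares a non-empty `dataset_info`."""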
    return bool(
        has_card_data(dataset)
        and hasattr(dataset.card_data, "dataset_info")
        and dataset.card_data.dataset_info is not None
    )


def parse_single_config_dataset(data: Dict[str, Any]) -> Dict[str, Any]:
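    """Extract the config name, column names, and raw features from one config's `dataset_info` dict."""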
    config_name = data.get("config_name", "default")
    features = data.get("features", [])
    column_names = [feature.get("name") for feature in features]
    return {
        "config_name": config_name,
        "column_names": column_names,
        "features": features,
    }


def parse_multiple_config_dataset(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
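    """Parse every config entry in a multi-config `dataset_info` list."""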
    return [parse_single_config_dataset(d) for d in data]


def parse_dataset(dataset) -> Dict[str, Any]:
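    """Flatten the Hub-level metadata of a dataset into a single dict."""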
    hub_id = dataset.id
    likes = dataset.likes
    downloads = dataset.downloads
    tags = dataset.tags
    created_at = dataset.created_at
    last_modified = dataset.last_modified
    license = dataset.card_data.license
    language = dataset.card_data.language
    return {
        "hub_id": hub_id,
        "likes": likes,
        "downloads": downloads,
        "tags": tags,
        "created_at": created_at,
        "last_modified": last_modified,
        "license": license,
        "language": language,
    }


def parsed_column_info(dataset_info) -> Optional[List[Dict[str, Any]]]:
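    """Normalize `dataset_info` (a single dict or a list of configs) into a list of parsed configs."""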
    if isinstance(dataset_info, dict):
        return [parse_single_config_dataset(dataset_info)]
    elif isinstance(dataset_info, list):
        return parse_multiple_config_dataset(dataset_info)
    return None


def ensure_list_of_strings(value) -> List[str]:
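    """Coerce None, a scalar, or a list into a homogeneous list of strings."""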
    if value is None:
        return []
    if isinstance(value, list):
        return [str(item) for item in value]
    return [str(value)]


def refresh_data() -> List[Dict[str, Any]]:
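    """Build today's dataset snapshot, or load it from disk if it already exists.

    Returns one record per (dataset, config) pair.
    """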
    now = datetime.now()

    # Serve today's cached snapshot if one has already been written.
    if os.path.exists(f"datasets_{now.strftime('%Y-%m-%d')}.parquet"):
        df = pd.read_parquet(f"datasets_{now.strftime('%Y-%m-%d')}.parquet")
        return df.to_dict(orient="records")
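
    # List every dataset on the Hub with full metadata; this is a large, slow call.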
    datasets = list(api.list_datasets(limit=None, full=True))
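
    # Keep only datasets whose card data declares dataset_info.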
    datasets = [
        dataset for dataset in tqdm(datasets) if check_dataset_has_dataset_info(dataset)
    ]
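
    # Flatten each dataset into one record per config.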
    parsed_datasets = []
    for dataset in tqdm(datasets):
        try:
            datasetinfo = parse_dataset(dataset)
            column_info = parsed_column_info(dataset.card_data.dataset_info)
            parsed_datasets.extend({**datasetinfo, **info} for info in column_info)
        except Exception as e:
            logger.warning(f"Error processing dataset {dataset.id}: {e}")
            continue

    df = pd.DataFrame(parsed_datasets)
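
    # These columns may hold None, a scalar, or a list; coerce to lists of strings.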
    df["license"] = df["license"].apply(ensure_list_of_strings)
    df["tags"] = df["tags"].apply(ensure_list_of_strings)
    df["language"] = df["language"].apply(ensure_list_of_strings)
    df["features"] = df["features"].apply(lambda x: str(x) if x is not None else None)
    df = df.astype({"hub_id": "string", "config_name": "string"})
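
    # Persist the dated snapshot as both Parquet and JSON Lines.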
    df.to_parquet(f"datasets_{now.strftime('%Y-%m-%d')}.parquet")
    df.to_json(
        f"datasets_{now.strftime('%Y-%m-%d')}.json", orient="records", lines=True
    )

    return df.to_dict(orient="records")


if __name__ == "__main__":
    refresh_data()