import asyncio import urllib from typing import Iterable import gradio as gr import markdown as md import pandas as pd from distilabel.cli.pipeline.utils import _build_pipeline_panel, get_pipeline from gradio_huggingfacehub_search import HuggingfaceHubSearch from gradio_leaderboard import ColumnFilter, Leaderboard, SearchColumns, SelectColumns from gradio_modal import Modal from huggingface_hub import HfApi, HfFileSystem, RepoCard from huggingface_hub.hf_api import DatasetInfo # Initialize the Hugging Face API api = HfApi() example = HuggingfaceHubSearch().example_value() fs = HfFileSystem() def _categorize_dtypes(df): dtype_mapping = { 'int64': 'number', 'float64': 'number', 'bool': 'bool', 'datetime64[ns]': 'date', 'datetime64[ns, UTC]': 'date', 'object': 'str' } categorized_dtypes = [] for column, dtype in df.dtypes.items(): dtype_str = str(dtype) if dtype_str in dtype_mapping: categorized_dtypes.append(dtype_mapping[dtype_str]) else: categorized_dtypes.append('markdown') return categorized_dtypes def _get_tag_category(entry: list[str], tag_category: str): for item in entry: if tag_category in item: return item.split(f"{tag_category}:")[-1] else: return None def _has_pipeline(repo_id): file_path = f"datasets/{repo_id}/pipeline.log" url = "https://huggingface.co/{file_path}" if fs.exists(file_path): pipeline = get_pipeline(url) return str(_build_pipeline_panel(pipeline)) else: return "" async def check_pipelines(repo_ids): tasks = [_has_pipeline(fs, repo_id) for repo_id in repo_ids] results = await asyncio.gather(*tasks) return dict(zip(repo_ids, results)) def _search_distilabel_repos(query: str = None,): filter = "library:distilabel" if query: filter = f"{filter}&search={urllib.urlencode(query)}" datasets: Iterable[DatasetInfo] = api.list_datasets(filter=filter) data = [ex.__dict__ for ex in datasets] df = pd.DataFrame.from_records(data) df["size_categories"] = df.tags.apply(_get_tag_category, args=["size_categories"]) # df["has_pipeline"] = asyncio.run(check_pipelines(df.id.tolist())) df["has_pipeline"] = "" subset_columns = ['id', 'likes', 'downloads', "size_categories", 'has_pipeline', 'last_modified', 'description'] new_column_order = subset_columns + [col for col in df.columns if col not in subset_columns] df = df[new_column_order] return df def _create_modal_info(row: dict) -> str: def _get_main_title(repo_id): return f'