"""Gradio app that previews Parquet files stored in Hugging Face dataset repos.

The user searches for a dataset, picks a revision (``main`` or a pull request)
and a Parquet file; the first row group of that file is shown as a table.
"""

from typing import Optional

import gradio as gr
import pandas as pd
import pyarrow.parquet as pq
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from huggingface_hub import HfFileSystem

# Cell values are stringified and cut to this many characters before display.
_MAX_CELL_CHARS = 1000

css = """
.settings {
    background: transparent;
}
.settings button span {
    color: var(--body-text-color-subdued);
}
"""


def _hf_filesystem(oauth_token: Optional[gr.OAuthToken]) -> HfFileSystem:
    """Build an ``HfFileSystem`` authenticated as the logged-in user, if any.

    ``gr.OAuthToken`` is a wrapper object; the raw token string lives in its
    ``.token`` attribute, and that string is what must be handed to
    ``HfFileSystem``. When nobody is logged in, ``None`` is passed through.
    """
    token = oauth_token.token if oauth_token is not None else None
    return HfFileSystem(token=token)


with gr.Blocks(css=css) as demo:
    with gr.Row():
        with gr.Column(scale=10):
            gr.Markdown("# 👀 Parquet Viewer 📚")
            gr.Markdown("View the content of Parquet files inside a dataset repository or pull request.")
            dataset_search = HuggingfaceHubSearch(
                label="Hub Dataset ID",
                placeholder="Search for dataset id on Huggingface",
                search_type="dataset",
            )
            with gr.Row():
                # The first positional argument of gr.Dropdown is `choices`,
                # so it must be a list; a bare string would be split into
                # one choice per character.
                revision_dropdown = gr.Dropdown(["main"], label="Revision", allow_custom_value=True)
                parquet_file_dropdown = gr.Dropdown(label="Parquet file", allow_custom_value=True)
            gr.Markdown("Parquet content:")
            output_dataframe = gr.DataFrame()
        # `min_width` is a number of pixels (int), not a CSS length string.
        with gr.Column(scale=4, min_width=200):
            with gr.Accordion("Settings", open=False, elem_classes="settings"):
                gr.Markdown("Access private/gated repos")
                gr.LoginButton()

    @dataset_search.change(
        inputs=[dataset_search],
        outputs=[revision_dropdown, parquet_file_dropdown, output_dataframe],
    )
    def dataset_update(dataset, oauth_token: Optional[gr.OAuthToken] = None):
        """Refresh the revision dropdown when the selected dataset changes.

        Args:
            dataset: Dataset repo id typed or picked in the search box.
            oauth_token: Token of the logged-in user, or ``None``.

        Returns:
            A dict updating ``revision_dropdown`` with ``<dataset>@main`` plus
            one ``<dataset>@refs/pr/<n>`` entry per pull request, or with an
            empty choice list when the id is incomplete or the lookup fails.
        """
        # Ids without a namespace separator are treated as incomplete input.
        if not dataset or "/" not in dataset:
            return {revision_dropdown: gr.Dropdown(choices=[], value="", info="")}
        fs = _hf_filesystem(oauth_token)
        try:
            discussions = fs._api.get_repo_discussions(
                dataset, repo_type="dataset", discussion_type="pull_request"
            )
            prs = [f"{dataset}@refs/pr/{pr.num}" for pr in discussions]
            revision = f"{dataset}@main"
            info = f"{len(prs)} pull request{'s' if len(prs) > 1 else ''} available" if prs else None
            return {revision_dropdown: gr.Dropdown(choices=[revision] + prs, value=revision, info=info)}
        except Exception:
            # Best effort: any failure (missing repo, no access, network error)
            # is reported in the UI instead of crashing the handler.
            return {revision_dropdown: gr.Dropdown(choices=[], value="", info="no revisions available")}

    @revision_dropdown.change(
        inputs=[revision_dropdown],
        outputs=[parquet_file_dropdown, output_dataframe],
    )
    def revision_update(dataset_and_revision, oauth_token: Optional[gr.OAuthToken] = None):
        """List the Parquet files of the selected ``<dataset>@<revision>``.

        Args:
            dataset_and_revision: Value of the revision dropdown.
            oauth_token: Token of the logged-in user, or ``None``.

        Returns:
            A dict updating ``parquet_file_dropdown`` with the ``hf://`` paths
            found (first one selected), or with an empty choice list when no
            revision is selected or the listing fails.
        """
        # Nothing selected: clear the file list without querying the Hub.
        if not dataset_and_revision:
            return {parquet_file_dropdown: gr.Dropdown(choices=[], value="", info="")}
        fs = _hf_filesystem(oauth_token)
        try:
            parquet_files = [
                "hf://" + path
                for path in fs.glob(f"datasets/{dataset_and_revision}/**/*.parquet")
            ]
            parquet_file = parquet_files[0] if parquet_files else None
            count = len(parquet_files)
            info = f"{count} parquet file{'s' if count != 1 else ''} available"
            return {parquet_file_dropdown: gr.Dropdown(choices=parquet_files, value=parquet_file, info=info)}
        except Exception:
            # Best effort: an unreadable revision simply yields no files.
            return {parquet_file_dropdown: gr.Dropdown(choices=[], value="", info="")}

    @parquet_file_dropdown.change(inputs=[parquet_file_dropdown], outputs=[output_dataframe])
    def parquet_file_update(parquet_file, oauth_token: Optional[gr.OAuthToken] = None):
        """Show the first row group of the selected Parquet file.

        Args:
            parquet_file: ``hf://`` path chosen in the file dropdown (may be
                empty or ``None`` when no file is available).
            oauth_token: Token of the logged-in user, or ``None``.

        Returns:
            A dict updating ``output_dataframe`` with the rows of row group 0,
            every value stringified and truncated to ``_MAX_CELL_CHARS``
            characters; an empty table when no file is selected or reading
            fails.
        """
        if not parquet_file:
            return {output_dataframe: pd.DataFrame([])}
        fs = _hf_filesystem(oauth_token)
        try:
            rows = pq.ParquetFile(parquet_file, filesystem=fs).read_row_group(0).to_pylist()
            preview = [
                {column: str(value)[:_MAX_CELL_CHARS] for column, value in row.items()}
                for row in rows
            ]
            return {output_dataframe: pd.DataFrame(preview)}
        except Exception:
            # Best effort: unreadable files are shown as an empty table.
            return {output_dataframe: []}


demo.launch()