File size: 4,132 Bytes
1d123d6
c328472
 
1d123d6
 
c328472
1d123d6
 
c328472
 
 
1d123d6
a4421c2
c328472
1d123d6
 
c328472
1d123d6
 
c328472
1d123d6
 
c328472
 
 
 
1d123d6
c328472
 
 
 
 
1d123d6
c328472
1d123d6
 
 
a4421c2
1d123d6
 
 
 
a4421c2
c328472
1d123d6
 
 
c328472
 
 
 
 
1d123d6
a4421c2
 
e8dfa3e
 
 
 
 
 
a4421c2
 
1d123d6
 
a4421c2
c328472
1d123d6
a4421c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c328472
1d123d6
c328472
1d123d6
 
 
c328472
1d123d6
 
c328472
1d123d6
 
 
c328472
1d123d6
 
 
 
c328472
1d123d6
c328472
1d123d6
 
 
c328472
1d123d6
c328472
1d123d6
 
 
 
 
 
 
 
 
 
 
 
c328472
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import os
import re

import gradio as gr
from huggingface_hub import get_collection


def extract_collection_id(input_text):
    """Return the ``owner/slug`` collection ID contained in *input_text*.

    Two input shapes are accepted:

    * a collection URL, ``https://huggingface.co/collections/<owner>/<slug>``
    * a bare ``<owner>/<slug>`` string

    Surrounding whitespace is ignored, and for URLs a trailing slash or a
    ``?query`` / ``#fragment`` suffix is dropped so that a link copied from
    the browser still resolves to a clean ID.

    Args:
        input_text: Slug or URL string to parse.

    Returns:
        The ``owner/slug`` string, or ``None`` if the input does not have
        that shape.
    """
    candidate = input_text.strip()

    if url_match := re.match(
        r"https?://huggingface\.co/collections/([^?#]+)", candidate
    ):
        # Keep only the path after /collections/; the character class above
        # already stops at any query string or fragment.
        candidate = url_match[1].rstrip("/")

    # fullmatch instead of a prefix match: a prefix match would accept
    # strings such as "owner/name trailing junk" and pass them on verbatim.
    return candidate if re.fullmatch(r"[\w.-]+/[\w.-]+", candidate) else None


def load_collection():
    """Resolve the configured collection and list the datasets it contains.

    The collection is taken from the ``COLLECTION_SLUG_OR_URL`` environment
    variable, normalised with ``extract_collection_id`` and fetched with
    ``get_collection``.

    Returns:
        A ``(dataset_ids, collection_id)`` tuple: the ``item_id`` of every
        collection item whose ``item_type`` is ``"dataset"``, and the
        normalised collection ID.

    Raises:
        ValueError: If the variable is unset or empty, cannot be parsed, or
            the collection holds no dataset items.
    """
    raw_value = os.getenv("COLLECTION_SLUG_OR_URL")
    if not raw_value:
        raise ValueError("COLLECTION_SLUG_OR_URL environment variable is not set.")

    slug = extract_collection_id(raw_value)
    if not slug:
        raise ValueError(
            "Invalid collection ID or URL in COLLECTION_SLUG_OR_URL environment variable."
        )

    # Only dataset entries are of interest; other item types are skipped.
    found = []
    for entry in get_collection(slug).items:
        if entry.item_type == "dataset":
            found.append(entry.item_id)

    if not found:
        raise ValueError("No datasets found in this collection.")
    return found, slug


def display_dataset(dataset_ids, index):
    """Build the HTML component embedding the viewer for one dataset.

    Args:
        dataset_ids: Sequence of dataset repository IDs.
        index: Position in *dataset_ids* of the dataset to show.

    Returns:
        A ``gr.HTML`` component whose markup is an ``<iframe>`` pointing at
        the dataset's ``/embed/viewer`` page on huggingface.co.
    """
    dataset_id = dataset_ids[index]
    markup = f"""<iframe
    src="https://huggingface.co/datasets/{dataset_id}/embed/viewer"
    frameborder="0"
    width="100%"
    height="560px"
></iframe>"""
    return gr.HTML(markup)


def navigate_dataset(dataset_ids, index, direction):
    """Move the selection by *direction* steps, wrapping around at both ends.

    Args:
        dataset_ids: Non-empty sequence of dataset repository IDs.
        index: Currently selected position in *dataset_ids*.
        direction: Step to apply (``-1`` for previous, ``1`` for next). May
            arrive as a float because the UI feeds it from a ``gr.Number``.

    Returns:
        A ``(new_index, label)`` tuple, where ``new_index`` is an ``int`` and
        ``label`` is the "Dataset i of n: id" caption for that position.
    """
    total = len(dataset_ids)
    # Coerce to int before indexing: a float step would produce a float
    # index, which cannot subscript a list and would render as "2.0".
    new_index = int(index + direction) % total
    return (
        new_index,
        f"Dataset {new_index + 1} of {total}: {dataset_ids[new_index]}",
    )


def get_display_name(collection_id):
    """Return a human-friendly collection name without the trailing hex ID.

    Collection slugs have the form ``owner/title-<hex id>``. The hex suffix
    is removed so headings show just ``owner/title``. Both a 24-character
    suffix (the length used in Hub collection slugs such as
    ``TheBloke/recent-models-64f9a55bb3115b4f513ec026``) and the
    32-character suffix handled previously are recognised.

    Args:
        collection_id: Collection ID, expected as ``owner/slug``.

    Returns:
        ``owner/title`` with the hex suffix stripped, or *collection_id*
        unchanged when it is not exactly two ``/``-separated parts.
    """
    parts = collection_id.split("/")
    if len(parts) != 2:
        return collection_id  # Not in the expected format; leave untouched

    owner, name = parts
    # Longest alternative first so a 32-char suffix is removed as a whole.
    name = re.sub(r"-(?:[a-f0-9]{32}|[a-f0-9]{24})$", "", name)
    return f"{owner}/{name}"


# Build the Gradio UI at import time. Everything sits inside one try block so
# that a configuration or loading failure prints a hint instead of a traceback.
try:
    dataset_ids, collection_id = load_collection()
    display_name = get_display_name(collection_id)

    with gr.Blocks() as demo:
        # Page header and a link back to the collection itself.
        gr.Markdown(f"<h1>Dataset Viewer for Collection: {display_name}</h1>")
        gr.Markdown(
            f"[View full collection on Hugging Face](https://huggingface.co/collections/{collection_id})"
        )

        # Usage instructions shown to visitors of the space.
        gr.Markdown("""
        This app allows you to browse and view datasets from a specific Hugging Face collection. 
        Use the 'Previous' and 'Next' buttons to navigate through the datasets in the collection.

        **Note**: This space is currently set up to display datasets from a specific collection. 
        If you'd like to use it for a different collection:
        1. Duplicate this space
        2. In your duplicated space, set the `COLLECTION_SLUG_OR_URL` environment variable to your desired collection ID or URL
        3. Your new space will then display datasets from your chosen collection!
        """)

        # Zero-based position of the dataset currently shown.
        index_state = gr.State(value=0)

        with gr.Row():
            left_btn = gr.Button("Previous")
            right_btn = gr.Button("Next")

        # Caption for the current dataset and the slot that holds its viewer iframe.
        dataset_info = gr.Markdown(f"Dataset 1 of {len(dataset_ids)}: {dataset_ids[0]}")
        iframe_output = gr.HTML()

        # Both buttons share navigate_dataset; the step (-1 / +1) is supplied
        # through a hidden gr.Number component created inline as an input.
        left_btn.click(
            navigate_dataset,
            inputs=[gr.State(dataset_ids), index_state, gr.Number(-1, visible=False)],
            outputs=[index_state, dataset_info],
        )
        right_btn.click(
            navigate_dataset,
            inputs=[gr.State(dataset_ids), index_state, gr.Number(1, visible=False)],
            outputs=[index_state, dataset_info],
        )

        # Re-render the embedded viewer via the change event on the stored index.
        index_state.change(
            display_dataset,
            inputs=[gr.State(dataset_ids), index_state],
            outputs=[iframe_output],
        )

        # Initialize the display with the first dataset
        demo.load(
            fn=lambda: display_dataset(dataset_ids, 0),
            inputs=None,
            outputs=[iframe_output],
        )

    # Start the server only when executed as a script, not when imported.
    if __name__ == "__main__":
        demo.launch()

# Catches every Exception raised above (not only configuration errors) and
# prints its message followed by a hint about the environment variable.
# NOTE(review): if the failure happens before `with gr.Blocks() as demo`,
# the name `demo` is never bound — confirm nothing else relies on it.
except Exception as e:
    print(f"Error: {str(e)}")
    print(
        "Please set the COLLECTION_SLUG_OR_URL environment variable with a valid collection ID or URL."
    )