File size: 5,199 Bytes
e987d7b
b47dcdb
e027770
e987d7b
e027770
e987d7b
 
 
 
3531f81
 
 
 
 
 
 
e027770
 
3531f81
 
 
 
 
 
 
 
 
 
 
899ca8c
3531f81
899ca8c
3531f81
 
899ca8c
3531f81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
899ca8c
3531f81
 
 
 
 
 
 
899ca8c
3531f81
 
 
 
 
 
 
899ca8c
 
3531f81
 
 
 
 
 
 
899ca8c
3531f81
 
 
 
 
 
 
e027770
 
 
 
 
 
 
 
 
 
24b90ae
 
 
 
 
e027770
 
 
 
 
 
 
 
 
 
 
 
3531f81
 
 
 
 
 
 
 
 
 
e027770
 
 
3531f81
 
 
 
 
 
e027770
 
 
3531f81
 
 
 
 
 
 
3eab446
3531f81
 
 
e027770
811d4d9
e027770
 
 
 
 
 
 
3531f81
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Start by setting token and debug mode before starting schedulers
import os

from huggingface_hub import logging, login

# Authenticate against the Hub with the token read from the HF_TOKEN environment
# variable (`os.environ.get` yields None when the variable is unset).
# NOTE(review): `write_permission=True` presumably asks `login` to require a token
# with write access — confirm against the installed `huggingface_hub` version.
login(token=os.environ.get("HF_TOKEN"), write_permission=True)
# Switch the `huggingface_hub` logger to debug verbosity.
logging.set_verbosity_debug()

# Start apps
from pathlib import Path

import gradio as gr

from app_1M_image import get_demo as get_demo_1M_image
from app_image import get_demo as get_demo_image
from app_json import get_demo as get_demo_json
from app_parquet import get_demo as get_demo_parquet


def _get_demo_code(path: str) -> str:
    """Return the source of a demo module rewritten as a standalone Gradio script.

    The file at ``path`` is read as UTF-8, every occurrence of the
    ``def get_demo():`` header is replaced by ``with gr.Blocks() as demo:``, and a
    trailing ``demo.launch()`` call is appended.

    Args:
        path: Path of the Python source file to read.

    Returns:
        The transformed source code as a single string.

    Raises:
        OSError: If the file cannot be opened or read (e.g. it does not exist).
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 (the default encoding of Python source files) so
    # the result does not depend on the platform's locale encoding.
    source = Path(path).read_text(encoding="utf-8")
    standalone = source.replace("def get_demo():", "with gr.Blocks() as demo:")
    return standalone + "\n\ndemo.launch()"


# Introductory Markdown (with an inline HTML <h1> title) passed to `gr.Markdown` at
# the top of the page, before any tab is created.
DEMO_EXPLANATION = """
<h1 style='text-align: center; margin-bottom: 1rem'> How to persist data from a Space to a Dataset? </h1>

This demo shows how to leverage `gradio` and `huggingface_hub` to save data from a Space to a Dataset on the Hub.
When doing so, a few things must be taken care of: file formats, concurrent writes, name collision, number of commits,
number of files, and more. The tabs below show different ways of implementing a "save to dataset" feature. Depending on the
complexity and usage of your app, you might want to use one or the other.

This Space comes as a demo for this `huggingface_hub` [guide](https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#scheduled-uploads). Please check it out if you need more technical details.
"""

# Markdown shown at the top of the "JSON Dataset" tab, just before the demo built
# by `get_demo_json()`; its trailing "## Demo" heading introduces that demo.
JSON_DEMO_EXPLANATION = """
## Use case

- Save inputs and outputs
- Build an annotation platform

## Data

Json-able only: text and numeric but no binaries.

## Robustness

Works with concurrent users and replicas.

## Limitations

If you expect millions of lines, you must split the local JSON file into multiple files to avoid getting your file tracked as LFS (5MB) on the Hub.

## Demo
"""

# Markdown shown at the top of the "Image Dataset" tab, just before the demo built
# by `get_demo_image()`; its trailing "## Demo" heading introduces that demo.
IMAGE_DEMO_EXPLANATION = """
## Use case

Save images with metadata (caption, parameters, datetime, etc.).

## Robustness

Works with concurrent users and replicas.

## Limitations

  - only 10k images/folder are supported on the Hub. If you expect more usage, you must save data in subfolders.
  - only 1M images/repo supported on the Hub. If you expect more usage, you can zip your data before uploading. See the _1M images Dataset_ demo.

## Demo
"""

# Markdown shown at the top of the "1M images Dataset" tab, just before the demo
# built by `get_demo_1M_image()`; its trailing "## Demo" heading introduces that demo.
IMAGE_1M_DEMO_EXPLANATION = """
## Use case:

Save 1M images with metadata (caption, parameters, datetime, etc.).

## Robustness

Works with concurrent users and replicas.

## Limitations

Only 1 image per row. This is fine for most image datasets. However in some cases you might want to save multiple images per row
(e.g. generate 4 images and select the preferred one). In this case, you must encode how the dataset must be saved, as
a parquet file. Please have a look to the Parquet example for more details.

## Demo
"""

# Markdown shown at the top of the "Parquet Dataset (e.g. save user preferences)"
# tab, just before the demo built by `get_demo_parquet()`; its trailing "## Demo"
# heading introduces that demo.
PARQUET_DEMO_EXPLANATION = """
## Use case:

Save any arbitrary dataset, no matter its size or format. If well configured, your dataset will be directly loadable with the `datasets` library
and benefit from the dataset-preview on the Hub.

Each row can contain metadata (text, numbers, datetimes,...) as well as binary data (images, audio, video,...).
This is particularly useful for datasets with multiple binary files for each row:

- Generate multiple images and select preferred one.
- Take audio as input, generate a translated audio as output.

## Robustness

Works with concurrent users and replicas.

## Limitations

None. Implementation of the ParquetScheduler requires slightly more work but you get full control over the data that is
pushed to the Hub.

## Demo
"""

def _render_demo_tab(title: str, explanation: str, build_demo, result_markdown: str, source_path: str) -> None:
    """Render one demo tab inside the currently open ``gr.Blocks`` context.

    Components are created in a fixed order: the explanation, the live demo, the
    result/code heading, then an open accordion holding the demo's source code.

    Args:
        title: Label of the tab.
        explanation: Markdown displayed at the top of the tab.
        build_demo: Zero-argument callable invoked inside the tab; it is expected to
            create the demo's components in the current Gradio context.
        result_markdown: Markdown displayed after the demo (link to the resulting
            dataset and the "Code" heading).
        source_path: Path of the demo's source file, displayed through
            ``_get_demo_code``.
    """
    with gr.Tab(title):
        gr.Markdown(explanation)
        build_demo()
        gr.Markdown(result_markdown)
        with gr.Accordion("Source code", open=True):
            gr.Code(_get_demo_code(source_path), language="python")


# One entry per tab, in display order:
# (tab title, explanation markdown, demo builder, result markdown, source file).
_DEMO_TABS = (
    (
        "JSON Dataset",
        JSON_DEMO_EXPLANATION,
        get_demo_json,
        "## Result\n\nhttps://huggingface.co/datasets/Wauplin/example-space-to-dataset-json\n\n## Code",
        "app_json.py",
    ),
    (
        "Image Dataset",
        IMAGE_DEMO_EXPLANATION,
        get_demo_image,
        "## Result\n\nhttps://huggingface.co/datasets/Wauplin/example-space-to-dataset-image\n\n## Code",
        "app_image.py",
    ),
    (
        "1M images Dataset",
        IMAGE_1M_DEMO_EXPLANATION,
        get_demo_1M_image,
        "## Result\n\nhttps://huggingface.co/datasets/Wauplin/example-space-to-dataset-image-zip\n\n## Code",
        "app_1M_image.py",
    ),
    (
        "Parquet Dataset (e.g. save user preferences)",
        PARQUET_DEMO_EXPLANATION,
        get_demo_parquet,
        "## Result\n\nhttps://huggingface.co/datasets/Wauplin/example-space-to-dataset-parquet\n\n## Code",
        "app_parquet.py",
    ),
)

# Assemble the page: the general introduction first, then one tab per demo.
with gr.Blocks() as demo:
    gr.Markdown(DEMO_EXPLANATION)
    for tab_spec in _DEMO_TABS:
        _render_demo_tab(*tab_spec)
demo.launch()