File size: 8,496 Bytes
ddf7523
 
 
 
27df543
 
 
 
 
 
 
 
 
 
acec199
 
 
 
27df543
 
ddf7523
 
a3f4230
27df543
a3f4230
27df543
 
 
 
 
a3f4230
27df543
 
 
 
a3f4230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27df543
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a5c0ef
27df543
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3f4230
27df543
 
 
 
ddf7523
27df543
ddf7523
27df543
 
 
 
 
a3f4230
27df543
 
 
 
 
a3f4230
27df543
 
 
 
 
 
 
a3f4230
27df543
a3f4230
 
 
 
27df543
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ddf7523
27df543
 
 
 
 
 
 
 
 
 
 
 
 
ddf7523
27df543
ddf7523
27df543
ddf7523
27df543
 
ddf7523
27df543
 
 
 
 
 
 
 
a3f4230
27df543
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
"""Lilac deployer streamlit UI.

This powers: https://huggingface.co/spaces/lilacai/lilac_deployer
"""

from typing import Literal, Optional, Union

import lilac as ll
import streamlit as st
from datasets import load_dataset_builder

# The app is a 3-page wizard ('dataset' -> 'space' -> 'success') driven by
# st.session_state.current_page; default to the first page on a fresh session.
if 'current_page' not in st.session_state:
  st.session_state.current_page = 'dataset'

# Support deep-linking a dataset via `?dataset=user/name` in the URL.
# NOTE(review): st.experimental_get_query_params is deprecated in newer
# Streamlit releases in favor of st.query_params — confirm the pinned version.
query_params = st.experimental_get_query_params()
if 'dataset' in query_params:
  st.session_state.hf_dataset_name = query_params['dataset'][0]


def _dataset_page():
  """Render step 1 of the wizard: pick and validate a HuggingFace dataset.

  Widget values are mirrored into st.session_state so they survive navigating
  to the space page and back. Validation loads the dataset builder (metadata
  only) on every rerun in which a dataset name is present.
  """
  is_valid_dataset = False

  st.header('Deploy Lilac for a HuggingFace dataset to a space', anchor=False)
  st.subheader(
    'Step 1: select a dataset',
    divider='violet',
    anchor=False,
    help='For a list of datasets see: https://huggingface.co/datasets',
  )
  hf_dataset_name = st.text_input(
    'dataset id',
    help='Either in the format `user/dataset` or `dataset`, for example: `Open-Orca/OpenOrca`',
    placeholder='dataset or user/dataset',
    value=st.session_state.get('hf_dataset_name', None),
  )
  with st.expander('advanced options'):
    hf_config_name = st.text_input(
      'config',
      help='Some datasets required this field.',
      placeholder='(optional)',
      value=st.session_state.get('hf_config_name', None),
    )
    hf_split = st.text_input(
      'split',
      help='Loads all splits by default.',
      placeholder='(optional)',
      value=st.session_state.get('hf_split', None),
    )
    # NOTE(review): this widget passes both key='sample_size' and a value read
    # from st.session_state['sample_size'] — Streamlit warns when a keyed
    # widget also gets an explicit default from session state; confirm intent.
    sample_size = st.number_input(
      'sample size',
      help='Number of rows to sample from the dataset, for each split.',
      placeholder='(optional)',
      min_value=1,
      step=1,
      key='sample_size',
      value=st.session_state.get('sample_size', None),
    )
    # The read token is used only for validation below; it is deliberately not
    # persisted to session state (and has no value= restore), so it must be
    # re-entered after navigating away.
    hf_read_token = st.text_input(
      'huggingface [read token](https://huggingface.co/settings/tokens)',
      type='password',
      help='The access token is used to authenticate you with HuggingFace to read the dataset. '
      'https://huggingface.co/docs/hub/security-tokens',
      placeholder='(optional if dataset is public)',
    )

  def _next():
    # on_click callback: persist the form and advance to the space page.
    # Runs before the next rerun, so the closure-captured widget values here
    # are the ones from the click's script run.
    st.session_state.current_page = 'space'
    st.session_state.hf_dataset_name = hf_dataset_name
    st.session_state.hf_config_name = hf_config_name
    st.session_state.hf_split = hf_split
    st.session_state.sample_size = sample_size

  def _next_button():
    # `is_valid_dataset` is read at call time (after validation below), not at
    # definition time, so the button's disabled state reflects the check.
    enabled = is_valid_dataset
    return st.button('Next', disabled=not enabled, type='primary', on_click=_next)

  ds_builder = None
  if hf_dataset_name:
    is_valid_dataset = False
    try:
      # Loads dataset metadata only; raises if the id/config is unknown or the
      # dataset is private and the token lacks access.
      ds_builder = load_dataset_builder(hf_dataset_name, name=hf_config_name, token=hf_read_token)
      is_valid_dataset = True
    except Exception as e:
      st.session_state.ds_error = e
      st.session_state.ds_loaded = False

  st.session_state.hf_dataset_name = hf_dataset_name

  _next_button()

  # Mirror the builder's metadata into session state for the sidebar display.
  if ds_builder:
    st.session_state.ds_loaded = True
    st.session_state.ds_error = None
    st.session_state.ds_dataset_name = hf_dataset_name
    st.session_state.ds_description = ds_builder.info.description
    st.session_state.ds_features = ds_builder.info.features
    st.session_state.ds_splits = ds_builder.info.splits
  else:
    st.session_state.ds_loaded = False


def _space_page():
  """Render step 2 of the wizard: collect space settings and deploy.

  Reads the dataset selection saved by the dataset page from
  st.session_state and, on deploy, calls ll.deploy_config to create (or
  update) the HuggingFace space. On success, navigates to the success page.
  """
  # Read-only snapshot of session state at render time, used for display.
  session = dict(st.session_state)

  def _back():
    # on_click callback: persist the space form so it is restored when the
    # user returns to this page, then navigate back.
    st.session_state.hf_space_name = hf_space_name
    st.session_state.hf_storage = hf_storage
    st.session_state.hf_access_token = hf_access_token
    st.session_state.current_page = 'dataset'

  # Restore previously-entered values (None on first visit).
  hf_space_name = st.session_state.get('hf_space_name', None)
  hf_storage = st.session_state.get('hf_storage', None)
  hf_access_token = st.session_state.get('hf_access_token', None)

  def _back_button():
    return st.button('⬅ Back', on_click=_back)

  _back_button()
  st.subheader(
    'Step 2: create huggingface space',
    divider='violet',
    anchor=False,
    help='See HuggingFace Spaces [documentation](https://huggingface.co/docs/hub/spaces-overview)',
  )
  # Echo the advanced options chosen on the previous page. Read them all from
  # the `session` snapshot (the original mixed `session` and st.session_state
  # for sibling values; the snapshot is taken before any widget below runs,
  # so the two sources are equivalent here).
  if session.get('hf_config_name', None):
    st.write(f'Config: {session["hf_config_name"]}')
  if session.get('hf_split', None):
    st.write(f'Split: {session["hf_split"]}')
  if session.get('sample_size', None):
    st.write(f'Sample size: {session["sample_size"]}')

  hf_space_name = st.text_input(
    'space id',
    help='This space will be created if it does not exist',
    placeholder='org/name',
    value=hf_space_name,
  )
  hf_access_token = st.text_input(
    'huggingface [write token](https://huggingface.co/settings/tokens)',
    type='password',
    help='The access token is used to authenticate you with HuggingFace to create the space. '
    'https://huggingface.co/docs/hub/security-tokens',
    value=hf_access_token,
  )
  # Single source of truth for the storage tiers: used both as the selectbox
  # options and to restore the previously-selected index. (The original
  # duplicated this list as a second inline literal.)
  storage_options = ['None', 'small', 'medium', 'large']
  hf_storage = st.selectbox(
    'persistent storage',
    storage_options,
    help='Persistent storage is required if you want data to persist past the lifetime of the '
    'space docker image. This is recommended when running computations like signals or embeddings,'
    'or if you want labels to persist. You will get charged for persistent storage. See '
    'https://huggingface.co/docs/hub/spaces-storage',
    index=storage_options.index(hf_storage if hf_storage else 'None'),
  )

  def _deploy_button():
    # Deploy requires both a space id and a write token.
    enabled = hf_access_token and hf_space_name
    return st.button('Deploy', disabled=not enabled, on_click=_deploy)

  def _deploy():
    # on_click callback: build the Lilac config and deploy it to the space.
    hf_dataset_name = st.session_state['hf_dataset_name']
    assert hf_space_name and hf_access_token and hf_dataset_name

    hf_config_name = st.session_state.get('hf_config_name', None)
    hf_split = st.session_state.get('hf_split', None)
    sample_size = st.session_state.get('sample_size', None)

    # Map the UI's 'None' sentinel string to a real None for the API.
    hf_space_storage: Optional[Union[Literal['small'], Literal['medium'], Literal['large']]]
    if hf_storage == 'None':
      hf_space_storage = None
    else:
      assert hf_storage == 'small' or hf_storage == 'medium' or hf_storage == 'large'
      hf_space_storage = hf_storage

    try:
      space_link = ll.deploy_config(
        hf_space=hf_space_name,
        create_space=True,
        hf_space_storage=hf_space_storage,
        config=ll.Config(
          datasets=[
            ll.DatasetConfig(
              namespace='local',
              # Dataset names cannot contain '/', so flatten `user/dataset`.
              name=hf_dataset_name.replace('/', '_'),
              source=ll.HuggingFaceSource(
                dataset_name=hf_dataset_name,
                config_name=hf_config_name,
                split=hf_split,
                sample_size=int(sample_size) if sample_size else None,
                token=hf_access_token,
              ),
            )
          ]
        ),
        hf_token=hf_access_token,
      )
      st.session_state.space_link = space_link
      st.session_state.current_page = 'success'
    except Exception as e:
      st.subheader('Deployment failed!', divider='red')
      st.error(e)

  _deploy_button()


def _success_page():
  """Render the final page: link to the newly deployed HuggingFace space."""
  link = st.session_state.space_link

  st.subheader('Success!', divider='green')
  st.subheader(f'[Visit your HuggingFace space ↗]({link})')
  st.write(
    f'Spaces are private by default. To make them public, visit the [Space settings]({link}/settings). '
  )


# Route to the current wizard page via a dispatch table.
_PAGES = {
  'dataset': _dataset_page,
  'space': _space_page,
  'success': _success_page,
}
_page = _PAGES.get(st.session_state.current_page)
if _page is not None:
  _page()

# Sidebar content: dataset info (or load error) for the selected dataset.
dataset_name = st.session_state.get('ds_dataset_name', None) or st.session_state.get(
  'hf_dataset_name', None
)
_ds_error = st.session_state.get('ds_error', None)
if st.session_state.get('ds_loaded', False):
  st.sidebar.header(
    f'[{dataset_name}](https://huggingface.co/datasets/{dataset_name})',
    divider='rainbow',
    anchor=False,
    help='Dataset information from HuggingFace datasets.',
  )
  st.sidebar.write(st.session_state.get('ds_description', None))
  st.sidebar.write('##### Features')
  st.sidebar.table(st.session_state.get('ds_features', {}))
  st.sidebar.write('##### Splits')
  st.sidebar.table(st.session_state.get('ds_splits', {}))
elif _ds_error:
  st.sidebar.subheader(f'Error loading `{dataset_name}`', divider='red', anchor=False)
  st.sidebar.error(_ds_error)
  st.sidebar.write(
    'If the dataset is private, make sure to enter a HuggingFace '
    'token that has access to the dataset.'
  )
else:
  st.sidebar.write('Choose a dataset to see more info..')