marcenacp committed on
Commit
f82850d
1 Parent(s): edf454b

Deploy (see actual commits on https://github.com/mlcommons/croissant).

Browse files
Files changed (4) hide show
  1. core/files.py +5 -26
  2. events/resources.py +29 -0
  3. events/resources_test.py +19 -0
  4. views/files.py +88 -99
core/files.py CHANGED
@@ -11,8 +11,8 @@ from .names import find_unique_name
11
  from .state import FileObject
12
  from .state import FileSet
13
 
14
- FILE_OBJECT = "File object"
15
- FILE_SET = "File set"
16
  RESOURCE_TYPES = [FILE_OBJECT, FILE_SET]
17
 
18
 
@@ -131,32 +131,11 @@ def file_from_upload(
131
  )
132
 
133
 
134
- def file_from_form(
135
- file_type: FileType,
136
- type: str,
137
- name,
138
- description,
139
- sha256: str,
140
- contained_in: list[str],
141
- names: set[str],
142
- ) -> FileObject | FileSet:
143
  """Creates a file based on manually added fields."""
144
  if type == FILE_OBJECT:
145
- return FileObject(
146
- name=find_unique_name(names, name),
147
- description=description,
148
- content_url="",
149
- encoding_format=file_type.encoding_format,
150
- sha256=sha256,
151
- df=None,
152
- contained_in=contained_in,
153
- )
154
  elif type == FILE_SET:
155
- return FileSet(
156
- name=find_unique_name(names, name),
157
- description=description,
158
- encoding_format=file_type.encoding_format,
159
- contained_in=contained_in,
160
- )
161
  else:
162
  raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
 
11
  from .state import FileObject
12
  from .state import FileSet
13
 
14
+ FILE_OBJECT = "FileObject"
15
+ FILE_SET = "FileSet"
16
  RESOURCE_TYPES = [FILE_OBJECT, FILE_SET]
17
 
18
 
 
131
  )
132
 
133
 
134
+ def file_from_form(type: str, names: set[str]) -> FileObject | FileSet:
 
 
 
 
 
 
 
 
135
  """Creates a file based on manually added fields."""
136
  if type == FILE_OBJECT:
137
+ return FileObject(name=find_unique_name(names, "file_object"))
 
 
 
 
 
 
 
 
138
  elif type == FILE_SET:
139
+ return FileSet(name=find_unique_name(names, "file_set"))
 
 
 
 
 
140
  else:
141
  raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
events/resources.py CHANGED
@@ -1,7 +1,9 @@
 
1
  import enum
2
 
3
  import streamlit as st
4
 
 
5
  from core.state import FileObject
6
  from core.state import FileSet
7
  from core.state import Metadata
@@ -15,9 +17,12 @@ class ResourceEvent(enum.Enum):
15
  NAME = "NAME"
16
  DESCRIPTION = "DESCRIPTION"
17
  ENCODING_FORMAT = "ENCODING_FORMAT"
 
18
  SHA256 = "SHA256"
 
19
  CONTENT_SIZE = "CONTENT_SIZE"
20
  CONTENT_URL = "CONTENT_URL"
 
21
 
22
 
23
  def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
@@ -33,9 +38,33 @@ def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
33
  resource.description = value
34
  elif event == ResourceEvent.ENCODING_FORMAT:
35
  resource.encoding_format = value
 
 
36
  elif event == ResourceEvent.SHA256:
37
  resource.sha256 = value
 
 
38
  elif event == ResourceEvent.CONTENT_SIZE:
39
  resource.content_size = value
40
  elif event == ResourceEvent.CONTENT_URL:
41
  resource.content_url = value
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
  import enum
3
 
4
  import streamlit as st
5
 
6
+ from core.files import FILE_OBJECT
7
  from core.state import FileObject
8
  from core.state import FileSet
9
  from core.state import Metadata
 
17
  NAME = "NAME"
18
  DESCRIPTION = "DESCRIPTION"
19
  ENCODING_FORMAT = "ENCODING_FORMAT"
20
+ INCLUDES = "INCLUDES"
21
  SHA256 = "SHA256"
22
+ CONTAINED_IN = "CONTAINED_IN"
23
  CONTENT_SIZE = "CONTENT_SIZE"
24
  CONTENT_URL = "CONTENT_URL"
25
+ TYPE = "TYPE"
26
 
27
 
28
  def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
 
38
  resource.description = value
39
  elif event == ResourceEvent.ENCODING_FORMAT:
40
  resource.encoding_format = value
41
+ elif event == ResourceEvent.INCLUDES:
42
+ resource.includes = value
43
  elif event == ResourceEvent.SHA256:
44
  resource.sha256 = value
45
+ elif event == ResourceEvent.CONTAINED_IN:
46
+ resource.contained_in = value
47
  elif event == ResourceEvent.CONTENT_SIZE:
48
  resource.content_size = value
49
  elif event == ResourceEvent.CONTENT_URL:
50
  resource.content_url = value
51
+ elif event == ResourceEvent.TYPE:
52
+ metadata: Metadata = st.session_state[Metadata]
53
+ index = metadata.distribution.index(resource)
54
+ # Changing type by trying to retain as many attributes as possible.
55
+ if value == FILE_OBJECT:
56
+ file_object = _create_instance1_from_instance2(resource, FileObject)
57
+ metadata.distribution[index] = file_object
58
+ else:
59
+ file_set = _create_instance1_from_instance2(resource, FileSet)
60
+ metadata.distribution[index] = file_set
61
+
62
+
63
+ def _create_instance1_from_instance2(instance1: Resource, instance2: type):
64
+ """Creates instance2 by retaining as many common attributes as possible."""
65
+ attributes1 = set((field.name for field in dataclasses.fields(instance1)))
66
+ attributes2 = set((field.name for field in dataclasses.fields(instance2)))
67
+ common_attributes = attributes2.intersection(attributes1)
68
+ return instance2(**{
69
+ attribute: getattr(instance1, attribute) for attribute in common_attributes
70
+ })
events/resources_test.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from core.state import FileObject
2
+ from core.state import FileSet
3
+
4
+ from .resources import _create_instance1_from_instance2
5
+
6
+
7
+ def test_create_instance1_from_instance2():
8
+ file_object = FileObject(
9
+ name="name",
10
+ description="description",
11
+ contained_in=["foo", "bar"],
12
+ content_url="mlcommons.com",
13
+ )
14
+ file_set = _create_instance1_from_instance2(file_object, FileSet)
15
+ assert isinstance(file_set, FileSet)
16
+ assert file_set.name == "name"
17
+ assert file_set.description == "description"
18
+ assert file_set.contained_in == ["foo", "bar"]
19
+ assert file_set.encoding_format == None
views/files.py CHANGED
@@ -6,6 +6,7 @@ from core.files import file_from_form
6
  from core.files import file_from_upload
7
  from core.files import file_from_url
8
  from core.files import FILE_OBJECT
 
9
  from core.files import FILE_TYPES
10
  from core.files import RESOURCE_TYPES
11
  from core.record_sets import infer_record_sets
@@ -27,14 +28,18 @@ _MANUAL_DESCRIPTION_KEY = "manual_object_description"
27
  _MANUAL_SHA256_KEY = "manual_object_sha256"
28
  _MANUAL_PARENT_KEY = "manual_object_parents"
29
 
 
 
 
 
30
 
31
  def render_files():
32
  col1, col2, col3 = st.columns([1, 1, 1], gap="small")
33
  with col1:
34
- st.subheader("Upload more resources")
35
  _render_upload_panel()
36
  with col2:
37
- st.subheader("Uploaded resources")
38
  files = st.session_state[Metadata].distribution
39
  resource = _render_resources_panel(files)
40
  st.session_state[SelectedResource] = resource
@@ -86,30 +91,7 @@ def _render_upload_panel():
86
  st.text_input("URL:", key=_DISTANT_URL_KEY)
87
 
88
  with tab3:
89
- resource_type = st.selectbox(
90
- "Type", options=RESOURCE_TYPES, key=_MANUAL_RESOURCE_TYPE_KEY
91
- )
92
- st.text_input(
93
- needed_field("File name"),
94
- key=_MANUAL_NAME_KEY,
95
- )
96
- st.text_area(
97
- "File description",
98
- placeholder="Provide a clear description of the file.",
99
- key=_MANUAL_DESCRIPTION_KEY,
100
- )
101
- st.text_input(
102
- "SHA256",
103
- key=_MANUAL_SHA256_KEY,
104
- )
105
- parent_options = [
106
- file.name for file in st.session_state[Metadata].distribution
107
- ]
108
- st.multiselect(
109
- "Parent",
110
- options=parent_options,
111
- key=_MANUAL_PARENT_KEY,
112
- )
113
 
114
  def handle_on_click():
115
  url = st.session_state[_DISTANT_URL_KEY]
@@ -123,29 +105,7 @@ def _render_upload_panel():
123
  file = file_from_upload(file_type, uploaded_file, names)
124
  else:
125
  resource_type = st.session_state[_MANUAL_RESOURCE_TYPE_KEY]
126
- needs_sha256 = resource_type == FILE_OBJECT
127
-
128
- name = st.session_state[_MANUAL_NAME_KEY]
129
- description = st.session_state[_MANUAL_DESCRIPTION_KEY]
130
- sha256 = st.session_state[_MANUAL_SHA256_KEY] if needs_sha256 else None
131
- parents = st.session_state[_MANUAL_PARENT_KEY]
132
- errorMessage = (
133
- "Please import either a local file, provide a download URL or fill"
134
- " in all required fields: name"
135
- )
136
- if needs_sha256:
137
- errorMessage += " and SHA256"
138
-
139
- if not name or (needs_sha256 and not sha256):
140
- # Some required fields are empty.
141
- st.toast(
142
- errorMessage,
143
- icon="❌",
144
- )
145
- return
146
- file = file_from_form(
147
- file_type, resource_type, name, description, sha256, parents, names
148
- )
149
 
150
  st.session_state[Metadata].add_distribution(file)
151
  record_sets = infer_record_sets(file, names)
@@ -161,70 +121,57 @@ def _render_right_panel():
161
  of the selected resource."""
162
  if st.session_state.get(SelectedResource):
163
  _render_resource_details(st.session_state[SelectedResource])
 
 
164
 
165
 
166
  def _render_resource_details(selected_file: Resource):
167
  """Renders the details of the selected resource."""
168
  file: FileObject | FileSet
169
- for key, file in enumerate(st.session_state[Metadata].distribution):
170
  if file.name == selected_file.name:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
- if isinstance(file, FileObject):
173
- _render_file_object(key, file)
174
- else:
175
- _render_file_set(key, file)
176
 
177
  def delete_line():
178
- st.session_state[Metadata].remove_distribution(key)
179
 
180
- _, col = st.columns([5, 1])
181
- col.button("Remove", key=f"{key}_url", on_click=delete_line, type="primary")
182
 
 
 
 
 
 
183
 
184
- def _render_file_object(prefix: int, file: FileObject):
185
- key = f"{prefix}_name"
186
- st.text_input(
187
- needed_field("Name"),
188
- value=file.name,
189
- key=key,
190
- on_change=handle_resource_change,
191
- args=(ResourceEvent.NAME, file, key),
192
- )
193
- key = f"{prefix}_description"
194
- st.text_area(
195
- "Description",
196
- value=file.description,
197
- placeholder="Provide a clear description of the file.",
198
- key=key,
199
- on_change=handle_resource_change,
200
- args=(ResourceEvent.DESCRIPTION, file, key),
201
- )
202
- key = f"{prefix}_sha256"
203
- st.text_input(
204
- needed_field("SHA256"),
205
- value=file.sha256,
206
- disabled=True,
207
- key=key,
208
- on_change=handle_resource_change,
209
- args=(ResourceEvent.SHA256, file, key),
210
- )
211
- key = f"{prefix}_encoding"
212
- st.text_input(
213
- needed_field("Encoding format"),
214
- value=file.encoding_format,
215
- disabled=True,
216
  key=key,
217
  on_change=handle_resource_change,
218
- args=(ResourceEvent.ENCODING_FORMAT, file, key),
219
  )
220
- st.markdown("First rows of data:")
221
- if file.df is not None:
222
- st.dataframe(file.df, height=DF_HEIGHT)
223
- else:
224
- st.text("No rendering possible.")
225
-
226
-
227
- def _render_file_set(prefix: int, file: FileSet):
228
  key = f"{prefix}_name"
229
  st.text_input(
230
  needed_field("Name"),
@@ -242,12 +189,54 @@ def _render_file_set(prefix: int, file: FileSet):
242
  on_change=handle_resource_change,
243
  args=(ResourceEvent.DESCRIPTION, file, key),
244
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  key = f"{prefix}_encoding"
246
  st.text_input(
247
  needed_field("Encoding format"),
248
  value=file.encoding_format,
249
- disabled=True,
250
  key=key,
251
  on_change=handle_resource_change,
252
  args=(ResourceEvent.ENCODING_FORMAT, file, key),
253
  )
 
 
 
 
 
 
 
 
 
 
6
  from core.files import file_from_upload
7
  from core.files import file_from_url
8
  from core.files import FILE_OBJECT
9
+ from core.files import FILE_SET
10
  from core.files import FILE_TYPES
11
  from core.files import RESOURCE_TYPES
12
  from core.record_sets import infer_record_sets
 
28
  _MANUAL_SHA256_KEY = "manual_object_sha256"
29
  _MANUAL_PARENT_KEY = "manual_object_parents"
30
 
31
+ _INFO = """Resources can be `FileObjects` (single files) or `FileSets` (sets of files
32
+ with the same MIME type). On this page, you can upload `FileObjects`, point to external
33
+ resources on the web or manually create new resources."""
34
+
35
 
36
  def render_files():
37
  col1, col2, col3 = st.columns([1, 1, 1], gap="small")
38
  with col1:
39
+ st.markdown("##### Upload more resources")
40
  _render_upload_panel()
41
  with col2:
42
+ st.markdown("##### Uploaded resources")
43
  files = st.session_state[Metadata].distribution
44
  resource = _render_resources_panel(files)
45
  st.session_state[SelectedResource] = resource
 
91
  st.text_input("URL:", key=_DISTANT_URL_KEY)
92
 
93
  with tab3:
94
+ st.selectbox("Type", options=RESOURCE_TYPES, key=_MANUAL_RESOURCE_TYPE_KEY)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
  def handle_on_click():
97
  url = st.session_state[_DISTANT_URL_KEY]
 
105
  file = file_from_upload(file_type, uploaded_file, names)
106
  else:
107
  resource_type = st.session_state[_MANUAL_RESOURCE_TYPE_KEY]
108
+ file = file_from_form(resource_type, names)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  st.session_state[Metadata].add_distribution(file)
111
  record_sets = infer_record_sets(file, names)
 
121
  of the selected resource."""
122
  if st.session_state.get(SelectedResource):
123
  _render_resource_details(st.session_state[SelectedResource])
124
+ else:
125
+ st.info(_INFO, icon="💡")
126
 
127
 
128
  def _render_resource_details(selected_file: Resource):
129
  """Renders the details of the selected resource."""
130
  file: FileObject | FileSet
131
+ for i, file in enumerate(st.session_state[Metadata].distribution):
132
  if file.name == selected_file.name:
133
+ is_file_object = isinstance(file, FileObject)
134
+ index = (
135
+ RESOURCE_TYPES.index(FILE_OBJECT)
136
+ if is_file_object
137
+ else RESOURCE_TYPES.index(FILE_SET)
138
+ )
139
+ key = f"{i}-file-name"
140
+ st.selectbox(
141
+ "Type",
142
+ index=index,
143
+ options=RESOURCE_TYPES,
144
+ key=key,
145
+ on_change=handle_resource_change,
146
+ args=(ResourceEvent.TYPE, file, key),
147
+ )
148
 
149
+ _render_resource(i, file, is_file_object)
 
 
 
150
 
151
  def delete_line():
152
+ st.session_state[Metadata].remove_distribution(i)
153
 
154
+ def close():
155
+ st.session_state[SelectedResource] = None
156
 
157
+ col1, col2 = st.columns([1, 1])
158
+ col1.button("Close", key=f"{i}_close", on_click=close, type="primary")
159
+ col2.button(
160
+ "Remove", key=f"{i}_remove", on_click=delete_line, type="secondary"
161
+ )
162
 
163
+
164
+ def _render_resource(prefix: int, file: FileObject | FileSet, is_file_object: bool):
165
+ parent_options = [f.name for f in st.session_state[Metadata].distribution]
166
+ key = f"{prefix}_parents"
167
+ st.multiselect(
168
+ "Parents",
169
+ default=file.contained_in,
170
+ options=parent_options,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  key=key,
172
  on_change=handle_resource_change,
173
+ args=(ResourceEvent.CONTAINED_IN, file, key),
174
  )
 
 
 
 
 
 
 
 
175
  key = f"{prefix}_name"
176
  st.text_input(
177
  needed_field("Name"),
 
189
  on_change=handle_resource_change,
190
  args=(ResourceEvent.DESCRIPTION, file, key),
191
  )
192
+ if is_file_object:
193
+ key = f"{prefix}_content_url"
194
+ st.text_input(
195
+ needed_field("Content URL"),
196
+ value=file.content_url,
197
+ key=key,
198
+ on_change=handle_resource_change,
199
+ args=(ResourceEvent.CONTENT_URL, file, key),
200
+ )
201
+ key = f"{prefix}_sha256"
202
+ st.text_input(
203
+ needed_field("SHA256"),
204
+ value=file.sha256,
205
+ key=key,
206
+ on_change=handle_resource_change,
207
+ args=(ResourceEvent.SHA256, file, key),
208
+ )
209
+ key = f"{prefix}_content_size"
210
+ st.text_input(
211
+ "Content size",
212
+ value=file.content_size,
213
+ key=key,
214
+ on_change=handle_resource_change,
215
+ args=(ResourceEvent.CONTENT_SIZE, file, key),
216
+ )
217
+ else:
218
+ key = f"{prefix}_includes"
219
+ st.text_input(
220
+ needed_field("Glob pattern of files to include"),
221
+ value=file.includes,
222
+ key=key,
223
+ on_change=handle_resource_change,
224
+ args=(ResourceEvent.INCLUDES, file, key),
225
+ )
226
  key = f"{prefix}_encoding"
227
  st.text_input(
228
  needed_field("Encoding format"),
229
  value=file.encoding_format,
 
230
  key=key,
231
  on_change=handle_resource_change,
232
  args=(ResourceEvent.ENCODING_FORMAT, file, key),
233
  )
234
+ if is_file_object:
235
+ st.markdown("First rows of data:")
236
+ is_url = file.content_url and file.content_url.startswith("http")
237
+ if file.df is not None:
238
+ st.dataframe(file.df, height=DF_HEIGHT)
239
+ elif is_url:
240
+ st.button("Trigger download")
241
+ else:
242
+ st.markdown("No rendering possible.")