marcenacp committed on
Commit
bc133ae
1 Parent(s): 8c11dd4

Deploy (see actual commits on https://github.com/mlcommons/croissant).

Browse files
components/tabs/frontend/build/asset-manifest.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "files": {
3
- "main.js": "./static/js/main.e6b754d8.js",
4
  "index.html": "./index.html",
5
- "main.e6b754d8.js.map": "./static/js/main.e6b754d8.js.map"
6
  },
7
  "entrypoints": [
8
- "static/js/main.e6b754d8.js"
9
  ]
10
  }
 
1
  {
2
  "files": {
3
+ "main.js": "./static/js/main.a44b10fc.js",
4
  "index.html": "./index.html",
5
+ "main.a44b10fc.js.map": "./static/js/main.a44b10fc.js.map"
6
  },
7
  "entrypoints": [
8
+ "static/js/main.a44b10fc.js"
9
  ]
10
  }
components/tabs/frontend/build/index.html CHANGED
@@ -1 +1 @@
1
- <!doctype html><html lang="en"><head><title>Streamlit Tabs Component</title><meta charset="UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Streamlit Tree Component"/><script defer="defer" src="./static/js/main.e6b754d8.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
 
1
+ <!doctype html><html lang="en"><head><title>Streamlit Tabs Component</title><meta charset="UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Streamlit Tree Component"/><script defer="defer" src="./static/js/main.a44b10fc.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
components/tabs/frontend/build/static/js/main.a44b10fc.js ADDED
The diff for this file is too large to render. See raw diff
 
components/tabs/frontend/build/static/js/main.a44b10fc.js.LICENSE.txt ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ object-assign
3
+ (c) Sindre Sorhus
4
+ @license MIT
5
+ */
6
+
7
+ /**
8
+ * @license React
9
+ * react-dom.production.min.js
10
+ *
11
+ * Copyright (c) Facebook, Inc. and its affiliates.
12
+ *
13
+ * This source code is licensed under the MIT license found in the
14
+ * LICENSE file in the root directory of this source tree.
15
+ */
16
+
17
+ /**
18
+ * @license React
19
+ * react-is.production.min.js
20
+ *
21
+ * Copyright (c) Facebook, Inc. and its affiliates.
22
+ *
23
+ * This source code is licensed under the MIT license found in the
24
+ * LICENSE file in the root directory of this source tree.
25
+ */
26
+
27
+ /**
28
+ * @license React
29
+ * react-jsx-runtime.production.min.js
30
+ *
31
+ * Copyright (c) Facebook, Inc. and its affiliates.
32
+ *
33
+ * This source code is licensed under the MIT license found in the
34
+ * LICENSE file in the root directory of this source tree.
35
+ */
36
+
37
+ /**
38
+ * @license React
39
+ * react.production.min.js
40
+ *
41
+ * Copyright (c) Facebook, Inc. and its affiliates.
42
+ *
43
+ * This source code is licensed under the MIT license found in the
44
+ * LICENSE file in the root directory of this source tree.
45
+ */
46
+
47
+ /**
48
+ * @license React
49
+ * scheduler.production.min.js
50
+ *
51
+ * Copyright (c) Facebook, Inc. and its affiliates.
52
+ *
53
+ * This source code is licensed under the MIT license found in the
54
+ * LICENSE file in the root directory of this source tree.
55
+ */
56
+
57
+ /** @license React v16.13.1
58
+ * react-is.production.min.js
59
+ *
60
+ * Copyright (c) Facebook, Inc. and its affiliates.
61
+ *
62
+ * This source code is licensed under the MIT license found in the
63
+ * LICENSE file in the root directory of this source tree.
64
+ */
65
+
66
+ /** @license React v16.14.0
67
+ * react.production.min.js
68
+ *
69
+ * Copyright (c) Facebook, Inc. and its affiliates.
70
+ *
71
+ * This source code is licensed under the MIT license found in the
72
+ * LICENSE file in the root directory of this source tree.
73
+ */
components/tabs/frontend/build/static/js/main.a44b10fc.js.map ADDED
The diff for this file is too large to render. See raw diff
 
components/tabs/frontend/src/Tabs.tsx CHANGED
@@ -84,7 +84,7 @@ function BasicTabs({
84
  whiteSpace: "nowrap",
85
  }}
86
  >
87
- Download 🥐 file
88
  </Button>
89
  </span>
90
  </Tooltip>
 
84
  whiteSpace: "nowrap",
85
  }}
86
  >
87
+ Export
88
  </Button>
89
  </span>
90
  </Tooltip>
core/constants.py CHANGED
@@ -35,3 +35,8 @@ METADATA = "Metadata"
35
  RESOURCES = "Resources"
36
  RECORD_SETS = "Record Sets"
37
  TABS = [OVERVIEW, METADATA, RESOURCES, RECORD_SETS]
 
 
 
 
 
 
35
  RESOURCES = "Resources"
36
  RECORD_SETS = "Record Sets"
37
  TABS = [OVERVIEW, METADATA, RESOURCES, RECORD_SETS]
38
+
39
+ NAMES_INFO = (
40
+ "Names are used as identifiers. They are unique and cannot contain special"
41
+ " characters. The interface will replace any special characters."
42
+ )
core/files.py CHANGED
@@ -204,7 +204,7 @@ def file_from_form(
204
  if type == FILE_OBJECT:
205
  return FileObject(name=find_unique_name(names, "file_object"), folder=folder)
206
  elif type == FILE_SET:
207
- return FileSet(name=find_unique_name(names, "file_set"), folder=folder)
208
  else:
209
  raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
210
 
 
204
  if type == FILE_OBJECT:
205
  return FileObject(name=find_unique_name(names, "file_object"), folder=folder)
206
  elif type == FILE_SET:
207
+ return FileSet(name=find_unique_name(names, "file_set"))
208
  else:
209
  raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
210
 
core/names.py CHANGED
@@ -1,8 +1,13 @@
1
  """Module to handle naming of RecordSets and distribution."""
2
 
 
 
 
 
3
 
4
  def find_unique_name(names: set[str], name: str):
5
  """Find a unique UID."""
 
6
  while name in names:
7
  name = f"{name}_0"
8
  return name
 
1
  """Module to handle naming of RecordSets and distribution."""
2
 
3
+ import re
4
+
5
+ NAME_PATTERN_REGEX = "[^a-zA-Z0-9\\-_\\.]"
6
+
7
 
8
  def find_unique_name(names: set[str], name: str):
9
  """Find a unique UID."""
10
+ name = re.sub(NAME_PATTERN_REGEX, "_", name)
11
  while name in names:
12
  name = f"{name}_0"
13
  return name
core/names_test.py CHANGED
@@ -5,6 +5,7 @@ from .names import find_unique_name
5
 
6
  def test_find_unique_name():
7
  names = set(["first", "second", "first_0"])
 
8
  assert find_unique_name(names, "first") == "first_0_0"
9
  assert find_unique_name(names, "second") == "second_0"
10
  assert find_unique_name(names, "third") == "third"
 
5
 
6
  def test_find_unique_name():
7
  names = set(["first", "second", "first_0"])
8
+ assert find_unique_name(names, "are there spaces") == "are_there_spaces"
9
  assert find_unique_name(names, "first") == "first_0_0"
10
  assert find_unique_name(names, "second") == "second_0"
11
  assert find_unique_name(names, "third") == "third"
core/state.py CHANGED
@@ -183,11 +183,15 @@ class Metadata:
183
  name: str = ""
184
  description: str | None = None
185
  citation: str | None = None
 
 
186
  license: str | None = ""
 
187
  url: str = ""
188
  distribution: list[FileObject | FileSet] = dataclasses.field(default_factory=list)
189
  record_sets: list[RecordSet] = dataclasses.field(default_factory=list)
190
  rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
 
191
 
192
  def __bool__(self):
193
  return self.name != "" and self.url != ""
 
183
  name: str = ""
184
  description: str | None = None
185
  citation: str | None = None
186
+ data_biases: str | None = None
187
+ data_collection: str | None = None
188
  license: str | None = ""
189
+ personal_sensitive_information: str | None = None
190
  url: str = ""
191
  distribution: list[FileObject | FileSet] = dataclasses.field(default_factory=list)
192
  record_sets: list[RecordSet] = dataclasses.field(default_factory=list)
193
  rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
194
+ version: str | None = None
195
 
196
  def __bool__(self):
197
  return self.name != "" and self.url != ""
events/metadata.py CHANGED
@@ -2,6 +2,7 @@ import enum
2
 
3
  import streamlit as st
4
 
 
5
  from core.state import Metadata
6
 
7
  # List from:
@@ -93,11 +94,15 @@ class MetadataEvent(enum.Enum):
93
  URL = "URL"
94
  LICENSE = "LICENSE"
95
  CITATION = "CITATION"
 
 
 
 
96
 
97
 
98
  def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
99
  if event == MetadataEvent.NAME:
100
- metadata.name = st.session_state[key]
101
  elif event == MetadataEvent.DESCRIPTION:
102
  metadata.description = st.session_state[key]
103
  elif event == MetadataEvent.LICENSE:
@@ -106,3 +111,11 @@ def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
106
  metadata.citation = st.session_state[key]
107
  elif event == MetadataEvent.URL:
108
  metadata.url = st.session_state[key]
 
 
 
 
 
 
 
 
 
2
 
3
  import streamlit as st
4
 
5
+ from core.names import find_unique_name
6
  from core.state import Metadata
7
 
8
  # List from:
 
94
  URL = "URL"
95
  LICENSE = "LICENSE"
96
  CITATION = "CITATION"
97
+ VERSION = "VERSION"
98
+ DATA_BIASES = "DATA_BIASES"
99
+ DATA_COLLECTION = "DATA_COLLECTION"
100
+ PERSONAL_SENSITIVE_INFORMATION = "PERSONAL_SENSITIVE_INFORMATION"
101
 
102
 
103
  def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
104
  if event == MetadataEvent.NAME:
105
+ metadata.name = find_unique_name(set(), st.session_state[key])
106
  elif event == MetadataEvent.DESCRIPTION:
107
  metadata.description = st.session_state[key]
108
  elif event == MetadataEvent.LICENSE:
 
111
  metadata.citation = st.session_state[key]
112
  elif event == MetadataEvent.URL:
113
  metadata.url = st.session_state[key]
114
+ elif event == MetadataEvent.VERSION:
115
+ metadata.version = st.session_state[key]
116
+ elif event == MetadataEvent.DATA_BIASES:
117
+ metadata.data_biases = st.session_state[key]
118
+ elif event == MetadataEvent.DATA_COLLECTION:
119
+ metadata.data_collection = st.session_state[key]
120
+ elif event == MetadataEvent.PERSONAL_SENSITIVE_INFORMATION:
121
+ metadata.personal_sensitive_information = st.session_state[key]
views/files.py CHANGED
@@ -3,6 +3,7 @@ import streamlit as st
3
  from components.safe_button import button_with_confirmation
4
  from components.tree import render_tree
5
  from core.constants import DF_HEIGHT
 
6
  from core.constants import OAUTH_CLIENT_ID
7
  from core.files import code_to_index
8
  from core.files import file_from_form
@@ -39,16 +40,15 @@ resources on the web or manually create new resources."""
39
  def render_files():
40
  """Renders the views of the files: warnings and panels to display information."""
41
  _render_warnings()
42
- col1, col2, col3 = st.columns([1, 1, 1], gap="small")
43
  with col1:
44
- st.markdown("##### Upload more resources")
45
  _render_upload_panel()
46
- with col2:
47
  st.markdown("##### Uploaded resources")
48
  files = st.session_state[Metadata].distribution
49
  resource = _render_resources_panel(files)
50
  st.session_state[SelectedResource] = resource
51
- with col3:
52
  _render_right_panel()
53
 
54
 
@@ -111,9 +111,7 @@ def _render_resources_panel(files: list[Resource]) -> Resource | None:
111
  def _render_upload_panel():
112
  """Renders the form to upload from local or upload from URL."""
113
  with st.form(key="upload_form", clear_on_submit=True):
114
- tab1, tab2, tab3 = st.tabs([
115
- "Import from a local file", "Import from a URL", "Add manually"
116
- ])
117
 
118
  with tab1:
119
  st.file_uploader("Select a file", key=_LOCAL_FILE_KEY)
@@ -202,6 +200,11 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
202
  default=file.contained_in,
203
  options=parent_options,
204
  key=key,
 
 
 
 
 
205
  on_change=handle_resource_change,
206
  args=(ResourceEvent.CONTAINED_IN, file, key),
207
  )
@@ -210,6 +213,7 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
210
  needed_field("Name"),
211
  value=file.name,
212
  key=key,
 
213
  on_change=handle_resource_change,
214
  args=(ResourceEvent.NAME, file, key),
215
  )
@@ -217,7 +221,7 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
217
  st.text_area(
218
  "Description",
219
  value=file.description,
220
- placeholder="Provide a clear description of the file.",
221
  key=key,
222
  on_change=handle_resource_change,
223
  args=(ResourceEvent.DESCRIPTION, file, key),
@@ -225,9 +229,10 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
225
  if is_file_object:
226
  key = f"{prefix}_content_url"
227
  st.text_input(
228
- needed_field("Content URL"),
229
  value=file.content_url,
230
  key=key,
 
231
  on_change=handle_resource_change,
232
  args=(ResourceEvent.CONTENT_URL, file, key),
233
  )
@@ -244,6 +249,7 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
244
  "Content size",
245
  value=file.content_size,
246
  key=key,
 
247
  on_change=handle_resource_change,
248
  args=(ResourceEvent.CONTENT_SIZE, file, key),
249
  )
@@ -262,6 +268,10 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
262
  index=code_to_index(file.encoding_format),
263
  options=FILE_TYPES.keys(),
264
  key=key,
 
 
 
 
265
  on_change=handle_resource_change,
266
  args=(ResourceEvent.ENCODING_FORMAT, file, key),
267
  )
 
3
  from components.safe_button import button_with_confirmation
4
  from components.tree import render_tree
5
  from core.constants import DF_HEIGHT
6
+ from core.constants import NAMES_INFO
7
  from core.constants import OAUTH_CLIENT_ID
8
  from core.files import code_to_index
9
  from core.files import file_from_form
 
40
  def render_files():
41
  """Renders the views of the files: warnings and panels to display information."""
42
  _render_warnings()
43
+ col1, col2 = st.columns([1, 1], gap="small")
44
  with col1:
45
+ st.markdown("##### Add a resource")
46
  _render_upload_panel()
 
47
  st.markdown("##### Uploaded resources")
48
  files = st.session_state[Metadata].distribution
49
  resource = _render_resources_panel(files)
50
  st.session_state[SelectedResource] = resource
51
+ with col2:
52
  _render_right_panel()
53
 
54
 
 
111
  def _render_upload_panel():
112
  """Renders the form to upload from local or upload from URL."""
113
  with st.form(key="upload_form", clear_on_submit=True):
114
+ tab1, tab2, tab3 = st.tabs(["From a local file", "From a URL", "Add manually"])
 
 
115
 
116
  with tab1:
117
  st.file_uploader("Select a file", key=_LOCAL_FILE_KEY)
 
200
  default=file.contained_in,
201
  options=parent_options,
202
  key=key,
203
+ help=(
204
+ "FileObjects and FileSets can be nested. Specifying `Parents` allows to"
205
+ " nest a FileObject/FileSet within another FileObject/FileSet. An example"
206
+ " of this is when images (FileSet) are nested within an archive (FileSet)."
207
+ ),
208
  on_change=handle_resource_change,
209
  args=(ResourceEvent.CONTAINED_IN, file, key),
210
  )
 
213
  needed_field("Name"),
214
  value=file.name,
215
  key=key,
216
+ help=f"The name of the resource. {NAMES_INFO}",
217
  on_change=handle_resource_change,
218
  args=(ResourceEvent.NAME, file, key),
219
  )
 
221
  st.text_area(
222
  "Description",
223
  value=file.description,
224
+ placeholder="Provide a description of the file.",
225
  key=key,
226
  on_change=handle_resource_change,
227
  args=(ResourceEvent.DESCRIPTION, file, key),
 
229
  if is_file_object:
230
  key = f"{prefix}_content_url"
231
  st.text_input(
232
+ needed_field("Content URL or local path"),
233
  value=file.content_url,
234
  key=key,
235
+ help="The URL or local file path pointing to the original FileObject.",
236
  on_change=handle_resource_change,
237
  args=(ResourceEvent.CONTENT_URL, file, key),
238
  )
 
249
  "Content size",
250
  value=file.content_size,
251
  key=key,
252
+ help="The size of the original FileObject in bytes.",
253
  on_change=handle_resource_change,
254
  args=(ResourceEvent.CONTENT_SIZE, file, key),
255
  )
 
268
  index=code_to_index(file.encoding_format),
269
  options=FILE_TYPES.keys(),
270
  key=key,
271
+ help=(
272
+ "MIME type corresponding to"
273
+ " ([sc:encodingFormat](https://schema.org/encodingFormat))."
274
+ ),
275
  on_change=handle_resource_change,
276
  args=(ResourceEvent.ENCODING_FORMAT, file, key),
277
  )
views/load.py CHANGED
@@ -30,5 +30,5 @@ def _on_file_upload(key):
30
  def render_load():
31
  key = "json-ld-file-upload"
32
  st.file_uploader(
33
- "Select a JSON-LD", type="json", key=key, on_change=_on_file_upload, args=(key,)
34
  )
 
30
  def render_load():
31
  key = "json-ld-file-upload"
32
  st.file_uploader(
33
+ "Drop a JSON-LD", type="json", key=key, on_change=_on_file_upload, args=(key,)
34
  )
views/metadata.py CHANGED
@@ -10,7 +10,57 @@ from events.metadata import MetadataEvent
10
 
11
  def render_metadata():
12
  """Renders the `Metadata` view."""
13
- metadata = st.session_state[Metadata]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  index = find_license_index(metadata.license)
15
  key = "metadata-url"
16
  st.text_input(
@@ -21,6 +71,19 @@ def render_metadata():
21
  on_change=handle_metadata_change,
22
  args=(MetadataEvent.URL, metadata, key),
23
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  key = "metadata-license"
25
  st.selectbox(
26
  label="License",
 
10
 
11
  def render_metadata():
12
  """Renders the `Metadata` view."""
13
+ metadata: Metadata = st.session_state[Metadata]
14
+ col1, col2 = st.columns([1, 1])
15
+ with col1.expander("**Generic metadata**", expanded=True):
16
+ _render_generic_metadata(metadata)
17
+ with col2.expander("**Responsible AI (RAI) metadata**", expanded=True):
18
+ _render_rai_metadata(metadata)
19
+
20
+
21
+ def _render_rai_metadata(metadata: Metadata):
22
+ """Renders RAI (Responsible AI) metadata."""
23
+ key = "metadata-data-collection"
24
+ st.text_area(
25
+ label=(
26
+ "**Data collection**. Key stages of the data collection process encourage"
27
+ " its creators to reflect on the process and improves understanding for"
28
+ " users."
29
+ ),
30
+ key=key,
31
+ value=metadata.data_collection,
32
+ on_change=handle_metadata_change,
33
+ args=(MetadataEvent.DATA_COLLECTION, metadata, key),
34
+ )
35
+ key = "metadata-data-biases"
36
+ st.text_area(
37
+ label=(
38
+ "**Data biases**. Involves understanding the potential risks associated"
39
+ " with data usage and to prevent unintended and potentially harmful"
40
+ " consequences that may arise from using models trained on or evaluated"
41
+ " with the respective data."
42
+ ),
43
+ key=key,
44
+ value=metadata.data_biases,
45
+ on_change=handle_metadata_change,
46
+ args=(MetadataEvent.DATA_BIASES, metadata, key),
47
+ )
48
+ key = "metadata-personal-sensitive-information"
49
+ st.text_area(
50
+ label=(
51
+ "**Personal sensitive information**. Personal and sensitive information, if"
52
+ " contained within the dataset, can play an important role in the"
53
+ " mitigation of any risks and the responsible use of the datasets."
54
+ ),
55
+ key=key,
56
+ value=metadata.personal_sensitive_information,
57
+ on_change=handle_metadata_change,
58
+ args=(MetadataEvent.PERSONAL_SENSITIVE_INFORMATION, metadata, key),
59
+ )
60
+
61
+
62
+ def _render_generic_metadata(metadata: Metadata):
63
+ """Renders all non-RAI generic metadata."""
64
  index = find_license_index(metadata.license)
65
  key = "metadata-url"
66
  st.text_input(
 
71
  on_change=handle_metadata_change,
72
  args=(MetadataEvent.URL, metadata, key),
73
  )
74
+ key = "metadata-version"
75
+ st.text_input(
76
+ label="Version (`MAJOR.MINOR.PATCH`)",
77
+ key=key,
78
+ help=(
79
+ "Refer to https://semver.org/spec/v2.0.0.html for more information on the"
80
+ " format."
81
+ ),
82
+ value=metadata.version,
83
+ placeholder="1.0.0",
84
+ on_change=handle_metadata_change,
85
+ args=(MetadataEvent.VERSION, metadata, key),
86
+ )
87
  key = "metadata-license"
88
  st.selectbox(
89
  label="License",
views/overview.py CHANGED
@@ -3,6 +3,7 @@ from typing import Any
3
 
4
  import streamlit as st
5
 
 
6
  from core.state import Metadata
7
  import mlcroissant as mlc
8
  from utils import needed_field
@@ -22,9 +23,9 @@ _INFO_TEXT = """Croissant files are composed of three layers:
22
  (typically a file or set of files) and the structure of these records,
23
  expressed as a set of fields (e.g., the columns of a table).
24
 
25
- The next three tabs will guide you through filling those layers. The errors if any will
26
- be displayed on this page. Once you are ready, you can download the dataset by clicking
27
- the export button in the upper right corner."""
28
 
29
 
30
  def _relevant_fields(class_or_instance: type):
@@ -51,6 +52,7 @@ def render_overview():
51
  label=needed_field("Name"),
52
  key=key,
53
  value=metadata.name,
 
54
  placeholder="Dataset",
55
  on_change=handle_metadata_change,
56
  args=(MetadataEvent.NAME, metadata, key),
@@ -62,7 +64,7 @@ def render_overview():
62
  label="Description",
63
  key=key,
64
  value=metadata.description,
65
- placeholder="Provide a clear description of the dataset.",
66
  on_change=handle_metadata_change,
67
  args=(MetadataEvent.DESCRIPTION, metadata, key),
68
  )
@@ -82,10 +84,17 @@ def render_overview():
82
  * 100
83
  / (3 * metadata_weight)
84
  )
85
- col_a.metric("Completion", f"{completion}%")
86
- col_b.metric("Number of metadata fields", fields)
87
- col_c.metric("Number of resources", len(metadata.distribution))
88
- col_d.metric("Number of RecordSets", len(metadata.record_sets))
 
 
 
 
 
 
 
89
  with col2:
90
  user_started_editing = metadata.record_sets or metadata.distribution
91
  if user_started_editing:
 
3
 
4
  import streamlit as st
5
 
6
+ from core.constants import NAMES_INFO
7
  from core.state import Metadata
8
  import mlcroissant as mlc
9
  from utils import needed_field
 
23
  (typically a file or set of files) and the structure of these records,
24
  expressed as a set of fields (e.g., the columns of a table).
25
 
26
+ The next three tabs will guide you through filling those layers. Any error will be
27
+ displayed on the overview. Once the dataset is finished, you can download the dataset by
28
+ clicking the export button in the upper right corner."""
29
 
30
 
31
  def _relevant_fields(class_or_instance: type):
 
52
  label=needed_field("Name"),
53
  key=key,
54
  value=metadata.name,
55
+ help=f"The name of the dataset. {NAMES_INFO}",
56
  placeholder="Dataset",
57
  on_change=handle_metadata_change,
58
  args=(MetadataEvent.NAME, metadata, key),
 
64
  label="Description",
65
  key=key,
66
  value=metadata.description,
67
+ placeholder="Provide a description of the dataset.",
68
  on_change=handle_metadata_change,
69
  args=(MetadataEvent.DESCRIPTION, metadata, key),
70
  )
 
84
  * 100
85
  / (3 * metadata_weight)
86
  )
87
+ col_a.metric(
88
+ "Completion",
89
+ f"{completion}%",
90
+ help=(
91
+ "Approximation of the total completion based on the number of fields"
92
+ " that are filled."
93
+ ),
94
+ )
95
+ col_b.metric("Metadata fields", fields)
96
+ col_c.metric("Resources", len(metadata.distribution))
97
+ col_d.metric("RecordSets", len(metadata.record_sets))
98
  with col2:
99
  user_started_editing = metadata.record_sets or metadata.distribution
100
  if user_started_editing:
views/previous_files.py CHANGED
@@ -50,4 +50,4 @@ def render_previous_files():
50
  except:
51
  pass
52
  if has_no_project:
53
- st.write("No past project to load. Create one on the left!")
 
50
  except:
51
  pass
52
  if has_no_project:
53
+ st.write("No recent project to load. Create one on the left!")
views/record_sets.py CHANGED
@@ -10,6 +10,7 @@ from rdflib import term
10
  import streamlit as st
11
 
12
  from components.safe_button import button_with_confirmation
 
13
  from core.data_types import MLC_DATA_TYPES
14
  from core.data_types import mlc_to_str_data_type
15
  from core.data_types import STR_DATA_TYPES
@@ -240,6 +241,7 @@ def _render_left_panel():
240
  needed_field("Name"),
241
  placeholder="Name without special character.",
242
  key=key,
 
243
  value=record_set.name,
244
  on_change=handle_record_set_change,
245
  args=(RecordSetEvent.NAME, record_set, key),
@@ -247,7 +249,7 @@ def _render_left_panel():
247
  key = f"{prefix}-description"
248
  col2.text_input(
249
  "Description",
250
- placeholder="Provide a clear description of the RecordSet.",
251
  key=key,
252
  value=record_set.description,
253
  on_change=handle_record_set_change,
@@ -257,6 +259,13 @@ def _render_left_panel():
257
  st.checkbox(
258
  "The RecordSet is an enumeration",
259
  key=key,
 
 
 
 
 
 
 
260
  value=record_set.is_enumeration,
261
  on_change=handle_record_set_change,
262
  args=(RecordSetEvent.IS_ENUMERATION, record_set, key),
@@ -265,6 +274,10 @@ def _render_left_panel():
265
  st.checkbox(
266
  "The RecordSet has in-line data",
267
  key=key,
 
 
 
 
268
  value=bool(record_set.data),
269
  on_change=handle_record_set_change,
270
  args=(RecordSetEvent.HAS_DATA, record_set, key),
@@ -324,8 +337,14 @@ def _render_left_panel():
324
  )
325
  data_editor_key = _data_editor_key(record_set_key, record_set)
326
  st.markdown(
327
- f"{needed_field('Fields')} (add/delete fields by directly editing the"
328
- " table)"
 
 
 
 
 
 
329
  )
330
  st.data_editor(
331
  fields,
@@ -437,6 +456,7 @@ def _render_right_panel():
437
  needed_field("Name"),
438
  placeholder="Name without special character.",
439
  key=key,
 
440
  value=field.name,
441
  on_change=handle_field_change,
442
  args=(FieldEvent.NAME, field, key),
@@ -444,38 +464,35 @@ def _render_right_panel():
444
  key = f"{prefix}-description"
445
  col2.text_input(
446
  "Description",
447
- placeholder="Provide a clear description of the RecordSet.",
448
  key=key,
449
  on_change=handle_field_change,
450
  value=field.description,
451
  args=(FieldEvent.DESCRIPTION, field, key),
452
  )
 
453
  if field.data_types:
454
  data_type = field.data_types[0]
455
  if isinstance(data_type, str):
456
  data_type = term.URIRef(data_type)
457
  if data_type in MLC_DATA_TYPES:
458
  data_type_index = MLC_DATA_TYPES.index(data_type)
459
- else:
460
- data_type_index = None
461
- else:
462
- data_type_index = None
463
  key = f"{prefix}-datatypes"
464
  col3.selectbox(
465
  needed_field("Data type"),
466
  index=data_type_index,
467
  options=STR_DATA_TYPES,
468
  key=key,
 
 
 
 
469
  on_change=handle_field_change,
470
  args=(FieldEvent.DATA_TYPE, field, key),
471
  )
472
  possible_sources = _get_possible_sources(metadata)
473
- render_source(
474
- record_set_key, record_set, field, field_key, possible_sources
475
- )
476
- render_references(
477
- record_set_key, record_set, field, field_key, possible_sources
478
- )
479
 
480
  st.divider()
481
 
 
10
  import streamlit as st
11
 
12
  from components.safe_button import button_with_confirmation
13
+ from core.constants import NAMES_INFO
14
  from core.data_types import MLC_DATA_TYPES
15
  from core.data_types import mlc_to_str_data_type
16
  from core.data_types import STR_DATA_TYPES
 
241
  needed_field("Name"),
242
  placeholder="Name without special character.",
243
  key=key,
244
+ help=f"The name of the RecordSet. {NAMES_INFO}",
245
  value=record_set.name,
246
  on_change=handle_record_set_change,
247
  args=(RecordSetEvent.NAME, record_set, key),
 
249
  key = f"{prefix}-description"
250
  col2.text_input(
251
  "Description",
252
+ placeholder="Provide a description of the RecordSet.",
253
  key=key,
254
  value=record_set.description,
255
  on_change=handle_record_set_change,
 
259
  st.checkbox(
260
  "The RecordSet is an enumeration",
261
  key=key,
262
+ help=(
263
+ "Enumerations indicate that the RecordSet takes its values from a"
264
+ " finite set. Similar to `ClassLabel` in"
265
+ " [TFDS](https://www.tensorflow.org/datasets/api_docs/python/tfds/features/ClassLabel)"
266
+ " or [Hugging"
267
+ " Face](https://huggingface.co/docs/datasets/v2.15.0/en/package_reference/main_classes#datasets.ClassLabel)."
268
+ ),
269
  value=record_set.is_enumeration,
270
  on_change=handle_record_set_change,
271
  args=(RecordSetEvent.IS_ENUMERATION, record_set, key),
 
274
  st.checkbox(
275
  "The RecordSet has in-line data",
276
  key=key,
277
+ help=(
278
+ "In-line data allows to embed data directly within the JSON-LD"
279
+ " without referencing another data source."
280
+ ),
281
  value=bool(record_set.data),
282
  on_change=handle_record_set_change,
283
  args=(RecordSetEvent.HAS_DATA, record_set, key),
 
337
  )
338
  data_editor_key = _data_editor_key(record_set_key, record_set)
339
  st.markdown(
340
+ needed_field("Fields"),
341
+ help=(
342
+ "Add/delete fields by directly editing the table. **Warning**: the"
343
+ " table contains information about the fields--not the data"
344
+ " directly. If you wish to embed data, tick the `The RecordSet is"
345
+ " an enumeration` box. To edit fields details, click the"
346
+ " button `Edit fields details` below."
347
+ ),
348
  )
349
  st.data_editor(
350
  fields,
 
456
  needed_field("Name"),
457
  placeholder="Name without special character.",
458
  key=key,
459
+ help=f"The name of the field. {NAMES_INFO}",
460
  value=field.name,
461
  on_change=handle_field_change,
462
  args=(FieldEvent.NAME, field, key),
 
464
  key = f"{prefix}-description"
465
  col2.text_input(
466
  "Description",
467
+ placeholder="Provide a description of the RecordSet.",
468
  key=key,
469
  on_change=handle_field_change,
470
  value=field.description,
471
  args=(FieldEvent.DESCRIPTION, field, key),
472
  )
473
+ data_type_index = None
474
  if field.data_types:
475
  data_type = field.data_types[0]
476
  if isinstance(data_type, str):
477
  data_type = term.URIRef(data_type)
478
  if data_type in MLC_DATA_TYPES:
479
  data_type_index = MLC_DATA_TYPES.index(data_type)
 
 
 
 
480
  key = f"{prefix}-datatypes"
481
  col3.selectbox(
482
  needed_field("Data type"),
483
  index=data_type_index,
484
  options=STR_DATA_TYPES,
485
  key=key,
486
+ help=(
487
+ "The type of the data. `Text` corresponds to"
488
+ " https://schema.org/Text, etc."
489
+ ),
490
  on_change=handle_field_change,
491
  args=(FieldEvent.DATA_TYPE, field, key),
492
  )
493
  possible_sources = _get_possible_sources(metadata)
494
+ render_source(record_set, field, possible_sources)
495
+ render_references(record_set, field, possible_sources)
 
 
 
 
496
 
497
  st.divider()
498
 
views/source.py CHANGED
@@ -12,6 +12,15 @@ from events.fields import TransformType
12
  import mlcroissant as mlc
13
  from utils import needed_field
14
 
 
 
 
 
 
 
 
 
 
15
 
16
  class SourceType:
17
  """The type of the source (distribution or field)."""
@@ -105,10 +114,8 @@ def _handle_remove_reference(field):
105
 
106
 
107
  def render_source(
108
- record_set_key: int,
109
  record_set: RecordSet,
110
  field: Field,
111
- field_key: int,
112
  possible_sources: list[str],
113
  ):
114
  """Renders the form for the source."""
@@ -123,10 +130,13 @@ def render_source(
123
  index = None
124
  key = f"{prefix}-source"
125
  col1.selectbox(
126
- needed_field("Source"),
127
  index=index,
128
  options=options,
129
  key=key,
 
 
 
130
  on_change=handle_field_change,
131
  args=(FieldEvent.SOURCE, field, key),
132
  )
@@ -135,6 +145,7 @@ def render_source(
135
  needed_field("Extract"),
136
  index=_get_extract_index(source),
137
  key=f"{prefix}-extract",
 
138
  options=EXTRACT_TYPES,
139
  on_change=handle_field_change,
140
  args=(FieldEvent.SOURCE_EXTRACT, field, key),
@@ -145,6 +156,7 @@ def render_source(
145
  needed_field("Column name"),
146
  value=source.extract.column,
147
  key=key,
 
148
  on_change=handle_field_change,
149
  args=(FieldEvent.SOURCE_EXTRACT_COLUMN, field, key),
150
  )
@@ -154,6 +166,7 @@ def render_source(
154
  needed_field("JSON path"),
155
  value=source.extract.json_path,
156
  key=key,
 
157
  on_change=handle_field_change,
158
  args=(FieldEvent.SOURCE_EXTRACT_JSON_PATH, field, key),
159
  )
@@ -170,18 +183,23 @@ def render_source(
170
  key=key,
171
  options=TRANSFORM_TYPES,
172
  on_change=handle_field_change,
 
173
  args=(FieldEvent.TRANSFORM, field, key),
174
  kwargs={"number": number},
175
  )
176
  if selected == TransformType.FORMAT:
177
  key = f"{prefix}-{number}-transform-format"
178
  col3.text_input(
179
- needed_field("Format"),
180
  value=transform.format,
181
  key=key,
182
  on_change=handle_field_change,
 
 
 
 
183
  args=(selected, field, key),
184
- kwargs={"number": number, "type": "format"},
185
  )
186
  elif selected == TransformType.JSON_PATH:
187
  key = f"{prefix}-{number}-jsonpath"
@@ -190,8 +208,9 @@ def render_source(
190
  value=transform.json_path,
191
  key=key,
192
  on_change=handle_field_change,
 
193
  args=(selected, field, key),
194
- kwargs={"number": number, "type": "format"},
195
  )
196
  elif selected == TransformType.REGEX:
197
  key = f"{prefix}-{number}-regex"
@@ -200,8 +219,14 @@ def render_source(
200
  value=transform.regex,
201
  key=key,
202
  on_change=handle_field_change,
 
 
 
 
 
 
203
  args=(selected, field, key),
204
- kwargs={"number": number, "type": "format"},
205
  )
206
  elif selected == TransformType.REPLACE:
207
  key = f"{prefix}-{number}-replace"
@@ -210,8 +235,13 @@ def render_source(
210
  value=transform.replace,
211
  key=key,
212
  on_change=handle_field_change,
 
 
 
 
 
213
  args=(selected, field, key),
214
- kwargs={"number": number, "type": "format"},
215
  )
216
  elif selected == TransformType.SEPARATOR:
217
  key = f"{prefix}-{number}-separator"
@@ -220,8 +250,9 @@ def render_source(
220
  value=transform.separator,
221
  key=key,
222
  on_change=handle_field_change,
 
223
  args=(selected, field, key),
224
- kwargs={"number": number, "type": "format"},
225
  )
226
 
227
  def _handle_remove_transform(field, number):
@@ -230,6 +261,7 @@ def render_source(
230
  col4.button(
231
  "✖️",
232
  key=f"{prefix}-{number}-remove-transform",
 
233
  on_click=_handle_remove_transform,
234
  args=(field, number),
235
  )
@@ -243,16 +275,15 @@ def render_source(
243
  col1.button(
244
  "Add transform on data",
245
  key=f"{prefix}-close-fields",
 
246
  on_click=_handle_add_transform,
247
  args=(field,),
248
  )
249
 
250
 
251
  def render_references(
252
- record_set_key: int,
253
  record_set: RecordSet,
254
  field: Field,
255
- field_key: int,
256
  possible_sources: list[str],
257
  ):
258
  """Renders the form for references."""
@@ -286,6 +317,7 @@ def render_references(
286
  index=_get_extract_index(references),
287
  key=key,
288
  options=EXTRACT_TYPES,
 
289
  on_change=handle_field_change,
290
  args=(FieldEvent.REFERENCE_EXTRACT, field, key),
291
  )
@@ -295,6 +327,7 @@ def render_references(
295
  needed_field("Column name"),
296
  value=references.extract.column,
297
  key=key,
 
298
  on_change=handle_field_change,
299
  args=(FieldEvent.REFERENCE_EXTRACT_COLUMN, field, key),
300
  )
@@ -304,12 +337,14 @@ def render_references(
304
  needed_field("JSON path"),
305
  value=references.extract.json_path,
306
  key=key,
 
307
  on_change=handle_field_change,
308
  args=(FieldEvent.REFERENCE_EXTRACT_JSON_PATH, field, key),
309
  )
310
  col4.button(
311
  "✖️",
312
  key=f"{key}-remove-reference",
 
313
  on_click=_handle_remove_reference,
314
  args=(field,),
315
  )
 
12
  import mlcroissant as mlc
13
  from utils import needed_field
14
 
15
+ _JSON_PATH_DOCUMENTATION = (
16
+ "The JSON path if the data source is a JSON (see"
17
+ " [documentation](https://www.ietf.org/archive/id/draft-goessner-dispatch-jsonpath-00.html))."
18
+ )
19
+ _EXTRACT_DOCUMENTATION = (
20
+ "The extraction method to get the value of the field (column in a CSV, etc)."
21
+ )
22
+ _COLUMN_NAME_DOCUMENTATION = "The name of the column if the data source is a CSV."
23
+
24
 
25
  class SourceType:
26
  """The type of the source (distribution or field)."""
 
114
 
115
 
116
  def render_source(
 
117
  record_set: RecordSet,
118
  field: Field,
 
119
  possible_sources: list[str],
120
  ):
121
  """Renders the form for the source."""
 
130
  index = None
131
  key = f"{prefix}-source"
132
  col1.selectbox(
133
+ needed_field("Data source"),
134
  index=index,
135
  options=options,
136
  key=key,
137
+ help=(
138
+ "Data sources can be other resources (FileObject, FileSet) or other fields."
139
+ ),
140
  on_change=handle_field_change,
141
  args=(FieldEvent.SOURCE, field, key),
142
  )
 
145
  needed_field("Extract"),
146
  index=_get_extract_index(source),
147
  key=f"{prefix}-extract",
148
+ help=_EXTRACT_DOCUMENTATION,
149
  options=EXTRACT_TYPES,
150
  on_change=handle_field_change,
151
  args=(FieldEvent.SOURCE_EXTRACT, field, key),
 
156
  needed_field("Column name"),
157
  value=source.extract.column,
158
  key=key,
159
+ help=_COLUMN_NAME_DOCUMENTATION,
160
  on_change=handle_field_change,
161
  args=(FieldEvent.SOURCE_EXTRACT_COLUMN, field, key),
162
  )
 
166
  needed_field("JSON path"),
167
  value=source.extract.json_path,
168
  key=key,
169
+ help=_JSON_PATH_DOCUMENTATION,
170
  on_change=handle_field_change,
171
  args=(FieldEvent.SOURCE_EXTRACT_JSON_PATH, field, key),
172
  )
 
183
  key=key,
184
  options=TRANSFORM_TYPES,
185
  on_change=handle_field_change,
186
+ help="One or more transformations to apply after extracting the field.",
187
  args=(FieldEvent.TRANSFORM, field, key),
188
  kwargs={"number": number},
189
  )
190
  if selected == TransformType.FORMAT:
191
  key = f"{prefix}-{number}-transform-format"
192
  col3.text_input(
193
+ needed_field("Format a date"),
194
  value=transform.format,
195
  key=key,
196
  on_change=handle_field_change,
197
+ help=(
198
+ "For dates, use [`Python format"
199
+ " codes`](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)."
200
+ ),
201
  args=(selected, field, key),
202
+ kwargs={"number": number},
203
  )
204
  elif selected == TransformType.JSON_PATH:
205
  key = f"{prefix}-{number}-jsonpath"
 
208
  value=transform.json_path,
209
  key=key,
210
  on_change=handle_field_change,
211
+ help=_JSON_PATH_DOCUMENTATION,
212
  args=(selected, field, key),
213
+ kwargs={"number": number},
214
  )
215
  elif selected == TransformType.REGEX:
216
  key = f"{prefix}-{number}-regex"
 
219
  value=transform.regex,
220
  key=key,
221
  on_change=handle_field_change,
222
+ help=(
223
+ "A regular expression following [`re` Python"
224
+ " convention](https://docs.python.org/3/library/re.html#regular-expression-syntax)"
225
+ " with one capturing group. The result of the operation will be"
226
+ " the last captured group."
227
+ ),
228
  args=(selected, field, key),
229
+ kwargs={"number": number},
230
  )
231
  elif selected == TransformType.REPLACE:
232
  key = f"{prefix}-{number}-replace"
 
235
  value=transform.replace,
236
  key=key,
237
  on_change=handle_field_change,
238
+ help=(
239
+ "A replace pattern separated by a `/`, i.e."
240
+ " `string_to_replace/string_to_substitute` in order to replace"
241
+ " `string_to_replace` by `string_to_substitute`."
242
+ ),
243
  args=(selected, field, key),
244
+ kwargs={"number": number},
245
  )
246
  elif selected == TransformType.SEPARATOR:
247
  key = f"{prefix}-{number}-separator"
 
250
  value=transform.separator,
251
  key=key,
252
  on_change=handle_field_change,
253
+ help="A separator to split strings on, e.g. `|` to split `a|b|c`.",
254
  args=(selected, field, key),
255
+ kwargs={"number": number},
256
  )
257
 
258
  def _handle_remove_transform(field, number):
 
261
  col4.button(
262
  "✖️",
263
  key=f"{prefix}-{number}-remove-transform",
264
+ help="Remove the transformation.",
265
  on_click=_handle_remove_transform,
266
  args=(field, number),
267
  )
 
275
  col1.button(
276
  "Add transform on data",
277
  key=f"{prefix}-close-fields",
278
+ help="Add a transformation.",
279
  on_click=_handle_add_transform,
280
  args=(field,),
281
  )
282
 
283
 
284
  def render_references(
 
285
  record_set: RecordSet,
286
  field: Field,
 
287
  possible_sources: list[str],
288
  ):
289
  """Renders the form for references."""
 
317
  index=_get_extract_index(references),
318
  key=key,
319
  options=EXTRACT_TYPES,
320
+ help=_EXTRACT_DOCUMENTATION,
321
  on_change=handle_field_change,
322
  args=(FieldEvent.REFERENCE_EXTRACT, field, key),
323
  )
 
327
  needed_field("Column name"),
328
  value=references.extract.column,
329
  key=key,
330
+ help=_COLUMN_NAME_DOCUMENTATION,
331
  on_change=handle_field_change,
332
  args=(FieldEvent.REFERENCE_EXTRACT_COLUMN, field, key),
333
  )
 
337
  needed_field("JSON path"),
338
  value=references.extract.json_path,
339
  key=key,
340
+ help=_JSON_PATH_DOCUMENTATION,
341
  on_change=handle_field_change,
342
  args=(FieldEvent.REFERENCE_EXTRACT_JSON_PATH, field, key),
343
  )
344
  col4.button(
345
  "✖️",
346
  key=f"{key}-remove-reference",
347
+ help="Remove the join.",
348
  on_click=_handle_remove_reference,
349
  args=(field,),
350
  )
views/splash.py CHANGED
@@ -13,6 +13,8 @@ import mlcroissant as mlc
13
  from views.load import render_load
14
  from views.previous_files import render_previous_files
15
 
 
 
16
  _DATASETS = {
17
  "Titanic": ["data/embarkation_ports.csv", "data/genders.csv"],
18
  "FLORES-200": [],
@@ -23,8 +25,23 @@ _DATASETS = {
23
  "Bigcode-The-Stack": [],
24
  }
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  def render_splash():
 
28
  if OAUTH_CLIENT_ID:
29
  st.info(
30
  "**Disclaimer**: Do not put sensitive information or datasets here. The"
@@ -34,9 +51,7 @@ def render_splash():
34
  )
35
  col1, col2 = st.columns([1, 1], gap="large")
36
  with col1:
37
- with st.expander("**Load an existing Croissant JSON-LD file**", expanded=True):
38
- render_load()
39
- with st.expander("**Create from scratch**", expanded=True):
40
 
41
  def create_new_croissant():
42
  st.session_state[Metadata] = Metadata()
@@ -47,7 +62,7 @@ def render_splash():
47
  on_click=create_new_croissant,
48
  type="primary",
49
  )
50
- with st.expander("**Try out an example!**", expanded=True):
51
 
52
  def create_example(dataset: str):
53
  base = f"https://raw.githubusercontent.com/mlcommons/croissant/main/datasets/{dataset.lower()}"
@@ -72,7 +87,7 @@ def render_splash():
72
  )
73
 
74
  dataset = st.selectbox(
75
- label="Dataset",
76
  options=_DATASETS.keys(),
77
  )
78
  st.button(
@@ -81,6 +96,28 @@ def render_splash():
81
  type="primary",
82
  args=(dataset,),
83
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  with col2:
85
- with st.expander("**Past projects**", expanded=True):
86
  render_previous_files()
 
13
  from views.load import render_load
14
  from views.previous_files import render_previous_files
15
 
16
+ _HUGGING_FACE_URL = "https://huggingface.co/datasets/"
17
+
18
  _DATASETS = {
19
  "Titanic": ["data/embarkation_ports.csv", "data/genders.csv"],
20
  "FLORES-200": [],
 
25
  "Bigcode-The-Stack": [],
26
  }
27
 
28
+ _INFO = """[Croissant](https://mlcommons.org/croissant) 🥐 is a high-level format for
29
+ machine learning datasets built
30
+ on [schema.org](https://schema.org/) and its Dataset vocabulary. A croissant
31
+ configuration file combines metadata, resource file descriptions, data structure, and
32
+ default ML semantics of a dataset. You can familiarize yourself with the editor by
33
+ exploring the provided examples.
34
+
35
+ The editor supports creating a new configuration from scratch, as well as uploading
36
+ an existing Croissant JSON-LD file. Finally, you can also select any of your
37
+ past projects from the list.
38
+
39
+ You can change the project you are currently editing at any time by clicking
40
+ the Menu button and then choosing one of the options on this page."""
41
+
42
 
43
  def render_splash():
44
+ st.info(_INFO, icon="💡")
45
  if OAUTH_CLIENT_ID:
46
  st.info(
47
  "**Disclaimer**: Do not put sensitive information or datasets here. The"
 
51
  )
52
  col1, col2 = st.columns([1, 1], gap="large")
53
  with col1:
54
+ with st.expander("**Create a new dataset**", expanded=True):
 
 
55
 
56
  def create_new_croissant():
57
  st.session_state[Metadata] = Metadata()
 
62
  on_click=create_new_croissant,
63
  type="primary",
64
  )
65
+ with st.expander("**Load an existing dataset**", expanded=True):
66
 
67
  def create_example(dataset: str):
68
  base = f"https://raw.githubusercontent.com/mlcommons/croissant/main/datasets/{dataset.lower()}"
 
87
  )
88
 
89
  dataset = st.selectbox(
90
+ label="Canonical dataset",
91
  options=_DATASETS.keys(),
92
  )
93
  st.button(
 
96
  type="primary",
97
  args=(dataset,),
98
  )
99
+ url = st.text_input(
100
+ label="Hugging Face dataset",
101
+ placeholder="Example: https://huggingface.co/datasets/mnist",
102
+ )
103
+ if url.startswith(_HUGGING_FACE_URL):
104
+ name = url.replace(_HUGGING_FACE_URL, "")
105
+ api_url = (
106
+ f"https://datasets-server.huggingface.co/croissant?dataset={name}"
107
+ )
108
+ json = requests.get(api_url, headers=None).json()
109
+ try:
110
+ metadata = mlc.Metadata.from_json(mlc.Issues(), json, None)
111
+ st.session_state[Metadata] = Metadata.from_canonical(metadata)
112
+ save_current_project()
113
+ except Exception:
114
+ st.error(f"Malformed JSON: {json}")
115
+ elif url:
116
+ st.error(
117
+ f"Unknown URL {url}. Hugging Face URLs should look like"
118
+ f" {_HUGGING_FACE_URL}somedataset."
119
+ )
120
+ render_load()
121
  with col2:
122
+ with st.expander("**Recent projects**", expanded=True):
123
  render_previous_files()