marcenacp committed on
Commit
7b9203f
1 Parent(s): c439062

Deploy (see actual commits on https://github.com/mlcommons/croissant).

Browse files
components/safe_button.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ HAS_CONFIRMED = "SAFELY_UPDATE"
4
+
5
+
6
+ def handle_on_click(on_click):
7
+ """Handles on_click by waiting for the confirmation."""
8
+ if st.session_state.get(HAS_CONFIRMED):
9
+ return on_click
10
+ else:
11
+
12
+ def toggle_has_confirmed(*args, **kwargs):
13
+ del args, kwargs # unused.
14
+ st.session_state[HAS_CONFIRMED] = not st.session_state.get(HAS_CONFIRMED)
15
+
16
+ return toggle_has_confirmed
17
+
18
+
19
+ def button_with_confirmation(
20
+ label: str,
21
+ key: str = None,
22
+ on_click=None,
23
+ args=None,
24
+ kwargs=None,
25
+ ):
26
+ """Implements a safe button that asks for confirmation before executing on_click."""
27
+ st.button(
28
+ label,
29
+ on_click=handle_on_click(on_click),
30
+ args=args,
31
+ kwargs=kwargs,
32
+ key=key,
33
+ type="secondary",
34
+ )
35
+ if st.session_state.get(HAS_CONFIRMED):
36
+ st.error(f"Do you really want to {label.lower()}? Click again to confirm.")
components/tabs/frontend/build/asset-manifest.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "files": {
3
- "main.js": "./static/js/main.716a0ab4.js",
4
  "index.html": "./index.html",
5
- "main.716a0ab4.js.map": "./static/js/main.716a0ab4.js.map"
6
  },
7
  "entrypoints": [
8
- "static/js/main.716a0ab4.js"
9
  ]
10
  }
 
1
  {
2
  "files": {
3
+ "main.js": "./static/js/main.e6b754d8.js",
4
  "index.html": "./index.html",
5
+ "main.e6b754d8.js.map": "./static/js/main.e6b754d8.js.map"
6
  },
7
  "entrypoints": [
8
+ "static/js/main.e6b754d8.js"
9
  ]
10
  }
components/tabs/frontend/build/index.html CHANGED
@@ -1 +1 @@
1
- <!doctype html><html lang="en"><head><title>Streamlit Tabs Component</title><meta charset="UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Streamlit Tree Component"/><script defer="defer" src="./static/js/main.716a0ab4.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
 
1
+ <!doctype html><html lang="en"><head><title>Streamlit Tabs Component</title><meta charset="UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Streamlit Tree Component"/><script defer="defer" src="./static/js/main.e6b754d8.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
components/tabs/frontend/build/static/js/main.e6b754d8.js ADDED
The diff for this file is too large to render. See raw diff
 
components/tabs/frontend/build/static/js/main.e6b754d8.js.LICENSE.txt ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ object-assign
3
+ (c) Sindre Sorhus
4
+ @license MIT
5
+ */
6
+
7
+ /**
8
+ * @license React
9
+ * react-dom.production.min.js
10
+ *
11
+ * Copyright (c) Facebook, Inc. and its affiliates.
12
+ *
13
+ * This source code is licensed under the MIT license found in the
14
+ * LICENSE file in the root directory of this source tree.
15
+ */
16
+
17
+ /**
18
+ * @license React
19
+ * react-is.production.min.js
20
+ *
21
+ * Copyright (c) Facebook, Inc. and its affiliates.
22
+ *
23
+ * This source code is licensed under the MIT license found in the
24
+ * LICENSE file in the root directory of this source tree.
25
+ */
26
+
27
+ /**
28
+ * @license React
29
+ * react-jsx-runtime.production.min.js
30
+ *
31
+ * Copyright (c) Facebook, Inc. and its affiliates.
32
+ *
33
+ * This source code is licensed under the MIT license found in the
34
+ * LICENSE file in the root directory of this source tree.
35
+ */
36
+
37
+ /**
38
+ * @license React
39
+ * react.production.min.js
40
+ *
41
+ * Copyright (c) Facebook, Inc. and its affiliates.
42
+ *
43
+ * This source code is licensed under the MIT license found in the
44
+ * LICENSE file in the root directory of this source tree.
45
+ */
46
+
47
+ /**
48
+ * @license React
49
+ * scheduler.production.min.js
50
+ *
51
+ * Copyright (c) Facebook, Inc. and its affiliates.
52
+ *
53
+ * This source code is licensed under the MIT license found in the
54
+ * LICENSE file in the root directory of this source tree.
55
+ */
56
+
57
+ /** @license React v16.13.1
58
+ * react-is.production.min.js
59
+ *
60
+ * Copyright (c) Facebook, Inc. and its affiliates.
61
+ *
62
+ * This source code is licensed under the MIT license found in the
63
+ * LICENSE file in the root directory of this source tree.
64
+ */
65
+
66
+ /** @license React v16.14.0
67
+ * react.production.min.js
68
+ *
69
+ * Copyright (c) Facebook, Inc. and its affiliates.
70
+ *
71
+ * This source code is licensed under the MIT license found in the
72
+ * LICENSE file in the root directory of this source tree.
73
+ */
components/tabs/frontend/build/static/js/main.e6b754d8.js.map ADDED
The diff for this file is too large to render. See raw diff
 
components/tabs/frontend/src/Tabs.tsx CHANGED
@@ -10,6 +10,7 @@ import Tab from "@mui/material/Tab"
10
  import Box from "@mui/material/Box"
11
  import { ThemeProvider, createTheme } from "@mui/material"
12
  import { orange } from "@mui/material/colors"
 
13
 
14
  const theme = createTheme({
15
  palette: {
@@ -55,18 +56,38 @@ function BasicTabs({
55
  </Tabs>
56
  </Box>
57
  </Box>
58
- <Button
59
- disabled={!json}
60
- variant="outlined"
61
- href={
62
  json
63
- ? `data:text/json;charset=utf-8,${encodeURIComponent(json.content)}`
64
- : ""
65
  }
66
- download={json ? json.name : ""}
67
  >
68
- Export
69
- </Button>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  </div>
71
  )
72
  }
 
10
  import Box from "@mui/material/Box"
11
  import { ThemeProvider, createTheme } from "@mui/material"
12
  import { orange } from "@mui/material/colors"
13
+ import Tooltip from "@mui/material/Tooltip"
14
 
15
  const theme = createTheme({
16
  palette: {
 
56
  </Tabs>
57
  </Box>
58
  </Box>
59
+ <Tooltip
60
+ title={
 
 
61
  json
62
+ ? "Download the Croissant JSON-LD file."
63
+ : "Go to the overview to understand why the Croissant JSON-LD file cannot be generated."
64
  }
65
+ placement="left"
66
  >
67
+ <span>
68
+ <Button
69
+ disabled={!json}
70
+ disableElevation
71
+ variant="contained"
72
+ href={
73
+ json
74
+ ? `data:text/json;charset=utf-8,${encodeURIComponent(
75
+ json.content
76
+ )}`
77
+ : ""
78
+ }
79
+ download={json ? json.name : ""}
80
+ sx={{
81
+ color: "white",
82
+ padding: "6px 20px",
83
+ textAlign: "center",
84
+ whiteSpace: "nowrap",
85
+ }}
86
+ >
87
+ Download 🥐 file
88
+ </Button>
89
+ </span>
90
+ </Tooltip>
91
  </div>
92
  )
93
  }
core/constants.py CHANGED
@@ -35,3 +35,8 @@ METADATA = "Metadata"
35
  RESOURCES = "Resources"
36
  RECORD_SETS = "Record Sets"
37
  TABS = [OVERVIEW, METADATA, RESOURCES, RECORD_SETS]
 
 
 
 
 
 
35
  RESOURCES = "Resources"
36
  RECORD_SETS = "Record Sets"
37
  TABS = [OVERVIEW, METADATA, RESOURCES, RECORD_SETS]
38
+
39
+ NAMES_INFO = (
40
+ "Names are used as identifiers. They are unique and cannot contain special"
41
+ " characters. The interface will replace any special characters."
42
+ )
core/files.py CHANGED
@@ -142,7 +142,10 @@ def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.Data
142
  elif file_type == FileTypes.PARQUET:
143
  df = pd.read_parquet(file)
144
  else:
145
- raise NotImplementedError()
 
 
 
146
  return df.infer_objects()
147
 
148
 
 
142
  elif file_type == FileTypes.PARQUET:
143
  df = pd.read_parquet(file)
144
  else:
145
+ raise NotImplementedError(
146
+ f"File type {file_type} is not supported. Please, open an issue on GitHub:"
147
+ " https://github.com/mlcommons/croissant/issues/new"
148
+ )
149
  return df.infer_objects()
150
 
151
 
core/names.py CHANGED
@@ -1,8 +1,13 @@
1
  """Module to handle naming of RecordSets and distribution."""
2
 
 
 
 
 
3
 
4
  def find_unique_name(names: set[str], name: str):
5
  """Find a unique UID."""
 
6
  while name in names:
7
  name = f"{name}_0"
8
  return name
 
1
  """Module to handle naming of RecordSets and distribution."""
2
 
3
+ import re
4
+
5
+ NAME_PATTERN_REGEX = "[^a-zA-Z0-9\\-_\\.]"
6
+
7
 
8
  def find_unique_name(names: set[str], name: str):
9
  """Find a unique UID."""
10
+ name = re.sub(NAME_PATTERN_REGEX, "_", name)
11
  while name in names:
12
  name = f"{name}_0"
13
  return name
core/names_test.py CHANGED
@@ -5,6 +5,7 @@ from .names import find_unique_name
5
 
6
  def test_find_unique_name():
7
  names = set(["first", "second", "first_0"])
 
8
  assert find_unique_name(names, "first") == "first_0_0"
9
  assert find_unique_name(names, "second") == "second_0"
10
  assert find_unique_name(names, "third") == "third"
 
5
 
6
  def test_find_unique_name():
7
  names = set(["first", "second", "first_0"])
8
+ assert find_unique_name(names, "are there spaces") == "are_there_spaces"
9
  assert find_unique_name(names, "first") == "first_0_0"
10
  assert find_unique_name(names, "second") == "second_0"
11
  assert find_unique_name(names, "third") == "third"
events/metadata.py CHANGED
@@ -2,6 +2,7 @@ import enum
2
 
3
  import streamlit as st
4
 
 
5
  from core.state import Metadata
6
 
7
  # List from:
@@ -97,7 +98,7 @@ class MetadataEvent(enum.Enum):
97
 
98
  def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
99
  if event == MetadataEvent.NAME:
100
- metadata.name = st.session_state[key]
101
  elif event == MetadataEvent.DESCRIPTION:
102
  metadata.description = st.session_state[key]
103
  elif event == MetadataEvent.LICENSE:
 
2
 
3
  import streamlit as st
4
 
5
+ from core.names import find_unique_name
6
  from core.state import Metadata
7
 
8
  # List from:
 
98
 
99
  def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
100
  if event == MetadataEvent.NAME:
101
+ metadata.name = find_unique_name(set(), st.session_state[key])
102
  elif event == MetadataEvent.DESCRIPTION:
103
  metadata.description = st.session_state[key]
104
  elif event == MetadataEvent.LICENSE:
views/files.py CHANGED
@@ -1,8 +1,9 @@
1
- from etils import epath
2
  import streamlit as st
3
 
 
4
  from components.tree import render_tree
5
  from core.constants import DF_HEIGHT
 
6
  from core.constants import OAUTH_CLIENT_ID
7
  from core.files import code_to_index
8
  from core.files import file_from_form
@@ -188,9 +189,10 @@ def _render_resource_details(selected_file: Resource):
188
 
189
  col1, col2 = st.columns([1, 1])
190
  col1.button("Close", key=f"{i}_close", on_click=close, type="primary")
191
- col2.button(
192
- "⚠️ Remove", key=f"{i}_remove", on_click=delete_line, type="secondary"
193
- )
 
194
 
195
 
196
  def _render_resource(prefix: int, file: Resource, is_file_object: bool):
@@ -201,6 +203,11 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
201
  default=file.contained_in,
202
  options=parent_options,
203
  key=key,
 
 
 
 
 
204
  on_change=handle_resource_change,
205
  args=(ResourceEvent.CONTAINED_IN, file, key),
206
  )
@@ -209,6 +216,7 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
209
  needed_field("Name"),
210
  value=file.name,
211
  key=key,
 
212
  on_change=handle_resource_change,
213
  args=(ResourceEvent.NAME, file, key),
214
  )
@@ -224,9 +232,10 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
224
  if is_file_object:
225
  key = f"{prefix}_content_url"
226
  st.text_input(
227
- needed_field("Content URL"),
228
  value=file.content_url,
229
  key=key,
 
230
  on_change=handle_resource_change,
231
  args=(ResourceEvent.CONTENT_URL, file, key),
232
  )
@@ -243,6 +252,7 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
243
  "Content size",
244
  value=file.content_size,
245
  key=key,
 
246
  on_change=handle_resource_change,
247
  args=(ResourceEvent.CONTENT_SIZE, file, key),
248
  )
@@ -261,6 +271,10 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
261
  index=code_to_index(file.encoding_format),
262
  options=FILE_TYPES.keys(),
263
  key=key,
 
 
 
 
264
  on_change=handle_resource_change,
265
  args=(ResourceEvent.ENCODING_FORMAT, file, key),
266
  )
@@ -269,4 +283,9 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
269
  if file.df is not None:
270
  st.dataframe(file.df, height=DF_HEIGHT)
271
  else:
272
- st.button("Trigger download", on_click=trigger_download, args=(file,))
 
 
 
 
 
 
 
1
  import streamlit as st
2
 
3
+ from components.safe_button import button_with_confirmation
4
  from components.tree import render_tree
5
  from core.constants import DF_HEIGHT
6
+ from core.constants import NAMES_INFO
7
  from core.constants import OAUTH_CLIENT_ID
8
  from core.files import code_to_index
9
  from core.files import file_from_form
 
189
 
190
  col1, col2 = st.columns([1, 1])
191
  col1.button("Close", key=f"{i}_close", on_click=close, type="primary")
192
+ with col2:
193
+ button_with_confirmation(
194
+ "Remove", key=f"{i}_remove", on_click=delete_line
195
+ )
196
 
197
 
198
  def _render_resource(prefix: int, file: Resource, is_file_object: bool):
 
203
  default=file.contained_in,
204
  options=parent_options,
205
  key=key,
206
+ help=(
207
+ "FileObjects and FileSets can be nested. Specifying `Parents` allows to"
208
+ " nest a FileObject/FileSet within another FileObject/FileSet. An example"
209
+ " of this is when images (FileSet) are nested within an archive (FileSet)."
210
+ ),
211
  on_change=handle_resource_change,
212
  args=(ResourceEvent.CONTAINED_IN, file, key),
213
  )
 
216
  needed_field("Name"),
217
  value=file.name,
218
  key=key,
219
+ help=f"The name of the resource. {NAMES_INFO}",
220
  on_change=handle_resource_change,
221
  args=(ResourceEvent.NAME, file, key),
222
  )
 
232
  if is_file_object:
233
  key = f"{prefix}_content_url"
234
  st.text_input(
235
+ needed_field("Content URL or local path"),
236
  value=file.content_url,
237
  key=key,
238
+ help="The URL or local file path pointing to the original FileObject.",
239
  on_change=handle_resource_change,
240
  args=(ResourceEvent.CONTENT_URL, file, key),
241
  )
 
252
  "Content size",
253
  value=file.content_size,
254
  key=key,
255
+ help="The size of the original FileObject in bytes.",
256
  on_change=handle_resource_change,
257
  args=(ResourceEvent.CONTENT_SIZE, file, key),
258
  )
 
271
  index=code_to_index(file.encoding_format),
272
  options=FILE_TYPES.keys(),
273
  key=key,
274
+ help=(
275
+ "MIME type corresponding to"
276
+ " ([sc:encodingFormat](https://schema.org/encodingFormat))."
277
+ ),
278
  on_change=handle_resource_change,
279
  args=(ResourceEvent.ENCODING_FORMAT, file, key),
280
  )
 
283
  if file.df is not None:
284
  st.dataframe(file.df, height=DF_HEIGHT)
285
  else:
286
+ st.button(
287
+ "Trigger download",
288
+ disabled=bool(file.content_url),
289
+ on_click=trigger_download,
290
+ args=(file,),
291
+ )
views/overview.py CHANGED
@@ -3,6 +3,7 @@ from typing import Any
3
 
4
  import streamlit as st
5
 
 
6
  from core.state import Metadata
7
  import mlcroissant as mlc
8
  from utils import needed_field
@@ -51,6 +52,7 @@ def render_overview():
51
  label=needed_field("Name"),
52
  key=key,
53
  value=metadata.name,
 
54
  placeholder="Dataset",
55
  on_change=handle_metadata_change,
56
  args=(MetadataEvent.NAME, metadata, key),
@@ -82,8 +84,15 @@ def render_overview():
82
  * 100
83
  / (3 * metadata_weight)
84
  )
85
- col_a.metric("Completion", f"{completion}%")
86
- col_b.metric("Number of metadata", fields)
 
 
 
 
 
 
 
87
  col_c.metric("Number of resources", len(metadata.distribution))
88
  col_d.metric("Number of RecordSets", len(metadata.record_sets))
89
  with col2:
@@ -96,10 +105,6 @@ def render_overview():
96
  warning += "**Errors**\n"
97
  for error in issues.errors:
98
  warning += f"{error}\n"
99
- if issues.warnings:
100
- warning += "**Warnings**\n"
101
- for warning in issues.warnings:
102
- warning += f"{warning}\n"
103
  except mlc.ValidationError as exception:
104
  warning += "**Errors**\n"
105
  warning += f"{str(exception)}\n"
 
3
 
4
  import streamlit as st
5
 
6
+ from core.constants import NAMES_INFO
7
  from core.state import Metadata
8
  import mlcroissant as mlc
9
  from utils import needed_field
 
52
  label=needed_field("Name"),
53
  key=key,
54
  value=metadata.name,
55
+ help=f"The name of the dataset. {NAMES_INFO}",
56
  placeholder="Dataset",
57
  on_change=handle_metadata_change,
58
  args=(MetadataEvent.NAME, metadata, key),
 
84
  * 100
85
  / (3 * metadata_weight)
86
  )
87
+ col_a.metric(
88
+ "Completion",
89
+ f"{completion}%",
90
+ help=(
91
+ "Approximation of the total completion based on the number of fields"
92
+ " that are filled."
93
+ ),
94
+ )
95
+ col_b.metric("Number of metadata fields", fields)
96
  col_c.metric("Number of resources", len(metadata.distribution))
97
  col_d.metric("Number of RecordSets", len(metadata.record_sets))
98
  with col2:
 
105
  warning += "**Errors**\n"
106
  for error in issues.errors:
107
  warning += f"{error}\n"
 
 
 
 
108
  except mlc.ValidationError as exception:
109
  warning += "**Errors**\n"
110
  warning += f"{str(exception)}\n"
views/record_sets.py CHANGED
@@ -9,6 +9,8 @@ import pandas as pd
9
  from rdflib import term
10
  import streamlit as st
11
 
 
 
12
  from core.data_types import MLC_DATA_TYPES
13
  from core.data_types import mlc_to_str_data_type
14
  from core.data_types import STR_DATA_TYPES
@@ -239,6 +241,7 @@ def _render_left_panel():
239
  needed_field("Name"),
240
  placeholder="Name without special character.",
241
  key=key,
 
242
  value=record_set.name,
243
  on_change=handle_record_set_change,
244
  args=(RecordSetEvent.NAME, record_set, key),
@@ -254,16 +257,24 @@ def _render_left_panel():
254
  )
255
  key = f"{prefix}-is-enumeration"
256
  st.checkbox(
257
- "Whether the RecordSet is an enumeration",
258
  key=key,
 
 
 
 
259
  value=record_set.is_enumeration,
260
  on_change=handle_record_set_change,
261
  args=(RecordSetEvent.IS_ENUMERATION, record_set, key),
262
  )
263
  key = f"{prefix}-has-data"
264
  st.checkbox(
265
- "Whether the RecordSet has in-line data",
266
  key=key,
 
 
 
 
267
  value=bool(record_set.data),
268
  on_change=handle_record_set_change,
269
  args=(RecordSetEvent.HAS_DATA, record_set, key),
@@ -271,7 +282,7 @@ def _render_left_panel():
271
 
272
  joins = _find_joins(record_set.fields)
273
  has_join = st.checkbox(
274
- "Whether the RecordSet contains joins. To add a new join, add a field"
275
  " with a source in `RecordSet`/`FileSet`/`FileObject` and a reference"
276
  " to another `RecordSet`/`FileSet`/`FileObject`.",
277
  key=f"{prefix}-has-joins",
@@ -323,8 +334,14 @@ def _render_left_panel():
323
  )
324
  data_editor_key = _data_editor_key(record_set_key, record_set)
325
  st.markdown(
326
- f"{needed_field('Fields')} (add/delete fields by directly editing the"
327
- " table)"
 
 
 
 
 
 
328
  )
329
  st.data_editor(
330
  fields,
@@ -380,9 +397,8 @@ def _render_left_panel():
380
  args=(record_set_key, record_set),
381
  )
382
  key = f"{prefix}-delete-record-set"
383
- st.button(
384
- "⚠️ Delete RecordSet",
385
- type="primary",
386
  key=key,
387
  on_click=_handle_remove_record_set,
388
  args=(record_set_key,),
@@ -437,6 +453,7 @@ def _render_right_panel():
437
  needed_field("Name"),
438
  placeholder="Name without special character.",
439
  key=key,
 
440
  value=field.name,
441
  on_change=handle_field_change,
442
  args=(FieldEvent.NAME, field, key),
@@ -450,32 +467,29 @@ def _render_right_panel():
450
  value=field.description,
451
  args=(FieldEvent.DESCRIPTION, field, key),
452
  )
 
453
  if field.data_types:
454
  data_type = field.data_types[0]
455
  if isinstance(data_type, str):
456
  data_type = term.URIRef(data_type)
457
  if data_type in MLC_DATA_TYPES:
458
  data_type_index = MLC_DATA_TYPES.index(data_type)
459
- else:
460
- data_type_index = None
461
- else:
462
- data_type_index = None
463
  key = f"{prefix}-datatypes"
464
  col3.selectbox(
465
  needed_field("Data type"),
466
  index=data_type_index,
467
  options=STR_DATA_TYPES,
468
  key=key,
 
 
 
 
469
  on_change=handle_field_change,
470
  args=(FieldEvent.DATA_TYPE, field, key),
471
  )
472
  possible_sources = _get_possible_sources(metadata)
473
- render_source(
474
- record_set_key, record_set, field, field_key, possible_sources
475
- )
476
- render_references(
477
- record_set_key, record_set, field, field_key, possible_sources
478
- )
479
 
480
  st.divider()
481
 
 
9
  from rdflib import term
10
  import streamlit as st
11
 
12
+ from components.safe_button import button_with_confirmation
13
+ from core.constants import NAMES_INFO
14
  from core.data_types import MLC_DATA_TYPES
15
  from core.data_types import mlc_to_str_data_type
16
  from core.data_types import STR_DATA_TYPES
 
241
  needed_field("Name"),
242
  placeholder="Name without special character.",
243
  key=key,
244
+ help=f"The name of the RecordSet. {NAMES_INFO}",
245
  value=record_set.name,
246
  on_change=handle_record_set_change,
247
  args=(RecordSetEvent.NAME, record_set, key),
 
257
  )
258
  key = f"{prefix}-is-enumeration"
259
  st.checkbox(
260
+ "The RecordSet is an enumeration",
261
  key=key,
262
+ help=(
263
+ "Enumerations indicate that the RecordSet takes its values in a"
264
+ " finite set. Similar to `ClassLabel` in TFDS or Hugging Face."
265
+ ),
266
  value=record_set.is_enumeration,
267
  on_change=handle_record_set_change,
268
  args=(RecordSetEvent.IS_ENUMERATION, record_set, key),
269
  )
270
  key = f"{prefix}-has-data"
271
  st.checkbox(
272
+ "The RecordSet has in-line data",
273
  key=key,
274
+ help=(
275
+ "In-line data allows to embed data directly within the JSON-LD"
276
+ " without referencing another data source."
277
+ ),
278
  value=bool(record_set.data),
279
  on_change=handle_record_set_change,
280
  args=(RecordSetEvent.HAS_DATA, record_set, key),
 
282
 
283
  joins = _find_joins(record_set.fields)
284
  has_join = st.checkbox(
285
+ "The RecordSet contains joins. To add a new join, add a field"
286
  " with a source in `RecordSet`/`FileSet`/`FileObject` and a reference"
287
  " to another `RecordSet`/`FileSet`/`FileObject`.",
288
  key=f"{prefix}-has-joins",
 
334
  )
335
  data_editor_key = _data_editor_key(record_set_key, record_set)
336
  st.markdown(
337
+ needed_field("Fields"),
338
+ help=(
339
+ "Add/delete fields by directly editing the table. Warning: the"
340
+ " table contains information about the fields--not the data"
341
+ " directly. If you wish to embed data, select `The RecordSet is an"
342
+ " enumeration` above. To edit fields details, click the button"
343
+ " `Edit fields details` below."
344
+ ),
345
  )
346
  st.data_editor(
347
  fields,
 
397
  args=(record_set_key, record_set),
398
  )
399
  key = f"{prefix}-delete-record-set"
400
+ button_with_confirmation(
401
+ "Delete RecordSet",
 
402
  key=key,
403
  on_click=_handle_remove_record_set,
404
  args=(record_set_key,),
 
453
  needed_field("Name"),
454
  placeholder="Name without special character.",
455
  key=key,
456
+ help=f"The name of the field. {NAMES_INFO}",
457
  value=field.name,
458
  on_change=handle_field_change,
459
  args=(FieldEvent.NAME, field, key),
 
467
  value=field.description,
468
  args=(FieldEvent.DESCRIPTION, field, key),
469
  )
470
+ data_type_index = None
471
  if field.data_types:
472
  data_type = field.data_types[0]
473
  if isinstance(data_type, str):
474
  data_type = term.URIRef(data_type)
475
  if data_type in MLC_DATA_TYPES:
476
  data_type_index = MLC_DATA_TYPES.index(data_type)
 
 
 
 
477
  key = f"{prefix}-datatypes"
478
  col3.selectbox(
479
  needed_field("Data type"),
480
  index=data_type_index,
481
  options=STR_DATA_TYPES,
482
  key=key,
483
+ help=(
484
+ "The type of the data. `Text` corresponds to"
485
+ " https://schema.org/Text, etc."
486
+ ),
487
  on_change=handle_field_change,
488
  args=(FieldEvent.DATA_TYPE, field, key),
489
  )
490
  possible_sources = _get_possible_sources(metadata)
491
+ render_source(record_set, field, possible_sources)
492
+ render_references(record_set, field, possible_sources)
 
 
 
 
493
 
494
  st.divider()
495
 
views/source.py CHANGED
@@ -12,6 +12,15 @@ from events.fields import TransformType
12
  import mlcroissant as mlc
13
  from utils import needed_field
14
 
 
 
 
 
 
 
 
 
 
15
 
16
  class SourceType:
17
  """The type of the source (distribution or field)."""
@@ -105,10 +114,8 @@ def _handle_remove_reference(field):
105
 
106
 
107
  def render_source(
108
- record_set_key: int,
109
  record_set: RecordSet,
110
  field: Field,
111
- field_key: int,
112
  possible_sources: list[str],
113
  ):
114
  """Renders the form for the source."""
@@ -123,10 +130,13 @@ def render_source(
123
  index = None
124
  key = f"{prefix}-source"
125
  col1.selectbox(
126
- needed_field("Source"),
127
  index=index,
128
  options=options,
129
  key=key,
 
 
 
130
  on_change=handle_field_change,
131
  args=(FieldEvent.SOURCE, field, key),
132
  )
@@ -135,6 +145,7 @@ def render_source(
135
  needed_field("Extract"),
136
  index=_get_extract_index(source),
137
  key=f"{prefix}-extract",
 
138
  options=EXTRACT_TYPES,
139
  on_change=handle_field_change,
140
  args=(FieldEvent.SOURCE_EXTRACT, field, key),
@@ -145,6 +156,7 @@ def render_source(
145
  needed_field("Column name"),
146
  value=source.extract.column,
147
  key=key,
 
148
  on_change=handle_field_change,
149
  args=(FieldEvent.SOURCE_EXTRACT_COLUMN, field, key),
150
  )
@@ -154,6 +166,7 @@ def render_source(
154
  needed_field("JSON path"),
155
  value=source.extract.json_path,
156
  key=key,
 
157
  on_change=handle_field_change,
158
  args=(FieldEvent.SOURCE_EXTRACT_JSON_PATH, field, key),
159
  )
@@ -170,18 +183,23 @@ def render_source(
170
  key=key,
171
  options=TRANSFORM_TYPES,
172
  on_change=handle_field_change,
 
173
  args=(FieldEvent.TRANSFORM, field, key),
174
  kwargs={"number": number},
175
  )
176
  if selected == TransformType.FORMAT:
177
  key = f"{prefix}-{number}-transform-format"
178
  col3.text_input(
179
- needed_field("Format"),
180
  value=transform.format,
181
  key=key,
182
  on_change=handle_field_change,
 
 
 
 
183
  args=(selected, field, key),
184
- kwargs={"number": number, "type": "format"},
185
  )
186
  elif selected == TransformType.JSON_PATH:
187
  key = f"{prefix}-{number}-jsonpath"
@@ -190,8 +208,9 @@ def render_source(
190
  value=transform.json_path,
191
  key=key,
192
  on_change=handle_field_change,
 
193
  args=(selected, field, key),
194
- kwargs={"number": number, "type": "format"},
195
  )
196
  elif selected == TransformType.REGEX:
197
  key = f"{prefix}-{number}-regex"
@@ -200,8 +219,14 @@ def render_source(
200
  value=transform.regex,
201
  key=key,
202
  on_change=handle_field_change,
 
 
 
 
 
 
203
  args=(selected, field, key),
204
- kwargs={"number": number, "type": "format"},
205
  )
206
  elif selected == TransformType.REPLACE:
207
  key = f"{prefix}-{number}-replace"
@@ -210,8 +235,13 @@ def render_source(
210
  value=transform.replace,
211
  key=key,
212
  on_change=handle_field_change,
 
 
 
 
 
213
  args=(selected, field, key),
214
- kwargs={"number": number, "type": "format"},
215
  )
216
  elif selected == TransformType.SEPARATOR:
217
  key = f"{prefix}-{number}-separator"
@@ -220,8 +250,9 @@ def render_source(
220
  value=transform.separator,
221
  key=key,
222
  on_change=handle_field_change,
 
223
  args=(selected, field, key),
224
- kwargs={"number": number, "type": "format"},
225
  )
226
 
227
  def _handle_remove_transform(field, number):
@@ -230,6 +261,7 @@ def render_source(
230
  col4.button(
231
  "✖️",
232
  key=f"{prefix}-{number}-remove-transform",
 
233
  on_click=_handle_remove_transform,
234
  args=(field, number),
235
  )
@@ -243,16 +275,15 @@ def render_source(
243
  col1.button(
244
  "Add transform on data",
245
  key=f"{prefix}-close-fields",
 
246
  on_click=_handle_add_transform,
247
  args=(field,),
248
  )
249
 
250
 
251
  def render_references(
252
- record_set_key: int,
253
  record_set: RecordSet,
254
  field: Field,
255
- field_key: int,
256
  possible_sources: list[str],
257
  ):
258
  """Renders the form for references."""
@@ -286,6 +317,7 @@ def render_references(
286
  index=_get_extract_index(references),
287
  key=key,
288
  options=EXTRACT_TYPES,
 
289
  on_change=handle_field_change,
290
  args=(FieldEvent.REFERENCE_EXTRACT, field, key),
291
  )
@@ -295,6 +327,7 @@ def render_references(
295
  needed_field("Column name"),
296
  value=references.extract.column,
297
  key=key,
 
298
  on_change=handle_field_change,
299
  args=(FieldEvent.REFERENCE_EXTRACT_COLUMN, field, key),
300
  )
@@ -304,12 +337,14 @@ def render_references(
304
  needed_field("JSON path"),
305
  value=references.extract.json_path,
306
  key=key,
 
307
  on_change=handle_field_change,
308
  args=(FieldEvent.REFERENCE_EXTRACT_JSON_PATH, field, key),
309
  )
310
  col4.button(
311
  "✖️",
312
  key=f"{key}-remove-reference",
 
313
  on_click=_handle_remove_reference,
314
  args=(field,),
315
  )
 
12
  import mlcroissant as mlc
13
  from utils import needed_field
14
 
15
+ _JSON_PATH_DOCUMENTATION = (
16
+ "The JSON path if the data source is a JSON (see"
17
+ " [documentation](https://www.ietf.org/archive/id/draft-goessner-dispatch-jsonpath-00.html))."
18
+ )
19
+ _EXTRACT_DOCUMENTATION = (
20
+ "The extraction method to get the value of the field (column in a CSV, etc)."
21
+ )
22
+ _COLUMN_NAME_DOCUMENTATION = "The name of the column if the data source is a CSV."
23
+
24
 
25
  class SourceType:
26
  """The type of the source (distribution or field)."""
 
114
 
115
 
116
  def render_source(
 
117
  record_set: RecordSet,
118
  field: Field,
 
119
  possible_sources: list[str],
120
  ):
121
  """Renders the form for the source."""
 
130
  index = None
131
  key = f"{prefix}-source"
132
  col1.selectbox(
133
+ needed_field("Data source"),
134
  index=index,
135
  options=options,
136
  key=key,
137
+ help=(
138
+ "Data sources can be other resources (FileObject, FileSet) or other fields."
139
+ ),
140
  on_change=handle_field_change,
141
  args=(FieldEvent.SOURCE, field, key),
142
  )
 
145
  needed_field("Extract"),
146
  index=_get_extract_index(source),
147
  key=f"{prefix}-extract",
148
+ help=_EXTRACT_DOCUMENTATION,
149
  options=EXTRACT_TYPES,
150
  on_change=handle_field_change,
151
  args=(FieldEvent.SOURCE_EXTRACT, field, key),
 
156
  needed_field("Column name"),
157
  value=source.extract.column,
158
  key=key,
159
+ help=_COLUMN_NAME_DOCUMENTATION,
160
  on_change=handle_field_change,
161
  args=(FieldEvent.SOURCE_EXTRACT_COLUMN, field, key),
162
  )
 
166
  needed_field("JSON path"),
167
  value=source.extract.json_path,
168
  key=key,
169
+ help=_JSON_PATH_DOCUMENTATION,
170
  on_change=handle_field_change,
171
  args=(FieldEvent.SOURCE_EXTRACT_JSON_PATH, field, key),
172
  )
 
183
  key=key,
184
  options=TRANSFORM_TYPES,
185
  on_change=handle_field_change,
186
+ help="One or more transformations to apply after extracting the field.",
187
  args=(FieldEvent.TRANSFORM, field, key),
188
  kwargs={"number": number},
189
  )
190
  if selected == TransformType.FORMAT:
191
  key = f"{prefix}-{number}-transform-format"
192
  col3.text_input(
193
+ needed_field("Format a date"),
194
  value=transform.format,
195
  key=key,
196
  on_change=handle_field_change,
197
+ help=(
198
+ "For dates, use [`Python format"
199
+ " codes`](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)."
200
+ ),
201
  args=(selected, field, key),
202
+ kwargs={"number": number},
203
  )
204
  elif selected == TransformType.JSON_PATH:
205
  key = f"{prefix}-{number}-jsonpath"
 
208
  value=transform.json_path,
209
  key=key,
210
  on_change=handle_field_change,
211
+ help=_JSON_PATH_DOCUMENTATION,
212
  args=(selected, field, key),
213
+ kwargs={"number": number},
214
  )
215
  elif selected == TransformType.REGEX:
216
  key = f"{prefix}-{number}-regex"
 
219
  value=transform.regex,
220
  key=key,
221
  on_change=handle_field_change,
222
+ help=(
223
+ "A regular expression following [`re` Python"
224
+ " convention](https://docs.python.org/3/library/re.html#regular-expression-syntax)"
225
+ " with one capturing group. The result of the operation will be"
226
+ " the last captured group."
227
+ ),
228
  args=(selected, field, key),
229
+ kwargs={"number": number},
230
  )
231
  elif selected == TransformType.REPLACE:
232
  key = f"{prefix}-{number}-replace"
 
235
  value=transform.replace,
236
  key=key,
237
  on_change=handle_field_change,
238
+ help=(
239
+ "A replace pattern separated by a `/`, i.e."
240
+ " `string_to_replace/string_to_substitute` in order to replace"
241
+ " `string_to_replace` by `string_to_substitute`."
242
+ ),
243
  args=(selected, field, key),
244
+ kwargs={"number": number},
245
  )
246
  elif selected == TransformType.SEPARATOR:
247
  key = f"{prefix}-{number}-separator"
 
250
  value=transform.separator,
251
  key=key,
252
  on_change=handle_field_change,
253
+ help="A separator to split strings on, e.g. `|` to split `a|b|c`.",
254
  args=(selected, field, key),
255
+ kwargs={"number": number},
256
  )
257
 
258
  def _handle_remove_transform(field, number):
 
261
  col4.button(
262
  "✖️",
263
  key=f"{prefix}-{number}-remove-transform",
264
+ help="Remove the transformation.",
265
  on_click=_handle_remove_transform,
266
  args=(field, number),
267
  )
 
275
  col1.button(
276
  "Add transform on data",
277
  key=f"{prefix}-close-fields",
278
+ help="Add a transformation.",
279
  on_click=_handle_add_transform,
280
  args=(field,),
281
  )
282
 
283
 
284
  def render_references(
 
285
  record_set: RecordSet,
286
  field: Field,
 
287
  possible_sources: list[str],
288
  ):
289
  """Renders the form for references."""
 
317
  index=_get_extract_index(references),
318
  key=key,
319
  options=EXTRACT_TYPES,
320
+ help=_EXTRACT_DOCUMENTATION,
321
  on_change=handle_field_change,
322
  args=(FieldEvent.REFERENCE_EXTRACT, field, key),
323
  )
 
327
  needed_field("Column name"),
328
  value=references.extract.column,
329
  key=key,
330
+ help=_COLUMN_NAME_DOCUMENTATION,
331
  on_change=handle_field_change,
332
  args=(FieldEvent.REFERENCE_EXTRACT_COLUMN, field, key),
333
  )
 
337
  needed_field("JSON path"),
338
  value=references.extract.json_path,
339
  key=key,
340
+ help=_JSON_PATH_DOCUMENTATION,
341
  on_change=handle_field_change,
342
  args=(FieldEvent.REFERENCE_EXTRACT_JSON_PATH, field, key),
343
  )
344
  col4.button(
345
  "✖️",
346
  key=f"{key}-remove-reference",
347
+ help="Remove the join.",
348
  on_click=_handle_remove_reference,
349
  args=(field,),
350
  )
views/splash.py CHANGED
@@ -13,6 +13,8 @@ import mlcroissant as mlc
13
  from views.load import render_load
14
  from views.previous_files import render_previous_files
15
 
 
 
16
  _DATASETS = {
17
  "Titanic": ["data/embarkation_ports.csv", "data/genders.csv"],
18
  "FLORES-200": [],
@@ -34,8 +36,6 @@ def render_splash():
34
  )
35
  col1, col2 = st.columns([1, 1], gap="large")
36
  with col1:
37
- with st.expander("**Load an existing Croissant JSON-LD file**", expanded=True):
38
- render_load()
39
  with st.expander("**Create from scratch**", expanded=True):
40
 
41
  def create_new_croissant():
@@ -81,6 +81,36 @@ def render_splash():
81
  type="primary",
82
  args=(dataset,),
83
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  with col2:
85
  with st.expander("**Past projects**", expanded=True):
86
  render_previous_files()
 
13
  from views.load import render_load
14
  from views.previous_files import render_previous_files
15
 
16
+ _HUGGING_FACE_URL = "https://huggingface.co/datasets/"
17
+
18
  _DATASETS = {
19
  "Titanic": ["data/embarkation_ports.csv", "data/genders.csv"],
20
  "FLORES-200": [],
 
36
  )
37
  col1, col2 = st.columns([1, 1], gap="large")
38
  with col1:
 
 
39
  with st.expander("**Create from scratch**", expanded=True):
40
 
41
  def create_new_croissant():
 
81
  type="primary",
82
  args=(dataset,),
83
  )
84
+ with st.expander("**Load a dataset from Hugging Face**", expanded=True):
85
+ url = st.text_input(
86
+ label="Hugging Face URL",
87
+ )
88
+ if url.startswith(_HUGGING_FACE_URL):
89
+ url = url.replace(_HUGGING_FACE_URL, "")
90
+
91
def download_huggingface_json(name: str):
    """Download the Croissant JSON-LD of a Hugging Face dataset and open it.

    Used as a button `on_click` callback. On success the parsed metadata is
    stored in `st.session_state[Metadata]` and the current project is saved.
    Every failure is reported with `st.error` instead of being raised, so
    that an exception never escapes the callback.

    Args:
        name: The dataset identifier, i.e. the part of the Hugging Face URL
            that follows `https://huggingface.co/datasets/`.
    """
    api_url = "https://datasets-server.huggingface.co/croissant"
    try:
        # `params` lets requests URL-encode the dataset name; the timeout
        # keeps the callback from hanging forever on an unresponsive server.
        response = requests.get(api_url, params={"dataset": name}, timeout=30)
        response.raise_for_status()
        croissant_json = response.json()
    except (requests.RequestException, ValueError) as error:
        # RequestException: connection problems, timeouts and HTTP errors.
        # ValueError: the response body is not valid JSON.
        st.error(f'Could not download the Croissant file for "{name}": {error}')
        return
    try:
        metadata = mlc.Metadata.from_json(mlc.Issues(), croissant_json, None)
        st.session_state[Metadata] = Metadata.from_canonical(metadata)
        save_current_project()
    except Exception:
        # Deliberately broad: this is a UI boundary, and any failure while
        # parsing or saving is surfaced to the user rather than crashing.
        st.error(f"Malformed JSON: {croissant_json}")
100
+
101
+ st.button(
102
+ f'Download "{url}"',
103
+ on_click=download_huggingface_json,
104
+ type="primary",
105
+ args=(url,),
106
+ )
107
+ elif url:
108
+ st.error(
109
+ f"Unknown URL {url}. Hugging Face URLS should look like"
110
+ f" {_HUGGING_FACE_URL}somedataset."
111
+ )
112
+ with st.expander("**Load an existing Croissant JSON-LD file**", expanded=True):
113
+ render_load()
114
  with col2:
115
  with st.expander("**Past projects**", expanded=True):
116
  render_previous_files()