marcenacp committed on
Commit
db55b72
1 Parent(s): 7994e4b

Deploy (see actual commits on https://github.com/mlcommons/croissant).

Browse files
app.py CHANGED
@@ -2,6 +2,7 @@ import urllib.parse
2
 
3
  import streamlit as st
4
 
 
5
  from core.constants import OAUTH_CLIENT_ID
6
  from core.constants import OAUTH_STATE
7
  from core.constants import REDIRECT_URI
@@ -14,8 +15,6 @@ from views.splash import render_splash
14
  from views.wizard import render_editor
15
 
16
  st.set_page_config(page_title="Croissant Editor", page_icon="🥐", layout="wide")
17
- col1, col2, col3 = st.columns([10, 1, 1])
18
- col1.header("Croissant Editor")
19
 
20
  init_state()
21
 
@@ -42,6 +41,7 @@ if OAUTH_CLIENT_ID and not user:
42
  state = urllib.parse.quote(OAUTH_STATE, safe="")
43
  scope = urllib.parse.quote("openid profile", safe="")
44
  url = f"https://huggingface.co/oauth/authorize?response_type=code&redirect_uri={redirect_uri}&scope={scope}&client_id={client_id}&state={state}"
 
45
  st.link_button("🤗 Login with Hugging Face", url)
46
  st.stop()
47
 
@@ -59,16 +59,33 @@ def _logout():
59
  _back_to_menu()
60
 
61
 
62
- if OAUTH_CLIENT_ID:
63
- col2.write("\n") # Vertical box to shift the logout menu
64
- col2.button("Log out", on_click=_logout)
65
-
66
  timestamp = get_project_timestamp()
67
 
 
 
 
 
68
  if timestamp:
69
- col3.write("\n") # Vertical box to shift the button menu
70
- col3.button("Menu", on_click=_back_to_menu)
71
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  should_display_editor = bool(st.session_state.get(CurrentProject))
74
 
 
2
 
3
  import streamlit as st
4
 
5
+ from components.flex import st_flex
6
  from core.constants import OAUTH_CLIENT_ID
7
  from core.constants import OAUTH_STATE
8
  from core.constants import REDIRECT_URI
 
15
  from views.wizard import render_editor
16
 
17
  st.set_page_config(page_title="Croissant Editor", page_icon="🥐", layout="wide")
 
 
18
 
19
  init_state()
20
 
 
41
  state = urllib.parse.quote(OAUTH_STATE, safe="")
42
  scope = urllib.parse.quote("openid profile", safe="")
43
  url = f"https://huggingface.co/oauth/authorize?response_type=code&redirect_uri={redirect_uri}&scope={scope}&client_id={client_id}&state={state}"
44
+ st.header("Croissant Editor")
45
  st.link_button("🤗 Login with Hugging Face", url)
46
  st.stop()
47
 
 
59
  _back_to_menu()
60
 
61
 
 
 
 
 
62
  timestamp = get_project_timestamp()
63
 
64
+ button_width = 73 # This is the best value for the current content of the buttons.
65
+ buttons_widths = []
66
+ if OAUTH_CLIENT_ID:
67
+ buttons_widths.append(button_width)
68
  if timestamp:
69
+ buttons_widths.append(button_width)
70
+ widths = [200, sum(buttons_widths) + 10] # 10 being the space between elements.
71
+
72
+ with st_flex(
73
+ flex_direction="row",
74
+ justify_content="space-between",
75
+ align_items="center",
76
+ widths=widths,
77
+ ):
78
+ st.header("Croissant Editor")
79
+ if OAUTH_CLIENT_ID or timestamp:
80
+ with st_flex(
81
+ flex_direction="row",
82
+ justify_content="space-between",
83
+ widths=buttons_widths,
84
+ ):
85
+ if OAUTH_CLIENT_ID:
86
+ st.button("Log out", on_click=_logout)
87
+ if timestamp:
88
+ st.button("Home", on_click=_back_to_menu)
89
 
90
  should_display_editor = bool(st.session_state.get(CurrentProject))
91
 
components/flex/__init__.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contextlib
2
+
3
+ import streamlit as st
4
+ import streamlit.components.v1 as components
5
+
6
+
7
@contextlib.contextmanager
def st_flex(
    flex_direction="null",
    justify_content="null",
    align_items="null",
    flex="null",
    widths=None,
):
    """[flex](https://developer.mozilla.org/en-US/docs/Web/CSS/flex) for Streamlit.

    Warning: This custom component uses a lot of heuristics. But styling flex is
    important in CSS and missing from Streamlit. st.columns does a poor job at
    horizontally aligning elements.

    Usage: `with st_flex(...):` and create the Streamlit elements to lay out
    inside the `with` body. When the body finishes, a hidden HTML component is
    emitted whose script restyles the DOM node wrapping those elements.

    Args:
        flex_direction: https://developer.mozilla.org/en-US/docs/Web/CSS/flex-direction
        justify_content:
            https://developer.mozilla.org/en-US/docs/Web/CSS/justify-content
        align_items: https://developer.mozilla.org/en-US/docs/Web/CSS/align-items
        flex: https://developer.mozilla.org/en-US/docs/Web/CSS/flex
        widths: An array containing the minimal widths of all elements. This somewhat
            defeats the purpose of flex, but Streamlit forces the width of elements,
            which is why forcing this parameter is unfortunately needed. Children
            without an entry (or when `widths` is None) fall back to 60px.

    Yields:
        None. Control returns to the caller's `with` body.
    """
    # Local import so this block does not depend on a file-level import.
    import json

    # Serialize to a JavaScript literal. Interpolating the Python repr broke the
    # script for the default `widths=None` (`None?.[i]` is a ReferenceError in
    # JS, which aborted the sizing loop) and for tuples (`(a, b)` is a comma
    # expression in JS). `json.dumps` yields `null` / `[a, b]` instead.
    widths_js = json.dumps(None if widths is None else list(widths))

    # NOTE(review): nesting reconstructed from the script's DOM walk below: the
    # HTML component must be a sibling of the inner container, inside the outer.
    placeholder = st.empty()
    with placeholder.container():
        placeholder = st.empty()
        with placeholder.container():
            yield
        # NOTE(review): `container.width = ''` below sets a DOM property, not
        # `container.style.width`; left as-is -- confirm which was intended.
        components.html(
            f"""
<script>
  window.frameElement.style.display = 'none';
  // Get the current script node
  const frameElement = window.frameElement.parentElement;

  // Get the parent element
  const parentElement = frameElement.parentElement;

  // Change container
  const container = parentElement.firstChild.firstChild;
  container.style.display = 'flex';
  container.style.flexDirection = '{flex_direction}';
  container.style.justifyContent = '{justify_content}';
  container.style.flex = '{flex}';
  container.style.alignItems = '{align_items}';
  container.width = '';
  container.className = '';

  // Change children
  let i = 0;
  for (const child of container.children) {{
    child.style.width = `${{{widths_js}?.[i] || 60}}px`;
    child.className = '';
    i += 1;
  }}
</script>""",
        )
core/state.py CHANGED
@@ -183,10 +183,8 @@ class Metadata:
183
  name: str = ""
184
  description: str | None = None
185
  citation: str | None = None
186
- creator: mlc.PersonOrOrganization | None = None
187
  data_biases: str | None = None
188
  data_collection: str | None = None
189
- date_published: datetime.datetime | None = None
190
  license: str | None = ""
191
  personal_sensitive_information: str | None = None
192
  url: str = ""
 
183
  name: str = ""
184
  description: str | None = None
185
  citation: str | None = None
 
186
  data_biases: str | None = None
187
  data_collection: str | None = None
 
188
  license: str | None = ""
189
  personal_sensitive_information: str | None = None
190
  url: str = ""
events/metadata.py CHANGED
@@ -1,11 +1,9 @@
1
- import datetime
2
  import enum
3
 
4
  import streamlit as st
5
 
6
  from core.names import find_unique_name
7
  from core.state import Metadata
8
- import mlcroissant as mlc
9
 
10
  # List from:
11
  LICENSES_URL = "https://huggingface.co/docs/hub/repositories-licenses"
@@ -93,7 +91,6 @@ class MetadataEvent(enum.Enum):
93
 
94
  NAME = "NAME"
95
  DESCRIPTION = "DESCRIPTION"
96
- DATE_PUBLISHED = "DATE_PUBLISHED"
97
  URL = "URL"
98
  LICENSE = "LICENSE"
99
  CITATION = "CITATION"
@@ -101,10 +98,6 @@ class MetadataEvent(enum.Enum):
101
  DATA_BIASES = "DATA_BIASES"
102
  DATA_COLLECTION = "DATA_COLLECTION"
103
  PERSONAL_SENSITIVE_INFORMATION = "PERSONAL_SENSITIVE_INFORMATION"
104
- CREATOR_ADD = "CREATOR_ADD"
105
- CREATOR_NAME = "CREATOR_NAME"
106
- CREATOR_URL = "CREATOR_URL"
107
- CREATOR_REMOVE = "CREATOR_REMOVE"
108
 
109
 
110
  def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
@@ -126,18 +119,3 @@ def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
126
  metadata.data_collection = st.session_state[key]
127
  elif event == MetadataEvent.PERSONAL_SENSITIVE_INFORMATION:
128
  metadata.personal_sensitive_information = st.session_state[key]
129
- elif event == MetadataEvent.DATE_PUBLISHED:
130
- date = st.session_state[key]
131
- metadata.date_published = datetime.datetime(date.year, date.month, date.day)
132
- elif event == MetadataEvent.CREATOR_ADD:
133
- metadata.creator = mlc.PersonOrOrganization()
134
- elif event == MetadataEvent.CREATOR_REMOVE:
135
- metadata.creator = None
136
- elif event == MetadataEvent.CREATOR_NAME:
137
- if not metadata.creator:
138
- metadata.creator = mlc.PersonOrOrganization()
139
- metadata.creator.name = st.session_state[key]
140
- elif event == MetadataEvent.CREATOR_URL:
141
- if not metadata.creator:
142
- metadata.creator = mlc.PersonOrOrganization()
143
- metadata.creator.url = st.session_state[key]
 
 
1
  import enum
2
 
3
  import streamlit as st
4
 
5
  from core.names import find_unique_name
6
  from core.state import Metadata
 
7
 
8
  # List from:
9
  LICENSES_URL = "https://huggingface.co/docs/hub/repositories-licenses"
 
91
 
92
  NAME = "NAME"
93
  DESCRIPTION = "DESCRIPTION"
 
94
  URL = "URL"
95
  LICENSE = "LICENSE"
96
  CITATION = "CITATION"
 
98
  DATA_BIASES = "DATA_BIASES"
99
  DATA_COLLECTION = "DATA_COLLECTION"
100
  PERSONAL_SENSITIVE_INFORMATION = "PERSONAL_SENSITIVE_INFORMATION"
 
 
 
 
101
 
102
 
103
  def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
 
119
  metadata.data_collection = st.session_state[key]
120
  elif event == MetadataEvent.PERSONAL_SENSITIVE_INFORMATION:
121
  metadata.personal_sensitive_information = st.session_state[key]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  etils[epath]
2
- mlcroissant==0.0.5
3
  numpy
4
  pandas
5
  pytest
 
1
  etils[epath]
2
+ mlcroissant
3
  numpy
4
  pandas
5
  pytest
views/load.py CHANGED
@@ -8,6 +8,17 @@ from core.past_projects import save_current_project
8
  from core.state import Metadata
9
  import mlcroissant as mlc
10
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  def _on_file_upload(key):
13
  """Triggers when a new file gets uploaded to load the Croissant metadata."""
@@ -29,6 +40,7 @@ def _on_file_upload(key):
29
 
30
  def render_load():
31
  key = "json-ld-file-upload"
 
32
  st.file_uploader(
33
  "Drop a JSON-LD", type="json", key=key, on_change=_on_file_upload, args=(key,)
34
  )
 
8
  from core.state import Metadata
9
  import mlcroissant as mlc
10
 
11
# Markdown help text listing where Croissant JSON-LD files can be obtained.
# `${dataset_id}` is a placeholder for the Hugging Face dataset identifier; the
# closing brace was missing, which ended the Markdown link destination early.
_JSON_LD_INFO = """You can download JSON-LD Croissant files from major dataset
providers:

- [Kaggle](https://www.kaggle.com/datasets) embeds Croissant JSON-LD directly in their
HTML.
- [OpenML](https://www.openml.org/search?type=data) offers a 🥐 button on all of their
datasets.
- [Hugging Face](https://huggingface.co/) offers an
[API endpoint](https://datasets-server.huggingface.co/croissant?dataset=${dataset_id}) to
build a Croissant JSON-LD."""
21
+
22
 
23
  def _on_file_upload(key):
24
  """Triggers when a new file gets uploaded to load the Croissant metadata."""
 
40
 
41
  def render_load():
42
  key = "json-ld-file-upload"
43
+ st.info(_JSON_LD_INFO)
44
  st.file_uploader(
45
  "Drop a JSON-LD", type="json", key=key, on_change=_on_file_upload, args=(key,)
46
  )
views/metadata.py CHANGED
@@ -106,47 +106,3 @@ def _render_generic_metadata(metadata: Metadata):
106
  on_change=handle_metadata_change,
107
  args=(MetadataEvent.CITATION, metadata, key),
108
  )
109
- key = "metadata-date-published"
110
- st.date_input(
111
- label="Date of first broadcast/publication.",
112
- key=key,
113
- value=metadata.date_published,
114
- on_change=handle_metadata_change,
115
- args=(MetadataEvent.DATE_PUBLISHED, metadata, key),
116
- )
117
- if metadata.creator:
118
- col1, col2, col3 = st.columns([1, 1, 1])
119
- key = "metadata-creator-name"
120
- col1.text_input(
121
- label="Creator name",
122
- key=key,
123
- value=metadata.creator.name,
124
- on_change=handle_metadata_change,
125
- placeholder="A person or an organization",
126
- args=(MetadataEvent.CREATOR_NAME, metadata, key),
127
- )
128
- key = "metadata-creator-url"
129
- col2.text_input(
130
- label="Creator URL",
131
- key=key,
132
- value=metadata.creator.url,
133
- placeholder="https://mlcommons.org",
134
- on_change=handle_metadata_change,
135
- args=(MetadataEvent.CREATOR_URL, metadata, key),
136
- )
137
- key = "metadata-creator-remove"
138
- col3.button(
139
- "✖️",
140
- key=key,
141
- help="Remove the creator",
142
- on_click=handle_metadata_change,
143
- args=(MetadataEvent.CREATOR_REMOVE, metadata, key),
144
- )
145
- else:
146
- key = "metadata-add-creator"
147
- st.button(
148
- label="✚ Add a creator",
149
- key=key,
150
- on_click=handle_metadata_change,
151
- args=(MetadataEvent.CREATOR_ADD, metadata, key),
152
- )
 
106
  on_change=handle_metadata_change,
107
  args=(MetadataEvent.CITATION, metadata, key),
108
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
views/overview.py CHANGED
@@ -48,7 +48,7 @@ def render_overview():
48
  col1, col2 = st.columns([1, 1], gap="medium")
49
  with col1:
50
  key = "metadata-name"
51
- name = st.text_input(
52
  label=needed_field("Name"),
53
  key=key,
54
  value=metadata.name,
@@ -57,8 +57,6 @@ def render_overview():
57
  on_change=handle_metadata_change,
58
  args=(MetadataEvent.NAME, metadata, key),
59
  )
60
- if not name:
61
- st.stop()
62
  key = "metadata-description"
63
  st.text_area(
64
  label="Description",
 
48
  col1, col2 = st.columns([1, 1], gap="medium")
49
  with col1:
50
  key = "metadata-name"
51
+ st.text_input(
52
  label=needed_field("Name"),
53
  key=key,
54
  value=metadata.name,
 
57
  on_change=handle_metadata_change,
58
  args=(MetadataEvent.NAME, metadata, key),
59
  )
 
 
60
  key = "metadata-description"
61
  st.text_area(
62
  label="Description",
views/record_sets.py CHANGED
@@ -375,7 +375,7 @@ def _render_left_panel():
375
  result: _Result = _generate_data_with_timeout(record_set)
376
  df, exception = result.get("df"), result.get("exception")
377
  if exception is None and df is not None and not df.empty:
378
- st.markdown("Previsualize the data:")
379
  st.dataframe(df, use_container_width=True)
380
  # The generation is not triggered if record_set has in-line `data`.
381
  elif not record_set.data:
 
375
  result: _Result = _generate_data_with_timeout(record_set)
376
  df, exception = result.get("df"), result.get("exception")
377
  if exception is None and df is not None and not df.empty:
378
+ st.markdown("Preview the data:")
379
  st.dataframe(df, use_container_width=True)
380
  # The generation is not triggered if record_set has in-line `data`.
381
  elif not record_set.data:
views/splash.py CHANGED
@@ -37,7 +37,7 @@ an existing Croissant JSON-LD file. Finally, you can also select any of your
37
  past projects from the list.
38
 
39
  You can change the project you are currently editing at any time by clicking
40
- the Menu button and then choosing one of the options on this page."""
41
 
42
 
43
  def render_splash():
@@ -96,28 +96,6 @@ def render_splash():
96
  type="primary",
97
  args=(dataset,),
98
  )
99
- url = st.text_input(
100
- label="Hugging Face dataset",
101
- placeholder="Example: https://huggingface.co/datasets/mnist",
102
- )
103
- if url.startswith(_HUGGING_FACE_URL):
104
- name = url.replace(_HUGGING_FACE_URL, "")
105
- api_url = (
106
- f"https://datasets-server.huggingface.co/croissant?dataset={name}"
107
- )
108
- json = requests.get(api_url, headers=None).json()
109
- try:
110
- metadata = mlc.Metadata.from_json(mlc.Issues(), json, None)
111
- st.session_state[Metadata] = Metadata.from_canonical(metadata)
112
- save_current_project()
113
- st.rerun()
114
- except Exception:
115
- st.error(f"Malformed JSON: {json}")
116
- elif url:
117
- st.error(
118
- f"Unknown URL {url}. Hugging Face URLS should look like"
119
- f" {_HUGGING_FACE_URL}somedataset."
120
- )
121
  render_load()
122
  with col2:
123
  with st.expander("**Recent projects**", expanded=True):
 
37
  past projects from the list.
38
 
39
  You can change the project you are currently editing at any time by clicking
40
+ the Home button and then choosing one of the options on this page."""
41
 
42
 
43
  def render_splash():
 
96
  type="primary",
97
  args=(dataset,),
98
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  render_load()
100
  with col2:
101
  with st.expander("**Recent projects**", expanded=True):