marcenacp committed on
Commit
1b94fec
1 Parent(s): 28bd84e

Deploy (see actual commits on https://github.com/mlcommons/croissant).

Browse files
app.py CHANGED
@@ -21,10 +21,10 @@ init_state()
21
  user = get_user()
22
 
23
  if OAUTH_CLIENT_ID and not user:
24
- query_params = st.experimental_get_query_params()
25
- state = query_params.get("state")
26
  if state and state[0] == OAUTH_STATE:
27
- code = query_params.get("code")
28
  if not code:
29
  st.stop()
30
  try:
@@ -34,7 +34,7 @@ if OAUTH_CLIENT_ID and not user:
34
  except:
35
  raise
36
  finally:
37
- st.experimental_set_query_params()
38
  else:
39
  redirect_uri = urllib.parse.quote(REDIRECT_URI, safe="")
40
  client_id = urllib.parse.quote(OAUTH_CLIENT_ID, safe="")
@@ -48,7 +48,7 @@ if OAUTH_CLIENT_ID and not user:
48
 
49
  def _back_to_menu():
50
  """Sends the user back to the menu."""
51
- st.experimental_set_query_params()
52
  init_state(force=True)
53
 
54
 
 
21
  user = get_user()
22
 
23
  if OAUTH_CLIENT_ID and not user:
24
+ query_params = st.query_params
25
+ state = query_params.get_all("state")
26
  if state and state[0] == OAUTH_STATE:
27
+ code = query_params["code"]
28
  if not code:
29
  st.stop()
30
  try:
 
34
  except:
35
  raise
36
  finally:
37
+ st.query_params.clear()
38
  else:
39
  redirect_uri = urllib.parse.quote(REDIRECT_URI, safe="")
40
  client_id = urllib.parse.quote(OAUTH_CLIENT_ID, safe="")
 
48
 
49
  def _back_to_menu():
50
  """Sends the user back to the menu."""
51
+ st.query_params.clear()
52
  init_state(force=True)
53
 
54
 
core/query_params.py CHANGED
@@ -15,28 +15,24 @@ class QueryParams:
15
  OPEN_RECORD_SET = "recordSet"
16
 
17
 
18
- def _get_query_param(params: dict[str, Any], name: str) -> str | None:
19
  """Gets query param with the name `name`."""
20
- if name in params:
21
- param = params[name]
22
- if isinstance(param, list) and len(param) > 0:
23
- return param[0]
24
  return None
25
 
26
 
27
  def _set_query_param(param: str, new_value: str) -> str | None:
28
- params = st.experimental_get_query_params()
29
- if params.get(param) == [new_value]:
30
  # The value already exists in the query params.
31
  return
32
- new_params = {k: v for k, v in params.items() if k != param}
33
- new_params[param] = new_value
34
- st.experimental_set_query_params(**new_params)
35
 
36
 
37
  def is_record_set_expanded(record_set: RecordSet) -> bool:
38
- params = st.experimental_get_query_params()
39
- open_record_set_name = _get_query_param(params, QueryParams.OPEN_RECORD_SET)
40
  if open_record_set_name:
41
  return open_record_set_name == record_set.name
42
  return False
@@ -47,8 +43,7 @@ def expand_record_set(record_set: RecordSet) -> None:
47
 
48
 
49
  def get_project_timestamp() -> str | None:
50
- params = st.experimental_get_query_params()
51
- return _get_query_param(params, QueryParams.OPEN_PROJECT)
52
 
53
 
54
  def set_project(project: CurrentProject):
 
15
  OPEN_RECORD_SET = "recordSet"
16
 
17
 
18
+ def _get_query_param(name: str) -> str | None:
19
  """Gets query param with the name `name`."""
20
+ param = st.query_params.get_all(name)
21
+ if isinstance(param, list) and len(param) > 0:
22
+ return param[0]
 
23
  return None
24
 
25
 
26
  def _set_query_param(param: str, new_value: str) -> str | None:
27
+ params = st.query_params
28
+ if params.get_all(param) == [new_value]:
29
  # The value already exists in the query params.
30
  return
31
+ params[param] = new_value
 
 
32
 
33
 
34
  def is_record_set_expanded(record_set: RecordSet) -> bool:
35
+ open_record_set_name = _get_query_param(QueryParams.OPEN_RECORD_SET)
 
36
  if open_record_set_name:
37
  return open_record_set_name == record_set.name
38
  return False
 
43
 
44
 
45
  def get_project_timestamp() -> str | None:
46
+ return _get_query_param(QueryParams.OPEN_PROJECT)
 
47
 
48
 
49
  def set_project(project: CurrentProject):
core/state.py CHANGED
@@ -127,6 +127,7 @@ class SelectedRecordSet:
127
  class FileObject:
128
  """FileObject analogue for editor"""
129
 
 
130
  name: str | None = None
131
  description: str | None = None
132
  contained_in: list[str] | None = dataclasses.field(default_factory=list)
@@ -135,7 +136,6 @@ class FileObject:
135
  encoding_format: str | None = None
136
  sha256: str | None = None
137
  df: pd.DataFrame | None = None
138
- rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
139
  folder: epath.PathLike | None = None
140
 
141
 
@@ -143,23 +143,23 @@ class FileObject:
143
  class FileSet:
144
  """FileSet analogue for editor"""
145
 
 
146
  contained_in: list[str] = dataclasses.field(default_factory=list)
147
  description: str | None = None
148
  encoding_format: str | None = ""
149
  includes: str | None = ""
150
  name: str = ""
151
- rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
152
 
153
 
154
  @dataclasses.dataclass
155
  class Field:
156
  """Field analogue for editor"""
157
 
 
158
  name: str | None = None
159
  description: str | None = None
160
  data_types: str | list[str] | None = None
161
  source: mlc.Source | None = None
162
- rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
163
  references: mlc.Source | None = None
164
 
165
 
@@ -167,13 +167,13 @@ class Field:
167
  class RecordSet:
168
  """Record Set analogue for editor"""
169
 
 
170
  name: str = ""
171
  data: list[Any] | None = None
172
  description: str | None = None
173
  is_enumeration: bool | None = None
174
  key: str | list[str] | None = None
175
  fields: list[Field] = dataclasses.field(default_factory=list)
176
- rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
177
 
178
 
179
  @dataclasses.dataclass
@@ -182,9 +182,10 @@ class Metadata:
182
 
183
  name: str = ""
184
  description: str | None = None
185
- citation: str | None = None
186
- conforms_to: str | None = None
187
  creators: list[mlc.PersonOrOrganization] = dataclasses.field(default_factory=list)
 
188
  data_biases: str | None = None
189
  data_collection: str | None = None
190
  date_published: datetime.datetime | None = None
@@ -193,7 +194,6 @@ class Metadata:
193
  url: str = ""
194
  distribution: list[FileObject | FileSet] = dataclasses.field(default_factory=list)
195
  record_sets: list[RecordSet] = dataclasses.field(default_factory=list)
196
- rdf: mlc.Rdf = dataclasses.field(default_factory=mlc.Rdf)
197
  version: str | None = None
198
 
199
  def __bool__(self):
@@ -295,17 +295,20 @@ class Metadata:
295
 
296
  def to_canonical(self) -> mlc.Metadata:
297
  distribution = []
 
298
  for file in self.distribution:
299
  if isinstance(file, FileObject):
300
- distribution.append(create_class(mlc.FileObject, file))
301
  elif isinstance(file, FileSet):
302
- distribution.append(create_class(mlc.FileSet, file))
303
  record_sets = []
304
  for record_set in self.record_sets:
305
  fields = []
306
  for field in record_set.fields:
307
- fields.append(create_class(mlc.Field, field))
308
- record_sets.append(create_class(mlc.RecordSet, record_set, fields=fields))
 
 
309
  return create_class(
310
  mlc.Metadata,
311
  self,
 
127
  class FileObject:
128
  """FileObject analogue for editor"""
129
 
130
+ ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
131
  name: str | None = None
132
  description: str | None = None
133
  contained_in: list[str] | None = dataclasses.field(default_factory=list)
 
136
  encoding_format: str | None = None
137
  sha256: str | None = None
138
  df: pd.DataFrame | None = None
 
139
  folder: epath.PathLike | None = None
140
 
141
 
 
143
  class FileSet:
144
  """FileSet analogue for editor"""
145
 
146
+ ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
147
  contained_in: list[str] = dataclasses.field(default_factory=list)
148
  description: str | None = None
149
  encoding_format: str | None = ""
150
  includes: str | None = ""
151
  name: str = ""
 
152
 
153
 
154
  @dataclasses.dataclass
155
  class Field:
156
  """Field analogue for editor"""
157
 
158
+ ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
159
  name: str | None = None
160
  description: str | None = None
161
  data_types: str | list[str] | None = None
162
  source: mlc.Source | None = None
 
163
  references: mlc.Source | None = None
164
 
165
 
 
167
  class RecordSet:
168
  """Record Set analogue for editor"""
169
 
170
+ ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
171
  name: str = ""
172
  data: list[Any] | None = None
173
  description: str | None = None
174
  is_enumeration: bool | None = None
175
  key: str | list[str] | None = None
176
  fields: list[Field] = dataclasses.field(default_factory=list)
 
177
 
178
 
179
  @dataclasses.dataclass
 
182
 
183
  name: str = ""
184
  description: str | None = None
185
+ cite_as: str | None = None
186
+ context: dict = dataclasses.field(default_factory=dict)
187
  creators: list[mlc.PersonOrOrganization] = dataclasses.field(default_factory=list)
188
+ ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
189
  data_biases: str | None = None
190
  data_collection: str | None = None
191
  date_published: datetime.datetime | None = None
 
194
  url: str = ""
195
  distribution: list[FileObject | FileSet] = dataclasses.field(default_factory=list)
196
  record_sets: list[RecordSet] = dataclasses.field(default_factory=list)
 
197
  version: str | None = None
198
 
199
  def __bool__(self):
 
295
 
296
  def to_canonical(self) -> mlc.Metadata:
297
  distribution = []
298
+ ctx = self.ctx
299
  for file in self.distribution:
300
  if isinstance(file, FileObject):
301
+ distribution.append(create_class(mlc.FileObject, file, ctx=ctx))
302
  elif isinstance(file, FileSet):
303
+ distribution.append(create_class(mlc.FileSet, file, ctx=ctx))
304
  record_sets = []
305
  for record_set in self.record_sets:
306
  fields = []
307
  for field in record_set.fields:
308
+ fields.append(create_class(mlc.Field, field, ctx=ctx))
309
+ record_sets.append(
310
+ create_class(mlc.RecordSet, record_set, ctx=ctx, fields=fields)
311
+ )
312
  return create_class(
313
  mlc.Metadata,
314
  self,
cypress.config.js CHANGED
@@ -3,6 +3,6 @@ const { defineConfig } = require("cypress");
3
  module.exports = defineConfig({
4
  // To access content within Streamlit iframes for custom components:
5
  chromeWebSecurity: false,
6
- defaultCommandTimeout: 10000,
7
  e2e: {},
8
  });
 
3
  module.exports = defineConfig({
4
  // To access content within Streamlit iframes for custom components:
5
  chromeWebSecurity: false,
6
+ defaultCommandTimeout: 20000,
7
  e2e: {},
8
  });
events/metadata.py CHANGED
@@ -92,12 +92,11 @@ class MetadataEvent(enum.Enum):
92
  """Event that triggers a metadata change."""
93
 
94
  NAME = "NAME"
95
- CONFORMS_TO = "CONFORMS_TO"
96
  DESCRIPTION = "DESCRIPTION"
97
  DATE_PUBLISHED = "DATE_PUBLISHED"
98
  URL = "URL"
99
  LICENSE = "LICENSE"
100
- CITATION = "CITATION"
101
  VERSION = "VERSION"
102
  DATA_BIASES = "DATA_BIASES"
103
  DATA_COLLECTION = "DATA_COLLECTION"
@@ -111,14 +110,12 @@ class MetadataEvent(enum.Enum):
111
  def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
112
  if event == MetadataEvent.NAME:
113
  metadata.name = find_unique_name(set(), st.session_state[key])
114
- if event == MetadataEvent.CONFORMS_TO:
115
- metadata.conforms_to = st.session_state[key]
116
  elif event == MetadataEvent.DESCRIPTION:
117
  metadata.description = st.session_state[key]
118
  elif event == MetadataEvent.LICENSE:
119
  metadata.license = LICENSES.get(st.session_state[key])
120
- elif event == MetadataEvent.CITATION:
121
- metadata.citation = st.session_state[key]
122
  elif event == MetadataEvent.URL:
123
  metadata.url = st.session_state[key]
124
  elif event == MetadataEvent.VERSION:
 
92
  """Event that triggers a metadata change."""
93
 
94
  NAME = "NAME"
 
95
  DESCRIPTION = "DESCRIPTION"
96
  DATE_PUBLISHED = "DATE_PUBLISHED"
97
  URL = "URL"
98
  LICENSE = "LICENSE"
99
+ CITE_AS = "CITE_AS"
100
  VERSION = "VERSION"
101
  DATA_BIASES = "DATA_BIASES"
102
  DATA_COLLECTION = "DATA_COLLECTION"
 
110
  def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
111
  if event == MetadataEvent.NAME:
112
  metadata.name = find_unique_name(set(), st.session_state[key])
 
 
113
  elif event == MetadataEvent.DESCRIPTION:
114
  metadata.description = st.session_state[key]
115
  elif event == MetadataEvent.LICENSE:
116
  metadata.license = LICENSES.get(st.session_state[key])
117
+ elif event == MetadataEvent.CITE_AS:
118
+ metadata.cite_as = st.session_state[key]
119
  elif event == MetadataEvent.URL:
120
  metadata.url = st.session_state[key]
121
  elif event == MetadataEvent.VERSION:
events/resources.py CHANGED
@@ -72,6 +72,6 @@ def _create_instance1_from_instance2(instance1: Resource, instance2: type):
72
  attributes1 = set((field.name for field in dataclasses.fields(instance1)))
73
  attributes2 = set((field.name for field in dataclasses.fields(instance2)))
74
  common_attributes = attributes2.intersection(attributes1)
75
- return instance2(**{
76
- attribute: getattr(instance1, attribute) for attribute in common_attributes
77
- })
 
72
  attributes1 = set((field.name for field in dataclasses.fields(instance1)))
73
  attributes2 = set((field.name for field in dataclasses.fields(instance2)))
74
  common_attributes = attributes2.intersection(attributes1)
75
+ return instance2(
76
+ **{attribute: getattr(instance1, attribute) for attribute in common_attributes}
77
+ )
views/jsonld.py CHANGED
@@ -47,7 +47,7 @@ def render_jsonld():
47
  if croissant.metadata:
48
  metadata = mlc.Metadata(
49
  name=croissant.metadata.name,
50
- citation=croissant.metadata.citation,
51
  license=croissant.metadata.license,
52
  description=croissant.metadata.description,
53
  url=croissant.metadata.url,
 
47
  if croissant.metadata:
48
  metadata = mlc.Metadata(
49
  name=croissant.metadata.name,
50
+ cite_as=croissant.metadata.cite_as,
51
  license=croissant.metadata.license,
52
  description=croissant.metadata.description,
53
  url=croissant.metadata.url,
views/metadata.py CHANGED
@@ -97,14 +97,14 @@ def _render_generic_metadata(metadata: Metadata):
97
  on_change=handle_metadata_change,
98
  args=(MetadataEvent.LICENSE, metadata, key),
99
  )
100
- key = "metadata-citation"
101
  st.text_area(
102
  label="Citation",
103
  key=key,
104
- value=metadata.citation,
105
  placeholder="@book{\n title={Title}\n}",
106
  on_change=handle_metadata_change,
107
- args=(MetadataEvent.CITATION, metadata, key),
108
  )
109
  key = "metadata-date-published"
110
  st.date_input(
 
97
  on_change=handle_metadata_change,
98
  args=(MetadataEvent.LICENSE, metadata, key),
99
  )
100
+ key = "metadata-cite-as"
101
  st.text_area(
102
  label="Citation",
103
  key=key,
104
+ value=metadata.cite_as,
105
  placeholder="@book{\n title={Title}\n}",
106
  on_change=handle_metadata_change,
107
+ args=(MetadataEvent.CITE_AS, metadata, key),
108
  )
109
  key = "metadata-date-published"
110
  st.date_input(
views/overview.py CHANGED
@@ -10,7 +10,7 @@ from utils import needed_field
10
  from views.metadata import handle_metadata_change
11
  from views.metadata import MetadataEvent
12
 
13
- _NON_RELEVANT_METADATA = ["name", "distribution", "record_sets", "rdf"]
14
 
15
  _INFO_TEXT = """Croissant files are composed of three layers:
16
 
@@ -98,7 +98,7 @@ def render_overview():
98
  if user_started_editing:
99
  warning = ""
100
  try:
101
- issues = metadata.to_canonical().issues
102
  if issues.errors:
103
  warning += "**Errors**\n"
104
  for error in issues.errors:
 
10
  from views.metadata import handle_metadata_change
11
  from views.metadata import MetadataEvent
12
 
13
+ _NON_RELEVANT_METADATA = ["ctx", "name", "distribution", "record_sets"]
14
 
15
  _INFO_TEXT = """Croissant files are composed of three layers:
16
 
 
98
  if user_started_editing:
99
  warning = ""
100
  try:
101
+ issues = metadata.to_canonical().ctx.issues
102
  if issues.errors:
103
  warning += "**Errors**\n"
104
  for error in issues.errors:
views/record_sets.py CHANGED
@@ -44,11 +44,9 @@ class _Result(TypedDict):
44
  @st.cache_data(
45
  show_spinner="Generating the dataset...",
46
  hash_funcs={
47
- "mlcroissant.Metadata": hash,
48
- "mlcroissant.Field": hash,
49
- "mlcroissant.FileObject": hash,
50
- "mlcroissant.FileSet": hash,
51
- "mlcroissant.RecordSet": hash,
52
  },
53
  )
54
  def _generate_data_with_timeout(record_set: RecordSet) -> _Result:
@@ -385,11 +383,13 @@ def _render_left_panel():
385
  "⚠️",
386
  key=f"idea-{prefix}",
387
  on_click=lambda: _generate_data_with_timeout.clear(),
388
- help=textwrap.dedent(f"""**Error**:
 
389
  ```
390
  {exception}
391
  ```
392
- """),
 
393
  )
394
  right.markdown("No preview is possible.")
395
 
 
44
  @st.cache_data(
45
  show_spinner="Generating the dataset...",
46
  hash_funcs={
47
+ "core.state.RecordSet": lambda record_set: hash(
48
+ (record_set.name, record_set.description)
49
+ ),
 
 
50
  },
51
  )
52
  def _generate_data_with_timeout(record_set: RecordSet) -> _Result:
 
383
  "⚠️",
384
  key=f"idea-{prefix}",
385
  on_click=lambda: _generate_data_with_timeout.clear(),
386
+ help=textwrap.dedent(
387
+ f"""**Error**:
388
  ```
389
  {exception}
390
  ```
391
+ """
392
+ ),
393
  )
394
  right.markdown("No preview is possible.")
395
 
views/record_sets_test.py CHANGED
@@ -19,10 +19,12 @@ def test_find_joins():
19
  references=mlc.Source(uid="some_other_record_set/some_other_field"),
20
  ),
21
  ]
22
- assert _find_joins(fields) == set([
23
- (("some_csv", "some_column"), ("some_record_set", "some_field")),
24
- (
25
- ("some_record_set", "some_field"),
26
- ("some_other_record_set", "some_other_field"),
27
- ),
28
- ])
 
 
 
19
  references=mlc.Source(uid="some_other_record_set/some_other_field"),
20
  ),
21
  ]
22
+ assert _find_joins(fields) == set(
23
+ [
24
+ (("some_csv", "some_column"), ("some_record_set", "some_field")),
25
+ (
26
+ ("some_record_set", "some_field"),
27
+ ("some_other_record_set", "some_other_field"),
28
+ ),
29
+ ]
30
+ )
views/splash.py CHANGED
@@ -65,11 +65,11 @@ def render_splash():
65
  with st.expander("**Load an existing dataset**", expanded=True):
66
 
67
  def create_example(dataset: str):
68
- base = f"https://raw.githubusercontent.com/mlcommons/croissant/main/datasets/{dataset.lower()}"
69
  url = f"{base}/metadata.json"
70
  try:
71
  json = requests.get(url).json()
72
- metadata = mlc.Metadata.from_json(mlc.Issues(), json, None)
73
  st.session_state[Metadata] = Metadata.from_canonical(metadata)
74
  save_current_project()
75
  # Write supplementary files.
 
65
  with st.expander("**Load an existing dataset**", expanded=True):
66
 
67
  def create_example(dataset: str):
68
+ base = f"https://raw.githubusercontent.com/mlcommons/croissant/main/datasets/1.0/{dataset.lower()}"
69
  url = f"{base}/metadata.json"
70
  try:
71
  json = requests.get(url).json()
72
+ metadata = mlc.Metadata.from_json(mlc.Context(), json)
73
  st.session_state[Metadata] = Metadata.from_canonical(metadata)
74
  save_current_project()
75
  # Write supplementary files.