marcenacp committed
Commit 73ebcab
Parent: 5b216e9

Deploy (see actual commits on https://github.com/mlcommons/croissant).

Makefile CHANGED
@@ -1,7 +1,9 @@
+current_dir := $(dir $(abspath $(firstword $(MAKEFILE_LIST))))
+
 black:
-	black \
+	docker run --rm --volume $(current_dir):/src --workdir /src pyfound/black:24.2.0 black \
 		--line-length 88 \
-		--preview \
+		--exclude '.*\/node_modules\/' \
 		.
 
 isort:
core/files.py CHANGED
@@ -52,9 +52,12 @@ class FileTypes:
         encoding_format="application/x-tar",
         extensions=["tar"],
     )
+    TSV = FileType(
+        name="TSV", encoding_format="text/tab-separated-values", extensions=["tsv"]
+    )
     TXT = FileType(
         name="Text",
-        encoding_format="plain/text",
+        encoding_format="text/plain",
         extensions=["txt"],
     )
     ZIP = FileType(
@@ -79,6 +82,7 @@ FILE_TYPES: dict[str, FileType] = {
     FileTypes.JSONL,
     FileTypes.PARQUET,
     FileTypes.TAR,
+    FileTypes.TSV,
     FileTypes.TXT,
     FileTypes.ZIP,
 ]
@@ -141,6 +145,8 @@ def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
         df = pd.read_json(file, lines=True)
     elif file_type == FileTypes.PARQUET:
         df = pd.read_parquet(file)
+    elif file_type == FileTypes.TSV:
+        df = pd.read_csv(file, sep="\t")
     else:
         raise NotImplementedError(
             f"File type {file_type} is not supported. Please, open an issue on GitHub:"
@@ -149,8 +155,22 @@ def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
     return df.infer_objects()
 
 
-def guess_file_type(path: epath.Path) -> FileType | None:
+def _guess_mime_type(path: epath.Path) -> str:
+    """Guess most specific MIME type."""
     mime = magic.from_file(path, mime=True)
+    extension = path.suffix
+    if mime == "text/plain":
+        # In some cases, a CSV/TSV may be classified as text,
+        # for example if the file is not terminated by a newline.
+        if extension == ".csv":
+            mime = "text/csv"
+        elif extension == ".tsv":
+            mime = "text/tab-separated-values"
+    return mime
+
+
+def guess_file_type(path: epath.Path) -> FileType | None:
+    mime = _guess_mime_type(path)
     return ENCODING_FORMATS.get(mime)
@@ -163,8 +183,10 @@ def file_from_url(url: str, names: set[str], folder: epath.Path) -> FileObject:
     sha256 = _sha256(file.read())
     file_type = guess_file_type(file_path)
     df = get_dataframe(file_type, file_path)
+    name = find_unique_name(names, url.split("/")[-1])
     return FileObject(
-        name=find_unique_name(names, url.split("/")[-1]),
+        id=name,
+        name=name,
         description="",
         content_url=url,
         encoding_format=file_type.encoding_format,
@@ -186,8 +208,10 @@ def file_from_upload(
         f.write(value)
     file_type = guess_file_type(file_path)
     df = get_dataframe(file_type, file)
+    name = find_unique_name(names, file.name)
     return FileObject(
-        name=find_unique_name(names, file.name),
+        id=name,
+        name=name,
         description="",
         content_url=content_url,
         encoding_format=file_type.encoding_format,
@@ -202,9 +226,11 @@ def file_from_form(
 ) -> FileObject | FileSet:
     """Creates a file based on manually added fields."""
     if type == FILE_OBJECT:
-        return FileObject(name=find_unique_name(names, "file_object"), folder=folder)
+        name = find_unique_name(names, "file_object")
+        return FileObject(id=name, name=name, folder=folder)
     elif type == FILE_SET:
-        return FileSet(name=find_unique_name(names, "file_set"))
+        name = find_unique_name(names, "file_set")
+        return FileSet(id=name, name=name)
     else:
        raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
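Note on the `_guess_mime_type` hunk: `libmagic` can classify a CSV/TSV that is not newline-terminated as plain text, so the helper falls back on the file extension. A minimal sketch of the effect (hypothetical `/tmp/table.tsv` path; assumes `python-magic` and `etils` are installed):

    import magic
    from etils import epath

    path = epath.Path("/tmp/table.tsv")
    with path.open("w") as f:
        f.write("a\tb\n1\t2")  # no trailing newline
    # Raw sniffing may report generic text:
    magic.from_file(path, mime=True)  # often "text/plain"
    # _guess_mime_type upgrades it via the ".tsv" suffix, so
    # guess_file_type(path) now resolves to FileTypes.TSV.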
core/files_test.py CHANGED
@@ -10,12 +10,13 @@ FileTypes = files_module.FileTypes
 
 
 @mock.patch.object(files_module, "guess_file_type", return_value=FileTypes.CSV)
-def test_check_file_csv(guess_file_type):
+def test_check_file_csv_url(guess_file_type):
     del guess_file_type
     csv = epath.Path(
         # This is the hash path for "https://my.url".
         "/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
     )
+    # Test unescaped CSV
     if csv.exists():
         csv.unlink()
     with csv.open("w") as f:
@@ -28,6 +29,51 @@ def test_check_file_csv(guess_file_type):
         file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
     )
 
+    # Test error thrown on no file
+    csv.unlink()
+    with pytest.raises(Exception):
+        files_module.file_from_url("https://my.url", set(), epath.Path())
+
+    # Test escaped CSV
+    content = b'"This","Is"\n1,2\n3,4'
+    with csv.open("wb") as f:
+        f.write(content)
+    file = files_module.file_from_url("https://my.url", set(), epath.Path())
+    pd.testing.assert_frame_equal(file.df, pd.DataFrame({"This": [1, 3], "Is": [2, 4]}))
+
+
+@mock.patch.object(files_module, "guess_file_type", return_value=FileTypes.TSV)
+def test_check_file_tsv_url(guess_file_type):
+    del guess_file_type
+    tsv = epath.Path(
+        # This is the hash path for "https://my.url".
+        "/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
+    )
+    # Test unescaped TSV
+    if tsv.exists():
+        tsv.unlink()
+    with tsv.open("w") as f:
+        f.write("column1\tcolumn2\n")
+        f.write("a\t1\n")
+        f.write("b\t2\n")
+        f.write("c\t3\n")
+    file = files_module.file_from_url("https://my.url", set(), epath.Path())
+    pd.testing.assert_frame_equal(
+        file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
+    )
+
+    # Test error thrown on no file
+    tsv.unlink()
+    with pytest.raises(Exception):
+        files_module.file_from_url("https://my.url", set(), epath.Path())
+
+    # Test escaped TSV
+    content = b'"This"\t"Is"\n1\t2\n3\t4'
+    with tsv.open("wb") as f:
+        f.write(content)
+    file = files_module.file_from_url("https://my.url", set(), epath.Path())
+    pd.testing.assert_frame_equal(file.df, pd.DataFrame({"This": [1, 3], "Is": [2, 4]}))
+
 
 @mock.patch.object(files_module, "guess_file_type", return_value="unknown")
 def test_check_file_unknown(guess_file_type):
core/record_sets.py CHANGED
@@ -18,21 +18,23 @@ def infer_record_sets(file: FileObject | FileSet, names: set[str]) -> list[RecordSet]:
     fields = []
     for column, value in file.df.dtypes.items():
         source = mlc.Source(
-            uid=file.name,
-            node_type="distribution",
+            distribution=file.id,
             extract=mlc.Extract(column=column),
         )
         field = Field(
+            id=column,
             name=column,
             data_types=[convert_dtype(value)],
             source=source,
             references=mlc.Source(),
         )
         fields.append(field)
+    name = find_unique_name(names, file.name + "_record_set")
     return [
         RecordSet(
+            id=name,
             fields=fields,
-            name=find_unique_name(names, file.name + "_record_set"),
+            name=name,
             description="",
         )
     ]
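To see the new output shape, a sketch of what `infer_record_sets` now emits for a two-column dataframe (module paths assumed from the repo layout; file name is hypothetical):

    import pandas as pd

    from core.record_sets import infer_record_sets
    from core.state import FileObject

    file = FileObject(id="data.csv", name="data.csv",
                      df=pd.DataFrame({"age": [1, 2], "city": ["a", "b"]}))
    record_sets = infer_record_sets(file, names=set())
    # Fields now carry id=column and point at the file through
    # Source(distribution=file.id) instead of the old uid/node_type pair:
    assert record_sets[0].id == "data.csv_record_set"
    assert record_sets[0].fields[0].source.distribution == "data.csv"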
core/state.py CHANGED
@@ -9,6 +9,7 @@ import base64
 import dataclasses
 import datetime
 from typing import Any
+import uuid
 
 from etils import epath
 import pandas as pd
@@ -33,9 +34,6 @@ def create_class(mlc_class: type, instance: Any, **kwargs) -> Any:
         name = field.name
         if hasattr(instance, name) and name not in kwargs:
             params[name] = getattr(instance, name)
-    if "uuid" in params and params.get("uuid") is None:
-        # Let mlcroissant handle the default value
-        del params["uuid"]
     return mlc_class(**params, **kwargs)
 
 
@@ -127,11 +125,22 @@ class SelectedRecordSet:
 
 
 @dataclasses.dataclass
-class FileObject:
-    """FileObject analogue for editor"""
-
+class Node:
     ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
+    id: str | None = None
     name: str | None = None
+
+    def get_name_or_id(self):
+        if self.ctx.is_v0():
+            return self.name
+        else:
+            return self.id
+
+
+@dataclasses.dataclass
+class FileObject(Node):
+    """FileObject analogue for editor"""
+
     description: str | None = None
     contained_in: list[str] | None = dataclasses.field(default_factory=list)
     content_size: str | None = None
@@ -140,65 +149,52 @@ class FileObject:
     sha256: str | None = None
     df: pd.DataFrame | None = None
     folder: epath.PathLike | None = None
-    uuid: str | None = None
 
 
 @dataclasses.dataclass
-class FileSet:
+class FileSet(Node):
     """FileSet analogue for editor"""
 
-    ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
     contained_in: list[str] = dataclasses.field(default_factory=list)
     description: str | None = None
     encoding_format: str | None = ""
     includes: str | None = ""
-    name: str = ""
-    uuid: str | None = None
 
 
 @dataclasses.dataclass
-class Field:
+class Field(Node):
     """Field analogue for editor"""
 
-    ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
-    name: str | None = None
     description: str | None = None
     data_types: str | list[str] | None = None
     source: mlc.Source | None = None
     references: mlc.Source | None = None
-    uuid: str | None = None
 
 
 @dataclasses.dataclass
-class RecordSet:
+class RecordSet(Node):
     """Record Set analogue for editor"""
 
-    ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
-    name: str = ""
     data: list[Any] | None = None
+    data_types: list[str] | None = None
     description: str | None = None
     is_enumeration: bool | None = None
     key: str | list[str] | None = None
     fields: list[Field] = dataclasses.field(default_factory=list)
-    uuid: str | None = None
 
 
 @dataclasses.dataclass
-class Metadata:
+class Metadata(Node):
     """main croissant data object, helper functions exist to load and unload this into the mlcroissant version"""
 
-    name: str = ""
     description: str | None = None
     cite_as: str | None = None
-    context: dict = dataclasses.field(default_factory=dict)
-    creators: list[mlc.PersonOrOrganization] = dataclasses.field(default_factory=list)
-    ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
+    creators: list[mlc.Person] = dataclasses.field(default_factory=list)
     data_biases: str | None = None
     data_collection: str | None = None
     date_published: datetime.datetime | None = None
     license: str | None = ""
     personal_sensitive_information: str | None = None
-    uuid: str | None = None
     url: str = ""
     distribution: list[FileObject | FileSet] = dataclasses.field(default_factory=list)
     record_sets: list[RecordSet] = dataclasses.field(default_factory=list)
@@ -211,6 +207,8 @@ class Metadata:
         """Renames a resource by changing all the references to this resource."""
         # Update other resources:
         for i, resource in enumerate(self.distribution):
+            if resource.id == old_name:
+                self.distribution[i].id = new_name
             contained_in = resource.contained_in
             if contained_in and old_name in contained_in:
                 self.distribution[i].contained_in = [
@@ -222,55 +220,89 @@
     def rename_record_set(self, old_name: str, new_name: str):
         """Renames a RecordSet by changing all the references to this RecordSet."""
         for i, record_set in enumerate(self.record_sets):
+            if record_set.id == old_name:
+                self.record_sets[i].id = new_name
             for j, field in enumerate(record_set.fields):
-                possible_uid = f"{old_name}/"
+                possible_uuid = f"{old_name}/"
                 # Update source
                 source = field.source
-                if (
-                    source
-                    and source.uid
-                    and (source.uid.startswith(possible_uid) or source.uid == old_name)
-                ):
-                    new_uid = source.uid.replace(old_name, new_name, 1)
-                    self.record_sets[i].fields[j].source.uid = new_uid
+                if source and source.field and source.field.startswith(possible_uuid):
+                    new_uuid = source.field.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].source.field = new_uuid
+                if source and source.file_object and source.file_object == old_name:
+                    self.record_sets[i].fields[j].source.file_object = new_name
+                if source and source.file_set and source.file_set == old_name:
+                    self.record_sets[i].fields[j].source.file_set = new_name
+                if source and source.distribution and source.distribution == old_name:
+                    self.record_sets[i].fields[j].source.distribution = new_name
                 # Update references
                 references = field.references
                 if (
                     references
-                    and references.uid
-                    and (
-                        references.uid.startswith(possible_uid)
-                        or references.uid == old_name
-                    )
+                    and references.field
+                    and references.field.startswith(possible_uuid)
                 ):
-                    new_uid = references.uid.replace(old_name, new_name, 1)
-                    self.record_sets[i].fields[j].references.uid = new_uid
+                    new_uuid = references.field.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].references.field = new_uuid
+                if (
+                    references
+                    and references.file_object
+                    and references.file_object == old_name
+                ):
+                    self.record_sets[i].fields[j].references.file_object = new_name
+                if (
+                    references
+                    and references.file_set
+                    and references.file_set == old_name
+                ):
+                    self.record_sets[i].fields[j].references.file_set = new_name
+                if (
+                    references
+                    and references.distribution
+                    and references.distribution == old_name
+                ):
+                    self.record_sets[i].fields[j].references.distribution = new_name
 
     def rename_field(self, old_name: str, new_name: str):
         """Renames a field by changing all the references to this field."""
         for i, record_set in enumerate(self.record_sets):
             for j, field in enumerate(record_set.fields):
+                possible_uuid = f"/{old_name}"
                 # Update source
                 source = field.source
                 # The difference with RecordSet is the `.endswith` here:
-                if (
-                    source
-                    and source.uid
-                    and "/" in source.uid
-                    and source.uid.endswith(old_name)
-                ):
-                    new_uid = source.uid.replace(old_name, new_name, 1)
-                    self.record_sets[i].fields[j].source.uid = new_uid
+                if source and source.field and source.field.endswith(possible_uuid):
+                    new_uuid = source.field.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].source.field = new_uuid
                 # Update references
                 references = field.references
                 if (
                     references
-                    and references.uid
-                    and "/" in references.uid
-                    and references.uid.endswith(old_name)
+                    and references.field
+                    and references.field.endswith(possible_uuid)
                 ):
-                    new_uid = references.uid.replace(old_name, new_name, 1)
-                    self.record_sets[i].fields[j].references.uid = new_uid
+                    new_uuid = references.field.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].references.field = new_uuid
+
+    def rename_id(self, old_id: str, new_id: str):
+        for resource in self.distribution:
+            if resource.id == old_id:
+                resource.id = new_id
+            if resource.contained_in and old_id in resource.contained_in:
+                resource.contained_in = [
+                    new_id if uuid == old_id else uuid for uuid in resource.contained_in
+                ]
+        for record_set in self.record_sets:
+            if record_set.id == old_id:
+                record_set.id = new_id
+            for field in record_set.fields:
+                if field.id == old_id:
+                    field.id = new_id
+                for p in ["distribution", "field", "file_object", "file_set"]:
+                    if field.source and getattr(field.source, p) == old_id:
+                        setattr(field.source, p, new_id)
+                    if field.references and getattr(field.references, p) == old_id:
+                        setattr(field.references, p, new_id)
 
     def add_distribution(self, distribution: FileSet | FileObject) -> None:
         self.distribution.append(distribution)
@@ -352,8 +384,16 @@
         )
 
     def names(self) -> set[str]:
-        nodes = self.distribution + self.record_sets
-        return set([node.name for node in nodes])
+        distribution = set()
+        record_sets = set()
+        fields = set()
+        for resource in self.distribution:
+            distribution.add(resource.get_name_or_id())
+        for record_set in self.record_sets:
+            record_sets.add(record_set.get_name_or_id())
+            for field in record_set.fields:
+                fields.add(field.get_name_or_id())
+        return distribution.union(record_sets).union(fields)
 
 
 class OpenTab:
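The new `rename_id` sweeps every place an ID can live: distribution IDs and `contained_in` lists, record set and field IDs, plus the four `Source`/`references` pointers (`distribution`, `field`, `file_object`, `file_set`). A minimal sketch under assumed imports (IDs are hypothetical):

    from core.state import FileObject, Metadata

    metadata = Metadata(id="dataset", name="dataset")
    metadata.add_distribution(FileObject(id="data.csv", name="data.csv"))
    metadata.rename_id(old_id="data.csv", new_id="train.csv")
    # Every matching resource.id, contained_in entry, and
    # field.source/field.references pointer is now "train.csv".
    assert metadata.distribution[0].id == "train.csv"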
core/state_test.py ADDED
@@ -0,0 +1,32 @@
+"""Tests for state."""
+
+from etils import epath
+
+import mlcroissant as mlc
+
+from .state import Metadata
+
+
+def test_rename_record_set():
+    ctx = mlc.Context()
+    path = epath.Path(__file__).parent.parent / "cypress/fixtures/1.0/titanic.json"
+    canonical_metadata = mlc.Metadata.from_file(ctx, path)
+    metadata = Metadata.from_canonical(canonical_metadata)
+
+    # Rename RecordSet:
+    assert metadata.record_sets[0].id == "genders"
+    assert metadata.record_sets[2].fields[1].id == "passengers/gender"
+    assert metadata.record_sets[2].fields[1].references.field == "genders/label"
+    metadata.rename_record_set("genders", "NEW_GENDERS")
+    assert metadata.record_sets[0].id == "NEW_GENDERS"
+    assert metadata.record_sets[2].fields[1].references.field == "NEW_GENDERS/label"
+
+    # Rename Field:
+    metadata.rename_field("label", "NEW_LABEL")
+    assert metadata.record_sets[2].fields[1].references.field == "NEW_GENDERS/NEW_LABEL"
+
+    # Rename Distribution:
+    assert metadata.record_sets[2].fields[0].id == "passengers/name"
+    assert metadata.record_sets[2].fields[0].source.file_object == "passengers.csv"
+    metadata.rename_distribution("passengers.csv", "NEW_PASSENGERS.CSV")
+    assert metadata.record_sets[2].fields[0].source.file_object == "NEW_PASSENGERS.CSV"
events/fields.py CHANGED
@@ -58,6 +58,7 @@ class FieldEvent(enum.Enum):
     """Event that triggers a field change."""
 
     NAME = "NAME"
+    ID = "ID"
     DESCRIPTION = "DESCRIPTION"
     DATA_TYPE = "DATA_TYPE"
     SOURCE = "SOURCE"
@@ -86,13 +87,20 @@ def handle_field_change(
         metadata: Metadata = st.session_state[Metadata]
         metadata.rename_field(old_name=old_name, new_name=new_name)
         field.name = value
+    elif change == FieldEvent.ID:
+        old_id = field.id
+        new_id = value
+        if old_id != new_id:
+            metadata: Metadata = st.session_state[Metadata]
+            metadata.rename_id(old_id=old_id, new_id=new_id)
     elif change == FieldEvent.DESCRIPTION:
         field.description = value
     elif change == FieldEvent.DATA_TYPE:
         field.data_types = [str_to_mlc_data_type(value)]
     elif change == FieldEvent.SOURCE:
-        node_type = "field" if "/" in value else "distribution"
-        source = mlc.Source(uid=value, node_type=node_type)
+        source = (
+            mlc.Source(field=value) if "/" in value else mlc.Source(file_object=value)
+        )
         field.source = source
     elif change == FieldEvent.SOURCE_EXTRACT:
         source = field.source
@@ -131,8 +139,9 @@ def handle_field_change(
         if number is not None and number < len(field.source.transforms):
             field.source.transforms[number] = mlc.Transform(separator=value)
     elif change == FieldEvent.REFERENCE:
-        node_type = "field" if "/" in value else "distribution"
-        source = mlc.Source(uid=value, node_type=node_type)
+        source = (
+            mlc.Source(field=value) if "/" in value else mlc.Source(file_object=value)
+        )
         field.references = source
     elif change == FieldEvent.REFERENCE_EXTRACT:
         source = field.references
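The SOURCE and REFERENCE branches replace the old `uid`/`node_type` pair with keyword-specific `Source` constructors: a value containing `/` is treated as a `record_set/field` path, anything else as a FileObject ID. A sketch of the heuristic (assuming the mlcroissant 1.0.3 keywords used in the diff; values are hypothetical):

    import mlcroissant as mlc

    def source_from_value(value: str) -> mlc.Source:
        # "/" marks a record_set/field path; otherwise a FileObject id.
        if "/" in value:
            return mlc.Source(field=value)
        return mlc.Source(file_object=value)

    source_from_value("passengers/name")  # -> Source(field=...)
    source_from_value("passengers.csv")   # -> Source(file_object=...)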
events/metadata.py CHANGED
@@ -130,16 +130,16 @@ def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
         date = st.session_state[key]
         metadata.date_published = datetime.datetime(date.year, date.month, date.day)
     elif event == MetadataEvent.CREATOR_ADD:
-        metadata.creators = [mlc.PersonOrOrganization()]
+        metadata.creators = [mlc.Person()]
     elif event == MetadataEvent.CREATOR_REMOVE:
         metadata.creators = []
     elif event == MetadataEvent.CREATOR_NAME:
         if metadata.creators:
             metadata.creators[0].name = st.session_state[key]
         else:
-            metadata.creators = [mlc.PersonOrOrganization(name=st.session_state[key])]
+            metadata.creators = [mlc.Person(name=st.session_state[key])]
     elif event == MetadataEvent.CREATOR_URL:
         if metadata.creators:
             metadata.creators[0].url = st.session_state[key]
         else:
-            metadata.creators = [mlc.PersonOrOrganization(url=st.session_state[key])]
+            metadata.creators = [mlc.Person(url=st.session_state[key])]
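`mlc.PersonOrOrganization` is gone under the pinned mlcroissant==1.0.3; creators become plain `mlc.Person` objects with the same `name`/`url` fields. A one-line sketch (hypothetical values):

    import mlcroissant as mlc

    creators = [mlc.Person(name="Jane Doe", url="https://example.org")]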
events/record_sets.py CHANGED
@@ -11,6 +11,7 @@ class RecordSetEvent(enum.Enum):
     """Event that triggers a RecordSet change."""
 
     NAME = "NAME"
+    ID = "ID"
     DESCRIPTION = "DESCRIPTION"
     IS_ENUMERATION = "IS_ENUMERATION"
     HAS_DATA = "HAS_DATA"
@@ -26,6 +27,12 @@ def handle_record_set_change(event: RecordSetEvent, record_set: RecordSet, key: str):
         metadata: Metadata = st.session_state[Metadata]
         metadata.rename_record_set(old_name=old_name, new_name=new_name)
         record_set.name = value
+    elif event == RecordSetEvent.ID:
+        old_id = record_set.id
+        new_id = value
+        if old_id != new_id:
+            metadata: Metadata = st.session_state[Metadata]
+            metadata.rename_id(old_id=old_id, new_id=new_id)
     elif event == RecordSetEvent.DESCRIPTION:
         record_set.description = value
     elif event == RecordSetEvent.IS_ENUMERATION:
events/resources.py CHANGED
@@ -17,6 +17,7 @@ class ResourceEvent(enum.Enum):
     """Event that triggers a resource change."""
 
     NAME = "NAME"
+    ID = "ID"
     DESCRIPTION = "DESCRIPTION"
     ENCODING_FORMAT = "ENCODING_FORMAT"
     INCLUDES = "INCLUDES"
@@ -36,6 +37,12 @@ def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
         metadata: Metadata = st.session_state[Metadata]
         metadata.rename_distribution(old_name=old_name, new_name=new_name)
         resource.name = value
+    elif event == ResourceEvent.ID:
+        old_id = resource.id
+        new_id = value
+        if old_id != new_id:
+            metadata: Metadata = st.session_state[Metadata]
+            metadata.rename_id(old_id=old_id, new_id=new_id)
     elif event == ResourceEvent.DESCRIPTION:
         resource.description = value
     elif event == ResourceEvent.ENCODING_FORMAT:
events/resources_test.py CHANGED
@@ -6,6 +6,7 @@ from .resources import _create_instance1_from_instance2
 
 def test_create_instance1_from_instance2():
     file_object = FileObject(
+        id="id",
         name="name",
         description="description",
         contained_in=["foo", "bar"],
@@ -13,6 +14,7 @@ def test_create_instance1_from_instance2():
     )
     file_set = _create_instance1_from_instance2(file_object, FileSet)
     assert isinstance(file_set, FileSet)
+    assert file_set.id == "id"
     assert file_set.name == "name"
     assert file_set.description == "description"
     assert file_set.contained_in == ["foo", "bar"]
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 etils[epath]
-mlcroissant>=1.0.1
+mlcroissant==1.0.3
 numpy
 pandas
 pytest
views/files.py CHANGED
@@ -84,7 +84,7 @@ def _render_resources_panel(files: list[Resource]) -> Resource | None:
     filename_to_file: dict[str, list[Resource]] = {}
     nodes = []
     for file in files:
-        name = file.name
+        name = file.get_name_or_id()
         filename_to_file[name] = file
         type = "FileObject" if isinstance(file, FileObject) else "FileSet"
         if file.contained_in:
@@ -141,7 +141,7 @@ def _render_upload_panel():
             record_sets = infer_record_sets(file, names)
             for record_set in record_sets:
                 st.session_state[Metadata].add_record_set(record_set)
-            st.session_state[SelectedResource] = file.name
+            st.session_state[SelectedResource] = file.get_name_or_id()
 
         st.form_submit_button("Upload", on_click=handle_on_click)
@@ -159,7 +159,7 @@ def _render_resource_details(selected_file: Resource):
     """Renders the details of the selected resource."""
     file: FileObject | FileSet
     for i, file in enumerate(st.session_state[Metadata].distribution):
-        if file.name == selected_file.name:
+        if file.get_name_or_id() == selected_file.get_name_or_id():
             is_file_object = isinstance(file, FileObject)
             index = (
                 RESOURCE_TYPES.index(FILE_OBJECT)
@@ -209,14 +209,24 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
         args=(ResourceEvent.CONTAINED_IN, file, key),
     )
     key = f"{prefix}_name"
-    st.text_input(
-        needed_field("Name"),
-        value=file.name,
-        key=key,
-        help=f"The name of the resource. {NAMES_INFO}",
-        on_change=handle_resource_change,
-        args=(ResourceEvent.NAME, file, key),
-    )
+    if file.ctx.is_v0():
+        st.text_input(
+            needed_field("Name"),
+            value=file.name,
+            key=key,
+            help=f"The name of the resource. {NAMES_INFO}",
+            on_change=handle_resource_change,
+            args=(ResourceEvent.NAME, file, key),
+        )
+    else:
+        st.text_input(
+            needed_field("ID"),
+            value=file.id,
+            key=key,
+            help=f"The ID of the resource. {NAMES_INFO}",
+            on_change=handle_resource_change,
+            args=(ResourceEvent.ID, file, key),
+        )
     key = f"{prefix}_description"
     st.text_area(
         "Description",
views/jsonld.py CHANGED
@@ -14,6 +14,7 @@ def render_jsonld():
     for file in croissant.distribution:
         distribution.append(
             mlc.FileObject(
+                id=file.id,
                 name=file.name,
                 description=file.description,
                 content_url=file.content_url,
@@ -27,18 +28,19 @@ def render_jsonld():
         for _, field in record_set.get("fields", pd.DataFrame()).iterrows():
             fields.append(
                 mlc.Field(
+                    id=field["id"],
                     name=field["name"],
                     description=field["description"],
                     data_types=field["data_type"],
                     source=mlc.Source(
-                        uid=file.name,
-                        node_type="distribution",
+                        distribution=file.name,
                         extract=mlc.Extract(column=field["name"]),
                     ),
                 )
             )
         record_sets.append(
             mlc.RecordSet(
+                id=record_set["id"],
                 name=record_set["name"],
                 description=record_set["description"],
                 fields=fields,
@@ -46,6 +48,7 @@ def render_jsonld():
         )
     if croissant.metadata:
         metadata = mlc.Metadata(
+            id=croissant.metadata.id,
            name=croissant.metadata.name,
             cite_as=croissant.metadata.cite_as,
             license=croissant.metadata.license,
views/record_sets.py CHANGED
@@ -119,11 +119,18 @@ def _data_editor_key(record_set_key: int, record_set: RecordSet) -> str:
 
 def _get_possible_sources(metadata: Metadata) -> list[str]:
     possible_sources: list[str] = []
-    for resource in metadata.distribution:
-        possible_sources.append(resource.name)
-    for record_set in metadata.record_sets:
-        for field in record_set.fields:
-            possible_sources.append(f"{record_set.name}/{field.name}")
+    if metadata.ctx.is_v0():
+        for resource in metadata.distribution:
+            possible_sources.append(resource.name)
+        for record_set in metadata.record_sets:
+            for field in record_set.fields:
+                possible_sources.append(f"{record_set.name}/{field.name}")
+    else:
+        for resource in metadata.distribution:
+            possible_sources.append(resource.id)
+        for record_set in metadata.record_sets:
+            for field in record_set.fields:
+                possible_sources.append(field.id)
     return possible_sources
 
 
@@ -132,18 +139,18 @@ Join = tuple[LeftOrRight, LeftOrRight]
 
 
 def _find_left_or_right(source: mlc.Source) -> LeftOrRight:
-    uid = source.uid
-    if "/" in uid:
-        parts = uid.split("/")
+    uuid = source.uuid
+    if "/" in uuid:
+        parts = uuid.split("/")
         return (parts[0], parts[1])
     elif source.extract.column:
-        return (uid, source.extract.column)
+        return (uuid, source.extract.column)
     elif source.extract.json_path:
-        return (uid, source.extract.json_path)
+        return (uuid, source.extract.json_path)
     elif source.extract.file_property:
-        return (uid, source.extract.file_property)
+        return (uuid, source.extract.file_property)
     else:
-        return (uid, None)
+        return (uuid, None)
 
 
 def _find_joins(fields: list[Field]) -> set[Join]:
@@ -159,7 +166,8 @@ def _find_joins(fields: list[Field]) -> set[Join]:
 
 def _handle_create_record_set():
     metadata: Metadata = st.session_state[Metadata]
-    metadata.add_record_set(RecordSet(name="new-record-set", description=""))
+    name = "new-record-set"
+    metadata.add_record_set(RecordSet(id=name, name=name, description=""))
 
 
 def _handle_remove_record_set(record_set_key: int):
@@ -188,6 +196,7 @@ def _handle_fields_change(record_set_key: int, record_set: RecordSet):
     for added_row in result["added_rows"]:
         data_type = str_to_mlc_data_type(added_row.get(FieldDataFrame.DATA_TYPE))
         field = Field(
+            id=added_row.get(FieldDataFrame.NAME),
             name=added_row.get(FieldDataFrame.NAME),
             description=added_row.get(FieldDataFrame.DESCRIPTION),
             data_types=[data_type],
@@ -235,15 +244,26 @@ def _render_left_panel():
         with st.expander(title, expanded=is_record_set_expanded(record_set)):
             col1, col2 = st.columns([1, 3])
            key = f"{prefix}-name"
-            col1.text_input(
-                needed_field("Name"),
-                placeholder="Name without special character.",
-                key=key,
-                help=f"The name of the RecordSet. {NAMES_INFO}",
-                value=record_set.name,
-                on_change=handle_record_set_change,
-                args=(RecordSetEvent.NAME, record_set, key),
-            )
+            if record_set.ctx.is_v0():
+                col1.text_input(
+                    needed_field("Name"),
+                    placeholder="Name without special character.",
+                    key=key,
+                    help=f"The name of the RecordSet. {NAMES_INFO}",
+                    value=record_set.name,
+                    on_change=handle_record_set_change,
+                    args=(RecordSetEvent.NAME, record_set, key),
+                )
+            else:
+                col1.text_input(
+                    needed_field("ID"),
+                    placeholder="ID without special character.",
+                    key=key,
+                    help=f"The ID of the resource. {NAMES_INFO}",
+                    value=record_set.name,
+                    on_change=handle_record_set_change,
+                    args=(RecordSetEvent.ID, record_set, key),
+                )
             key = f"{prefix}-description"
             col2.text_input(
                 "Description",
@@ -452,15 +472,26 @@ def _render_right_panel():
         col1, col2, col3 = st.columns([1, 1, 1])
 
         key = f"{prefix}-name"
-        col1.text_input(
-            needed_field("Name"),
-            placeholder="Name without special character.",
-            key=key,
-            help=f"The name of the field. {NAMES_INFO}",
-            value=field.name,
-            on_change=handle_field_change,
-            args=(FieldEvent.NAME, field, key),
-        )
+        if field.ctx.is_v0():
+            col1.text_input(
+                needed_field("Name"),
+                placeholder="Name without special character.",
+                key=key,
+                help=f"The name of the field. {NAMES_INFO}",
+                value=field.name,
+                on_change=handle_field_change,
+                args=(FieldEvent.NAME, field, key),
+            )
+        else:
+            col1.text_input(
+                needed_field("ID"),
+                placeholder="ID without special character.",
+                key=key,
+                help=f"The ID of the field. {NAMES_INFO}",
+                value=field.id,
+                on_change=handle_field_change,
+                args=(FieldEvent.ID, field, key),
+            )
         key = f"{prefix}-description"
         col2.text_input(
             "Description",
views/record_sets_test.py CHANGED
@@ -6,17 +6,19 @@ from views.record_sets import _find_joins
 def test_find_joins():
     fields = [
         Field(
+            id="field1",
             name="field1",
             source=mlc.Source(
-                uid="some_csv", extract=mlc.Extract(column="some_column")
+                file_object="some_csv", extract=mlc.Extract(column="some_column")
             ),
-            references=mlc.Source(uid="some_record_set/some_field"),
+            references=mlc.Source(field="some_record_set/some_field"),
         ),
-        Field(name="field2", source=mlc.Source(uid="foo/bar")),
+        Field(id="field2", name="field2", source=mlc.Source(field="foo/bar")),
         Field(
+            id="field3",
             name="field3",
-            source=mlc.Source(uid="some_record_set/some_field"),
-            references=mlc.Source(uid="some_other_record_set/some_other_field"),
+            source=mlc.Source(field="some_record_set/some_field"),
+            references=mlc.Source(field="some_other_record_set/some_other_field"),
         ),
     ]
     assert _find_joins(fields) == set(
views/source.py CHANGED
@@ -123,7 +123,7 @@ def render_source(
     prefix = f"source-{record_set.name}-{field.name}"
     col1, col2, col3 = st.columns([1, 1, 1])
     index = (
-        possible_sources.index(source.uid) if source.uid in possible_sources else None
+        possible_sources.index(source.uuid) if source.uuid in possible_sources else None
     )
     options = [s for s in possible_sources if not s.startswith(record_set.name)]
     if index and (index < 0 or index >= len(options)):
@@ -140,7 +140,7 @@ def render_source(
         on_change=handle_field_change,
         args=(FieldEvent.SOURCE, field, key),
     )
-    if source.node_type == "distribution":
+    if source.distribution or source.file_object or source.file_set:
         extract = col2.selectbox(
             needed_field("Extract"),
             index=_get_extract_index(source),
@@ -294,8 +294,8 @@ def render_references(
     if references or has_clicked_button:
         col1, col2, col3, col4 = st.columns([4.5, 4, 4, 1])
         index = (
-            possible_sources.index(references.uid)
-            if references.uid in possible_sources
+            possible_sources.index(references.uuid)
+            if references.uuid in possible_sources
             else None
        )
         options = [s for s in possible_sources if not s.startswith(record_set.name)]
@@ -310,7 +310,7 @@ def render_references(
             on_change=handle_field_change,
             args=(FieldEvent.REFERENCE, field, key),
         )
-        if references.node_type == "distribution":
+        if references.distribution or references.file_object or references.file_set:
             key = f"{key}-extract-references"
             extract = col2.selectbox(
                 needed_field("Extract the reference"),
views/wizard.py CHANGED
@@ -23,8 +23,9 @@ from views.record_sets import render_record_sets
 def _export_json() -> str | None:
     metadata: Metadata = st.session_state[Metadata]
     try:
+        name = metadata.name or "metadata"
         return {
-            "name": f"croissant-{metadata.name.lower()}.json",
+            "name": f"croissant-{name.lower()}.json",
             "content": json.dumps(metadata.to_canonical().to_json()),
         }
     except mlc.ValidationError as exception: