Deploy (see actual commits on https://github.com/mlcommons/croissant).
Files changed:
- Makefile +4 -2
- core/files.py +32 -6
- core/files_test.py +47 -1
- core/record_sets.py +5 -3
- core/state.py +95 -55
- core/state_test.py +32 -0
- events/fields.py +13 -4
- events/metadata.py +3 -3
- events/record_sets.py +7 -0
- events/resources.py +7 -0
- events/resources_test.py +2 -0
- requirements.txt +1 -1
- views/files.py +21 -11
- views/jsonld.py +5 -2
- views/record_sets.py +62 -31
- views/record_sets_test.py +7 -5
- views/source.py +5 -5
- views/wizard.py +2 -1
Makefile
CHANGED

@@ -1,7 +1,9 @@
+current_dir := $(dir $(abspath $(firstword $(MAKEFILE_LIST))))
+
 black:
-	black \
+	docker run --rm --volume $(current_dir):/src --workdir /src pyfound/black:24.2.0 black \
 		--line-length 88 \
-		--
+		--exclude '.*\/node_modules\/' \
 		.

 isort:
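Note on the Makefile change: running Black through the pinned pyfound/black:24.2.0 Docker image (the image published by the Black project) makes `make black` reproducible, since every contributor formats with the same Black version regardless of what is installed locally, and the new `--exclude` keeps the formatter out of vendored node_modules directories.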
core/files.py
CHANGED

@@ -52,9 +52,12 @@ class FileTypes:
         encoding_format="application/x-tar",
         extensions=["tar"],
     )
+    TSV = FileType(
+        name="TSV", encoding_format="text/tab-separated-values", extensions=["tsv"]
+    )
     TXT = FileType(
         name="Text",
-        encoding_format="plain
+        encoding_format="text/plain",
         extensions=["txt"],
     )
     ZIP = FileType(
@@ -79,6 +82,7 @@ FILE_TYPES: dict[str, FileType] = {
     FileTypes.JSONL,
     FileTypes.PARQUET,
     FileTypes.TAR,
+    FileTypes.TSV,
     FileTypes.TXT,
     FileTypes.ZIP,
 ]
@@ -141,6 +145,8 @@ def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
         df = pd.read_json(file, lines=True)
     elif file_type == FileTypes.PARQUET:
         df = pd.read_parquet(file)
+    elif file_type == FileTypes.TSV:
+        df = pd.read_csv(file, sep="\t")
     else:
         raise NotImplementedError(
             f"File type {file_type} is not supported. Please, open an issue on GitHub:"
@@ -149,8 +155,22 @@ def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
     return df.infer_objects()


-def guess_file_type(path: epath.Path) -> FileType | None:
+def _guess_mime_type(path: epath.Path) -> str:
+    """Guess most specific MIME type."""
     mime = magic.from_file(path, mime=True)
+    extension = path.suffix
+    if mime == "text/plain":
+        # In some cases, a CSV/TSV may be classified as text
+        # For example, if the file is not terminated by a newline
+        if extension == ".csv":
+            mime = "text/csv"
+        elif extension == ".tsv":
+            mime = "text/tab-separated-values"
+    return mime
+
+
+def guess_file_type(path: epath.Path) -> FileType | None:
+    mime = _guess_mime_type(path)
     return ENCODING_FORMATS.get(mime)


@@ -163,8 +183,10 @@ def file_from_url(url: str, names: set[str], folder: epath.Path) -> FileObject:
     sha256 = _sha256(file.read())
     file_type = guess_file_type(file_path)
     df = get_dataframe(file_type, file_path)
+    name = find_unique_name(names, url.split("/")[-1])
     return FileObject(
-        name=find_unique_name(names, url.split("/")[-1]),
+        id=name,
+        name=name,
         description="",
         content_url=url,
         encoding_format=file_type.encoding_format,
@@ -186,8 +208,10 @@ def file_from_upload(
         f.write(value)
     file_type = guess_file_type(file_path)
     df = get_dataframe(file_type, file)
+    name = find_unique_name(names, file.name)
     return FileObject(
-        name=find_unique_name(names, file.name),
+        id=name,
+        name=name,
         description="",
         content_url=content_url,
         encoding_format=file_type.encoding_format,
@@ -202,9 +226,11 @@ def file_from_form(
 ) -> FileObject | FileSet:
     """Creates a file based on manually added fields."""
     if type == FILE_OBJECT:
-        return FileObject(name=find_unique_name(names, "file_object"), folder=folder)
+        name = find_unique_name(names, "file_object")
+        return FileObject(id=name, name=name, folder=folder)
     elif type == FILE_SET:
-        return FileSet(name=find_unique_name(names, "file_set"))
+        name = find_unique_name(names, "file_set")
+        return FileSet(id=name, name=name)
     else:
         raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
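Why `_guess_mime_type` exists: libmagic alone reports many delimited text files as `text/plain` (for example, a CSV that does not end with a newline), so the editor falls back to the file extension before looking up `ENCODING_FORMATS`. A minimal standalone sketch of that fallback, assuming python-magic is installed; `data/example.tsv` and the `sniff` helper are hypothetical:

import pathlib
import magic  # python-magic, as used by the editor

FALLBACKS = {".csv": "text/csv", ".tsv": "text/tab-separated-values"}

def sniff(filename: str) -> str:
    # libmagic classifies headerless delimited text as text/plain...
    mime = magic.from_file(filename, mime=True)
    # ...so fall back to the extension for CSV/TSV, mirroring the diff above.
    if mime == "text/plain":
        return FALLBACKS.get(pathlib.Path(filename).suffix, mime)
    return mime

print(sniff("data/example.tsv"))  # hypothetical path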
core/files_test.py
CHANGED

@@ -10,12 +10,13 @@ FileTypes = files_module.FileTypes


 @mock.patch.object(files_module, "guess_file_type", return_value=FileTypes.CSV)
-def test_check_file_csv(guess_file_type):
+def test_check_file_csv_url(guess_file_type):
     del guess_file_type
     csv = epath.Path(
         # This is the hash path for "https://my.url".
         "/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
     )
+    # Test unescaped CSV
     if csv.exists():
         csv.unlink()
     with csv.open("w") as f:
@@ -28,6 +29,51 @@ def test_check_file_csv(guess_file_type):
         file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
     )

+    # Test error thrown on no file
+    csv.unlink()
+    with pytest.raises(Exception):
+        files_module.file_from_url("https://my.url", set(), epath.Path())
+
+    # Test escaped CSV
+    content = b'"This","Is"\n1,2\n3,4'
+    with csv.open("wb") as f:
+        f.write(content)
+    file = files_module.file_from_url("https://my.url", set(), epath.Path())
+    pd.testing.assert_frame_equal(file.df, pd.DataFrame({"This": [1, 3], "Is": [2, 4]}))
+
+
+@mock.patch.object(files_module, "guess_file_type", return_value=FileTypes.TSV)
+def test_check_file_tsv_url(guess_file_type):
+    del guess_file_type
+    tsv = epath.Path(
+        # This is the hash path for "https://my.url".
+        "/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
+    )
+    # Test unescaped TSV
+    if tsv.exists():
+        tsv.unlink()
+    with tsv.open("w") as f:
+        f.write("column1\tcolumn2\n")
+        f.write("a\t1\n")
+        f.write("b\t2\n")
+        f.write("c\t3\n")
+    file = files_module.file_from_url("https://my.url", set(), epath.Path())
+    pd.testing.assert_frame_equal(
+        file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
+    )
+
+    # Test error thrown on no file
+    tsv.unlink()
+    with pytest.raises(Exception):
+        files_module.file_from_url("https://my.url", set(), epath.Path())
+
+    # Test escaped TSV
+    content = b'"This"\t"Is"\n1\t2\n3\t4'
+    with tsv.open("wb") as f:
+        f.write(content)
+    file = files_module.file_from_url("https://my.url", set(), epath.Path())
+    pd.testing.assert_frame_equal(file.df, pd.DataFrame({"This": [1, 3], "Is": [2, 4]}))
+

 @mock.patch.object(files_module, "guess_file_type", return_value="unknown")
 def test_check_file_unknown(guess_file_type):
core/record_sets.py
CHANGED

@@ -18,21 +18,23 @@ def infer_record_sets(file: FileObject | FileSet, names: set[str]) -> list[RecordSet]:
     fields = []
     for column, value in file.df.dtypes.items():
         source = mlc.Source(
-            node_type="distribution",
+            distribution=file.id,
             extract=mlc.Extract(column=column),
         )
         field = Field(
+            id=column,
             name=column,
             data_types=[convert_dtype(value)],
             source=source,
             references=mlc.Source(),
         )
         fields.append(field)
+    name = find_unique_name(names, file.name + "_record_set")
     return [
         RecordSet(
+            id=name,
             fields=fields,
-            name=find_unique_name(names, file.name + "_record_set"),
+            name=name,
             description="",
         )
     ]
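`infer_record_sets` walks the parsed DataFrame's dtypes and emits one `Field` per column via `convert_dtype`. A rough sketch of how such a dtype-to-Croissant mapping can look (illustrative only; the editor's actual `convert_dtype` is not shown in this diff):

import pandas as pd

def convert_dtype_sketch(dtype) -> str:
    # Map a pandas dtype to a schema.org data type, as Croissant fields use.
    if pd.api.types.is_bool_dtype(dtype):
        return "https://schema.org/Boolean"
    elif pd.api.types.is_integer_dtype(dtype):
        return "https://schema.org/Integer"
    elif pd.api.types.is_float_dtype(dtype):
        return "https://schema.org/Float"
    return "https://schema.org/Text"

df = pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
for column, dtype in df.dtypes.items():
    print(column, convert_dtype_sketch(dtype))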
core/state.py
CHANGED

@@ -9,6 +9,7 @@ import base64
 import dataclasses
 import datetime
 from typing import Any
+import uuid

 from etils import epath
 import pandas as pd
@@ -33,9 +34,6 @@ def create_class(mlc_class: type, instance: Any, **kwargs) -> Any:
         name = field.name
         if hasattr(instance, name) and name not in kwargs:
             params[name] = getattr(instance, name)
-    if "uuid" in params and params.get("uuid") is None:
-        # Let mlcroissant handle the default value
-        del params["uuid"]
     return mlc_class(**params, **kwargs)


@@ -127,11 +125,22 @@ class SelectedRecordSet:


 @dataclasses.dataclass
-class FileObject:
-    """FileObject analogue for editor"""
-
+class Node:
     ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
+    id: str | None = None
     name: str | None = None
+
+    def get_name_or_id(self):
+        if self.ctx.is_v0():
+            return self.name
+        else:
+            return self.id
+
+
+@dataclasses.dataclass
+class FileObject(Node):
+    """FileObject analogue for editor"""
+
     description: str | None = None
     contained_in: list[str] | None = dataclasses.field(default_factory=list)
     content_size: str | None = None
@@ -140,65 +149,52 @@ class FileObject:
     sha256: str | None = None
     df: pd.DataFrame | None = None
     folder: epath.PathLike | None = None
-    uuid: str | None = None


 @dataclasses.dataclass
-class FileSet:
+class FileSet(Node):
     """FileSet analogue for editor"""

-    ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
     contained_in: list[str] = dataclasses.field(default_factory=list)
     description: str | None = None
     encoding_format: str | None = ""
     includes: str | None = ""
-    name: str = ""
-    uuid: str | None = None


 @dataclasses.dataclass
-class Field:
+class Field(Node):
     """Field analogue for editor"""

-    ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
-    name: str | None = None
     description: str | None = None
     data_types: str | list[str] | None = None
     source: mlc.Source | None = None
     references: mlc.Source | None = None
-    uuid: str | None = None


 @dataclasses.dataclass
-class RecordSet:
+class RecordSet(Node):
     """Record Set analogue for editor"""

-    ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
-    name: str = ""
     data: list[Any] | None = None
+    data_types: list[str] | None = None
     description: str | None = None
     is_enumeration: bool | None = None
     key: str | list[str] | None = None
     fields: list[Field] = dataclasses.field(default_factory=list)
-    uuid: str | None = None


 @dataclasses.dataclass
-class Metadata:
+class Metadata(Node):
     """main croissant data object, helper functions exist to load and unload this into the mlcroissant version"""

-    name: str = ""
     description: str | None = None
     cite_as: str | None = None
-    creators: list[mlc.PersonOrOrganization] = dataclasses.field(default_factory=list)
-    ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
+    creators: list[mlc.Person] = dataclasses.field(default_factory=list)
     data_biases: str | None = None
     data_collection: str | None = None
     date_published: datetime.datetime | None = None
     license: str | None = ""
     personal_sensitive_information: str | None = None
-    uuid: str | None = None
     url: str = ""
     distribution: list[FileObject | FileSet] = dataclasses.field(default_factory=list)
     record_sets: list[RecordSet] = dataclasses.field(default_factory=list)
@@ -211,6 +207,8 @@ class Metadata:
         """Renames a resource by changing all the references to this resource."""
         # Update other resources:
         for i, resource in enumerate(self.distribution):
+            if resource.id == old_name:
+                self.distribution[i].id = new_name
             contained_in = resource.contained_in
             if contained_in and old_name in contained_in:
                 self.distribution[i].contained_in = [
@@ -222,55 +220,89 @@ class Metadata:
     def rename_record_set(self, old_name: str, new_name: str):
         """Renames a RecordSet by changing all the references to this RecordSet."""
         for i, record_set in enumerate(self.record_sets):
+            if record_set.id == old_name:
+                self.record_sets[i].id = new_name
             for j, field in enumerate(record_set.fields):
-                possible_uid = f"{old_name}/"
+                possible_uuid = f"{old_name}/"
                 # Update source
                 source = field.source
-                if (
-                    source
-                    and source.uid
-                    and (
-                        source.uid.startswith(possible_uid)
-                        or source.uid == old_name
-                    )
-                ):
-                    new_uid = source.uid.replace(old_name, new_name, 1)
-                    self.record_sets[i].fields[j].source.uid = new_uid
+                if source and source.field and source.field.startswith(possible_uuid):
+                    new_uuid = source.field.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].source.field = new_uuid
+                if source and source.file_object and source.file_object == old_name:
+                    self.record_sets[i].fields[j].source.file_object = new_name
+                if source and source.file_set and source.file_set == old_name:
+                    self.record_sets[i].fields[j].source.file_set = new_name
+                if source and source.distribution and source.distribution == old_name:
+                    self.record_sets[i].fields[j].source.distribution = new_name
                 # Update references
                 references = field.references
                 if (
                     references
-                    and references.uid
-                    and (
-                        references.uid.startswith(possible_uid)
-                        or references.uid == old_name
-                    )
+                    and references.field
+                    and references.field.startswith(possible_uuid)
                 ):
-                    new_uid = references.uid.replace(old_name, new_name, 1)
-                    self.record_sets[i].fields[j].references.uid = new_uid
+                    new_uuid = references.field.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].references.field = new_uuid
+                if (
+                    references
+                    and references.file_object
+                    and references.file_object == old_name
+                ):
+                    self.record_sets[i].fields[j].references.file_object = new_name
+                if (
+                    references
+                    and references.file_set
+                    and references.file_set == old_name
+                ):
+                    self.record_sets[i].fields[j].references.file_set = new_name
+                if (
+                    references
+                    and references.distribution
+                    and references.distribution == old_name
+                ):
+                    self.record_sets[i].fields[j].references.distribution = new_name

     def rename_field(self, old_name: str, new_name: str):
         """Renames a field by changing all the references to this field."""
         for i, record_set in enumerate(self.record_sets):
             for j, field in enumerate(record_set.fields):
+                possible_uuid = f"/{old_name}"
                 # Update source
                 source = field.source
                 # The difference with RecordSet is the `.endswith` here:
-                if (
-                    source
-                    and source.uid
-                    and "/" in source.uid
-                    and source.uid.endswith(old_name)
-                ):
-                    new_uid = source.uid.replace(old_name, new_name, 1)
-                    self.record_sets[i].fields[j].source.uid = new_uid
+                if source and source.field and source.field.endswith(possible_uuid):
+                    new_uuid = source.field.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].source.field = new_uuid
                 # Update references
                 references = field.references
                 if (
                     references
-                    and references.uid
-                    and "/" in references.uid
-                    and references.uid.endswith(old_name)
+                    and references.field
+                    and references.field.endswith(possible_uuid)
                 ):
-                    new_uid = references.uid.replace(old_name, new_name, 1)
-                    self.record_sets[i].fields[j].references.uid = new_uid
+                    new_uuid = references.field.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].references.field = new_uuid
+
+    def rename_id(self, old_id: str, new_id: str):
+        for resource in self.distribution:
+            if resource.id == old_id:
+                resource.id = new_id
+            if resource.contained_in and old_id in resource.contained_in:
+                resource.contained_in = [
+                    new_id if uuid == old_id else uuid for uuid in resource.contained_in
+                ]
+        for record_set in self.record_sets:
+            if record_set.id == old_id:
+                record_set.id = new_id
+            for field in record_set.fields:
+                if field.id == old_id:
+                    field.id = new_id
+                for p in ["distribution", "field", "file_object", "file_set"]:
+                    if field.source and getattr(field.source, p) == old_id:
+                        setattr(field.source, p, new_id)
+                    if field.references and getattr(field.references, p) == old_id:
+                        setattr(field.references, p, new_id)

     def add_distribution(self, distribution: FileSet | FileObject) -> None:
         self.distribution.append(distribution)
@@ -352,8 +384,16 @@ class Metadata:
         )

     def names(self) -> set[str]:
+        distribution = set()
+        record_sets = set()
+        fields = set()
+        for resource in self.distribution:
+            distribution.add(resource.get_name_or_id())
+        for record_set in self.record_sets:
+            record_sets.add(record_set.get_name_or_id())
+            for field in record_set.fields:
+                fields.add(field.get_name_or_id())
+        return distribution.union(record_sets).union(fields)


 class OpenTab:
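The heart of this change is the new `Node` base dataclass: every editor entity now carries both a legacy `name` and a Croissant 1.0 `id`, and `get_name_or_id` picks one based on the context version. A self-contained sketch of the pattern, with a hypothetical `Ctx` stand-in replacing `mlc.Context`:

import dataclasses

@dataclasses.dataclass
class Ctx:
    # Stand-in for mlc.Context: is_v0() tells whether the dataset uses the
    # pre-1.0 Croissant vocabulary (keyed by name) or 1.0 (keyed by id).
    conforms_to: str = "1.0"

    def is_v0(self) -> bool:
        return self.conforms_to.startswith("0.")

@dataclasses.dataclass
class Node:
    ctx: Ctx = dataclasses.field(default_factory=Ctx)
    id: str | None = None
    name: str | None = None

    def get_name_or_id(self):
        return self.name if self.ctx.is_v0() else self.id

@dataclasses.dataclass
class FileObject(Node):
    description: str | None = None

file_object = FileObject(id="passengers.csv", name="passengers")
print(file_object.get_name_or_id())  # "passengers.csv" under a 1.0 context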
core/state_test.py
ADDED

@@ -0,0 +1,32 @@
+"""Tests for state."""
+
+from etils import epath
+
+import mlcroissant as mlc
+
+from .state import Metadata
+
+
+def test_rename_record_set():
+    ctx = mlc.Context()
+    path = epath.Path(__file__).parent.parent / "cypress/fixtures/1.0/titanic.json"
+    canonical_metadata = mlc.Metadata.from_file(ctx, path)
+    metadata = Metadata.from_canonical(canonical_metadata)
+
+    # Rename RecordSet:
+    assert metadata.record_sets[0].id == "genders"
+    assert metadata.record_sets[2].fields[1].id == "passengers/gender"
+    assert metadata.record_sets[2].fields[1].references.field == "genders/label"
+    metadata.rename_record_set("genders", "NEW_GENDERS")
+    assert metadata.record_sets[0].id == "NEW_GENDERS"
+    assert metadata.record_sets[2].fields[1].references.field == "NEW_GENDERS/label"
+
+    # Rename Field:
+    metadata.rename_field("label", "NEW_LABEL")
+    assert metadata.record_sets[2].fields[1].references.field == "NEW_GENDERS/NEW_LABEL"
+
+    # Rename Distribution:
+    assert metadata.record_sets[2].fields[0].id == "passengers/name"
+    assert metadata.record_sets[2].fields[0].source.file_object == "passengers.csv"
+    metadata.rename_distribution("passengers.csv", "NEW_PASSENGERS.CSV")
+    assert metadata.record_sets[2].fields[0].source.file_object == "NEW_PASSENGERS.CSV"
events/fields.py
CHANGED

@@ -58,6 +58,7 @@ class FieldEvent(enum.Enum):
     """Event that triggers a field change."""

     NAME = "NAME"
+    ID = "ID"
     DESCRIPTION = "DESCRIPTION"
     DATA_TYPE = "DATA_TYPE"
     SOURCE = "SOURCE"
@@ -86,13 +87,20 @@ def handle_field_change(
         metadata: Metadata = st.session_state[Metadata]
         metadata.rename_field(old_name=old_name, new_name=new_name)
         field.name = value
+    elif change == FieldEvent.ID:
+        old_id = field.id
+        new_id = value
+        if old_id != new_id:
+            metadata: Metadata = st.session_state[Metadata]
+            metadata.rename_id(old_id=old_id, new_id=new_id)
     elif change == FieldEvent.DESCRIPTION:
         field.description = value
     elif change == FieldEvent.DATA_TYPE:
         field.data_types = [str_to_mlc_data_type(value)]
     elif change == FieldEvent.SOURCE:
+        source = (
+            mlc.Source(field=value) if "/" in value else mlc.Source(file_object=value)
+        )
         field.source = source
     elif change == FieldEvent.SOURCE_EXTRACT:
         source = field.source
@@ -131,8 +139,9 @@ def handle_field_change(
         if number is not None and number < len(field.source.transforms):
             field.source.transforms[number] = mlc.Transform(separator=value)
     elif change == FieldEvent.REFERENCE:
+        source = (
+            mlc.Source(field=value) if "/" in value else mlc.Source(file_object=value)
+        )
         field.references = source
     elif change == FieldEvent.REFERENCE_EXTRACT:
         source = field.references
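Both the SOURCE and REFERENCE branches now rebuild the `mlc.Source` from the selected string with a simple heuristic: a value containing "/" is a `record_set/field` id, anything else names a file object. A hedged sketch of that rule (it assumes file ids never contain a slash; the helper name is illustrative):

import mlcroissant as mlc

def source_from_selection(value: str) -> mlc.Source:
    # "some_record_set/some_field" -> reference to another field;
    # "some_file.csv"              -> reference to a FileObject.
    if "/" in value:
        return mlc.Source(field=value)
    return mlc.Source(file_object=value)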
events/metadata.py
CHANGED

@@ -130,16 +130,16 @@ def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
         date = st.session_state[key]
         metadata.date_published = datetime.datetime(date.year, date.month, date.day)
     elif event == MetadataEvent.CREATOR_ADD:
-        metadata.creators = [mlc.PersonOrOrganization()]
+        metadata.creators = [mlc.Person()]
     elif event == MetadataEvent.CREATOR_REMOVE:
         metadata.creators = []
     elif event == MetadataEvent.CREATOR_NAME:
         if metadata.creators:
             metadata.creators[0].name = st.session_state[key]
         else:
-            metadata.creators = [mlc.PersonOrOrganization(name=st.session_state[key])]
+            metadata.creators = [mlc.Person(name=st.session_state[key])]
     elif event == MetadataEvent.CREATOR_URL:
         if metadata.creators:
             metadata.creators[0].url = st.session_state[key]
         else:
-            metadata.creators = [mlc.PersonOrOrganization(url=st.session_state[key])]
+            metadata.creators = [mlc.Person(url=st.session_state[key])]
events/record_sets.py
CHANGED

@@ -11,6 +11,7 @@ class RecordSetEvent(enum.Enum):
     """Event that triggers a RecordSet change."""

     NAME = "NAME"
+    ID = "ID"
     DESCRIPTION = "DESCRIPTION"
     IS_ENUMERATION = "IS_ENUMERATION"
     HAS_DATA = "HAS_DATA"
@@ -26,6 +27,12 @@ def handle_record_set_change(event: RecordSetEvent, record_set: RecordSet, key:
         metadata: Metadata = st.session_state[Metadata]
         metadata.rename_record_set(old_name=old_name, new_name=new_name)
         record_set.name = value
+    elif event == RecordSetEvent.ID:
+        old_id = record_set.id
+        new_id = value
+        if old_id != new_id:
+            metadata: Metadata = st.session_state[Metadata]
+            metadata.rename_id(old_id=old_id, new_id=new_id)
     elif event == RecordSetEvent.DESCRIPTION:
         record_set.description = value
     elif event == RecordSetEvent.IS_ENUMERATION:
events/resources.py
CHANGED

@@ -17,6 +17,7 @@ class ResourceEvent(enum.Enum):
    """Event that triggers a resource change."""

     NAME = "NAME"
+    ID = "ID"
     DESCRIPTION = "DESCRIPTION"
     ENCODING_FORMAT = "ENCODING_FORMAT"
     INCLUDES = "INCLUDES"
@@ -36,6 +37,12 @@ def handle_resource_change(event: ResourceEvent, resource: Resource, key: str):
         metadata: Metadata = st.session_state[Metadata]
         metadata.rename_distribution(old_name=old_name, new_name=new_name)
         resource.name = value
+    elif event == ResourceEvent.ID:
+        old_id = resource.id
+        new_id = value
+        if old_id != new_id:
+            metadata: Metadata = st.session_state[Metadata]
+            metadata.rename_id(old_id=old_id, new_id=new_id)
     elif event == ResourceEvent.DESCRIPTION:
         resource.description = value
     elif event == ResourceEvent.ENCODING_FORMAT:
events/resources_test.py
CHANGED

@@ -6,6 +6,7 @@ from .resources import _create_instance1_from_instance2

 def test_create_instance1_from_instance2():
     file_object = FileObject(
+        id="id",
         name="name",
         description="description",
         contained_in=["foo", "bar"],
@@ -13,6 +14,7 @@ def test_create_instance1_from_instance2():
     )
     file_set = _create_instance1_from_instance2(file_object, FileSet)
     assert isinstance(file_set, FileSet)
+    assert file_set.id == "id"
     assert file_set.name == "name"
     assert file_set.description == "description"
     assert file_set.contained_in == ["foo", "bar"]
requirements.txt
CHANGED

@@ -1,5 +1,5 @@
 etils[epath]
-mlcroissant
+mlcroissant==1.0.3
 numpy
 pandas
 pytest
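Pinning mlcroissant to 1.0.3 keeps the editor and the library in lockstep on the 1.0 vocabulary this change depends on (`Source.field`, `Source.file_object`, ids on nodes); an unpinned install could silently pull an incompatible version.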
views/files.py
CHANGED

@@ -84,7 +84,7 @@ def _render_resources_panel(files: list[Resource]) -> Resource | None:
     filename_to_file: dict[str, list[Resource]] = {}
     nodes = []
     for file in files:
-        name = file.name
+        name = file.get_name_or_id()
         filename_to_file[name] = file
         type = "FileObject" if isinstance(file, FileObject) else "FileSet"
         if file.contained_in:
@@ -141,7 +141,7 @@ def _render_upload_panel():
             record_sets = infer_record_sets(file, names)
             for record_set in record_sets:
                 st.session_state[Metadata].add_record_set(record_set)
-            st.session_state[SelectedResource] = file.name
+            st.session_state[SelectedResource] = file.get_name_or_id()

         st.form_submit_button("Upload", on_click=handle_on_click)

@@ -159,7 +159,7 @@ def _render_resource_details(selected_file: Resource):
     """Renders the details of the selected resource."""
     file: FileObject | FileSet
     for i, file in enumerate(st.session_state[Metadata].distribution):
-        if file.name == selected_file.name:
+        if file.get_name_or_id() == selected_file.get_name_or_id():
             is_file_object = isinstance(file, FileObject)
             index = (
                 RESOURCE_TYPES.index(FILE_OBJECT)
@@ -209,14 +209,24 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
         args=(ResourceEvent.CONTAINED_IN, file, key),
     )
     key = f"{prefix}_name"
-    st.text_input(
-        needed_field("Name"),
-        value=file.name,
-        key=key,
-        help=f"The name of the resource. {NAMES_INFO}",
-        on_change=handle_resource_change,
-        args=(ResourceEvent.NAME, file, key),
-    )
+    if file.ctx.is_v0():
+        st.text_input(
+            needed_field("Name"),
+            value=file.name,
+            key=key,
+            help=f"The name of the resource. {NAMES_INFO}",
+            on_change=handle_resource_change,
+            args=(ResourceEvent.NAME, file, key),
+        )
+    else:
+        st.text_input(
+            needed_field("ID"),
+            value=file.id,
+            key=key,
+            help=f"The ID of the resource. {NAMES_INFO}",
+            on_change=handle_resource_change,
+            args=(ResourceEvent.ID, file, key),
+        )
     key = f"{prefix}_description"
     st.text_area(
         "Description",
views/jsonld.py
CHANGED

@@ -14,6 +14,7 @@ def render_jsonld():
     for file in croissant.distribution:
         distribution.append(
             mlc.FileObject(
+                id=file.id,
                 name=file.name,
                 description=file.description,
                 content_url=file.content_url,
@@ -27,18 +28,19 @@ def render_jsonld():
         for _, field in record_set.get("fields", pd.DataFrame()).iterrows():
             fields.append(
                 mlc.Field(
+                    id=field["id"],
                     name=field["name"],
                     description=field["description"],
                     data_types=field["data_type"],
                     source=mlc.Source(
-                        node_type="distribution",
+                        distribution=file.name,
                         extract=mlc.Extract(column=field["name"]),
                     ),
                 )
             )
         record_sets.append(
             mlc.RecordSet(
+                id=record_set["id"],
                 name=record_set["name"],
                 description=record_set["description"],
                 fields=fields,
@@ -46,6 +48,7 @@ def render_jsonld():
         )
     if croissant.metadata:
         metadata = mlc.Metadata(
+            id=croissant.metadata.id,
             name=croissant.metadata.name,
             cite_as=croissant.metadata.cite_as,
             license=croissant.metadata.license,
views/record_sets.py
CHANGED

@@ -119,11 +119,18 @@ def _data_editor_key(record_set_key: int, record_set: RecordSet) -> str:

 def _get_possible_sources(metadata: Metadata) -> list[str]:
     possible_sources: list[str] = []
-    for resource in metadata.distribution:
-        possible_sources.append(resource.name)
-    for record_set in metadata.record_sets:
-        for field in record_set.fields:
-            possible_sources.append(f"{record_set.name}/{field.name}")
+    if metadata.ctx.is_v0():
+        for resource in metadata.distribution:
+            possible_sources.append(resource.name)
+        for record_set in metadata.record_sets:
+            for field in record_set.fields:
+                possible_sources.append(f"{record_set.name}/{field.name}")
+    else:
+        for resource in metadata.distribution:
+            possible_sources.append(resource.id)
+        for record_set in metadata.record_sets:
+            for field in record_set.fields:
+                possible_sources.append(field.id)
     return possible_sources


@@ -132,18 +139,18 @@ Join = tuple[LeftOrRight, LeftOrRight]


 def _find_left_or_right(source: mlc.Source) -> LeftOrRight:
-    uid = source.uid
-    if "/" in uid:
-        parts = uid.split("/")
+    uuid = source.uuid
+    if "/" in uuid:
+        parts = uuid.split("/")
         return (parts[0], parts[1])
     elif source.extract.column:
-        return (uid, source.extract.column)
+        return (uuid, source.extract.column)
     elif source.extract.json_path:
-        return (uid, source.extract.json_path)
+        return (uuid, source.extract.json_path)
     elif source.extract.file_property:
-        return (uid, source.extract.file_property)
+        return (uuid, source.extract.file_property)
     else:
-        return (uid, None)
+        return (uuid, None)


 def _find_joins(fields: list[Field]) -> set[Join]:
@@ -159,7 +166,8 @@ def _find_joins(fields: list[Field]) -> set[Join]:

 def _handle_create_record_set():
     metadata: Metadata = st.session_state[Metadata]
-    metadata.add_record_set(RecordSet(name="new-record-set", description=""))
+    name = "new-record-set"
+    metadata.add_record_set(RecordSet(id=name, name=name, description=""))


 def _handle_remove_record_set(record_set_key: int):
@@ -188,6 +196,7 @@ def _handle_fields_change(record_set_key: int, record_set: RecordSet):
     for added_row in result["added_rows"]:
         data_type = str_to_mlc_data_type(added_row.get(FieldDataFrame.DATA_TYPE))
         field = Field(
+            id=added_row.get(FieldDataFrame.NAME),
             name=added_row.get(FieldDataFrame.NAME),
             description=added_row.get(FieldDataFrame.DESCRIPTION),
             data_types=[data_type],
@@ -235,15 +244,26 @@ def _render_left_panel():
     with st.expander(title, expanded=is_record_set_expanded(record_set)):
         col1, col2 = st.columns([1, 3])
         key = f"{prefix}-name"
-        col1.text_input(
-            needed_field("Name"),
-            placeholder="Name without special character.",
-            key=key,
-            help=f"The name of the RecordSet. {NAMES_INFO}",
-            value=record_set.name,
-            on_change=handle_record_set_change,
-            args=(RecordSetEvent.NAME, record_set, key),
-        )
+        if record_set.ctx.is_v0():
+            col1.text_input(
+                needed_field("Name"),
+                placeholder="Name without special character.",
+                key=key,
+                help=f"The name of the RecordSet. {NAMES_INFO}",
+                value=record_set.name,
+                on_change=handle_record_set_change,
+                args=(RecordSetEvent.NAME, record_set, key),
+            )
+        else:
+            col1.text_input(
+                needed_field("ID"),
+                placeholder="ID without special character.",
+                key=key,
+                help=f"The ID of the resource. {NAMES_INFO}",
+                value=record_set.name,
+                on_change=handle_record_set_change,
+                args=(RecordSetEvent.ID, record_set, key),
+            )
         key = f"{prefix}-description"
         col2.text_input(
             "Description",
@@ -452,15 +472,26 @@ def _render_right_panel():
     col1, col2, col3 = st.columns([1, 1, 1])

     key = f"{prefix}-name"
-    col1.text_input(
-        needed_field("Name"),
-        placeholder="Name without special character.",
-        key=key,
-        help=f"The name of the field. {NAMES_INFO}",
-        value=field.name,
-        on_change=handle_field_change,
-        args=(FieldEvent.NAME, field, key),
-    )
+    if field.ctx.is_v0():
+        col1.text_input(
+            needed_field("Name"),
+            placeholder="Name without special character.",
+            key=key,
+            help=f"The name of the field. {NAMES_INFO}",
+            value=field.name,
+            on_change=handle_field_change,
+            args=(FieldEvent.NAME, field, key),
+        )
+    else:
+        col1.text_input(
+            needed_field("ID"),
+            placeholder="ID without special character.",
+            key=key,
+            help=f"The ID of the field. {NAMES_INFO}",
+            value=field.id,
+            on_change=handle_field_change,
+            args=(FieldEvent.ID, field, key),
+        )
     key = f"{prefix}-description"
     col2.text_input(
         "Description",
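`_find_left_or_right` normalizes a source into a `(resource, key)` pair for join detection: a uuid containing "/" already encodes `record_set/field`, otherwise the file is paired with whichever extract (column, JSON path, or file property) is set. A simplified, dependency-free sketch of the pairing; the helper name is illustrative:

def left_or_right(uuid: str, column=None, json_path=None, file_property=None):
    # "record_set/field" ids already name both sides of the pair.
    if "/" in uuid:
        parts = uuid.split("/")
        return (parts[0], parts[1])
    # Otherwise pair the file with the extraction key that is set.
    return (uuid, column or json_path or file_property)

print(left_or_right("passengers/name"))                # ('passengers', 'name')
print(left_or_right("passengers.csv", column="name"))  # ('passengers.csv', 'name')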
views/record_sets_test.py
CHANGED

@@ -6,17 +6,19 @@ from views.record_sets import _find_joins

 def test_find_joins():
     fields = [
         Field(
+            id="field1",
             name="field1",
             source=mlc.Source(
-                uid="some_csv", extract=mlc.Extract(column="some_column")
+                file_object="some_csv", extract=mlc.Extract(column="some_column")
             ),
-            references=mlc.Source(uid="some_record_set/some_field"),
+            references=mlc.Source(field="some_record_set/some_field"),
         ),
-        Field(name="field2", source=mlc.Source(uid="foo/bar")),
+        Field(id="field2", name="field2", source=mlc.Source(field="foo/bar")),
         Field(
+            id="field3",
             name="field3",
-            source=mlc.Source(uid="some_record_set/some_field"),
-            references=mlc.Source(uid="some_other_record_set/some_other_field"),
+            source=mlc.Source(field="some_record_set/some_field"),
+            references=mlc.Source(field="some_other_record_set/some_other_field"),
         ),
     ]
     assert _find_joins(fields) == set(
views/source.py
CHANGED

@@ -123,7 +123,7 @@ def render_source(
     prefix = f"source-{record_set.name}-{field.name}"
     col1, col2, col3 = st.columns([1, 1, 1])
     index = (
-        possible_sources.index(source.uid) if source.uid in possible_sources else None
+        possible_sources.index(source.uuid) if source.uuid in possible_sources else None
     )
     options = [s for s in possible_sources if not s.startswith(record_set.name)]
     if index and (index < 0 or index >= len(options)):
@@ -140,7 +140,7 @@ def render_source(
         on_change=handle_field_change,
         args=(FieldEvent.SOURCE, field, key),
     )
-    if source.node_type == "distribution":
+    if source.distribution or source.file_object or source.file_set:
         extract = col2.selectbox(
             needed_field("Extract"),
             index=_get_extract_index(source),
@@ -294,8 +294,8 @@ def render_references(
     if references or has_clicked_button:
         col1, col2, col3, col4 = st.columns([4.5, 4, 4, 1])
         index = (
-            possible_sources.index(references.uid)
-            if references.uid in possible_sources
+            possible_sources.index(references.uuid)
+            if references.uuid in possible_sources
             else None
         )
         options = [s for s in possible_sources if not s.startswith(record_set.name)]
@@ -310,7 +310,7 @@ def render_references(
             on_change=handle_field_change,
             args=(FieldEvent.REFERENCE, field, key),
         )
-        if references.node_type == "distribution":
+        if references.distribution or references.file_object or references.file_set:
             key = f"{key}-extract-references"
             extract = col2.selectbox(
                 needed_field("Extract the reference"),
views/wizard.py
CHANGED

@@ -23,8 +23,9 @@ from views.record_sets import render_record_sets
 def _export_json() -> str | None:
     metadata: Metadata = st.session_state[Metadata]
     try:
+        name = metadata.name or "metadata"
         return {
-            "name": f"croissant-{metadata.name.lower()}.json",
+            "name": f"croissant-{name.lower()}.json",
             "content": json.dumps(metadata.to_canonical().to_json()),
         }
     except mlc.ValidationError as exception:
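The `name = metadata.name or "metadata"` guard matters because `metadata.name` can be `None` for a dataset that has not been named yet; calling `.lower()` on it directly would raise an `AttributeError` during export, so the fallback keeps the download filename well formed.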
|