Spaces:
Runtime error
Runtime error
Deploy (see actual commits on https://github.com/mlcommons/croissant).
Browse files- components/safe_button.py +36 -0
- components/tabs/frontend/build/asset-manifest.json +3 -3
- components/tabs/frontend/build/index.html +1 -1
- components/tabs/frontend/build/static/js/main.e6b754d8.js +0 -0
- components/tabs/frontend/build/static/js/main.e6b754d8.js.LICENSE.txt +73 -0
- components/tabs/frontend/build/static/js/main.e6b754d8.js.map +0 -0
- components/tabs/frontend/src/Tabs.tsx +30 -9
- core/constants.py +5 -0
- core/files.py +4 -1
- core/names.py +5 -0
- core/names_test.py +1 -0
- events/metadata.py +2 -1
- views/files.py +25 -6
- views/overview.py +11 -6
- views/record_sets.py +32 -18
- views/source.py +46 -11
- views/splash.py +32 -2
components/safe_button.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
HAS_CONFIRMED = "SAFELY_UPDATE"
|
4 |
+
|
5 |
+
|
6 |
+
def handle_on_click(on_click):
    """Returns the callback a confirmation button should use for this run.

    The confirmation is a two-click flow driven by the `HAS_CONFIRMED` flag
    stored in `st.session_state`:

    * Flag not set: the returned callback only flips the flag, so the first
      click asks for confirmation instead of running `on_click`.
    * Flag set: the returned callback clears the flag and then runs
      `on_click`, so the next destructive action has to be confirmed again.

    Args:
      on_click: The callback to execute once the user has confirmed. May be
        None, in which case confirming only clears the flag.

    Returns:
      A callable accepting the same positional and keyword arguments that
      would have been forwarded to `on_click`.
    """
    if st.session_state.get(HAS_CONFIRMED):

        def confirmed_on_click(*args, **kwargs):
            # Clear the flag before delegating. Otherwise it would stay set
            # forever after the first confirmation and every later click would
            # execute immediately, without asking the user again.
            st.session_state[HAS_CONFIRMED] = False
            if on_click is not None:
                return on_click(*args, **kwargs)

        return confirmed_on_click
    else:

        def toggle_has_confirmed(*args, **kwargs):
            # The button forwards the arguments meant for `on_click`; they are
            # irrelevant while we are only asking for confirmation.
            del args, kwargs  # unused.
            st.session_state[HAS_CONFIRMED] = not st.session_state.get(HAS_CONFIRMED)

        return toggle_has_confirmed
|
17 |
+
|
18 |
+
|
19 |
+
def button_with_confirmation(
    label: str,
    key: str = None,
    on_click=None,
    args=None,
    kwargs=None,
):
    """Renders a secondary button that asks for confirmation before acting.

    The callback actually attached to the button is chosen by
    `handle_on_click`. While the `HAS_CONFIRMED` flag is set in
    `st.session_state`, an error message is shown asking the user to click
    again to confirm.

    Args:
      label: Text of the button; its lower-cased form is used in the message.
      key: Optional key forwarded to `st.button`.
      on_click: Callback handed to `handle_on_click`.
      args: Positional arguments forwarded to the button callback.
      kwargs: Keyword arguments forwarded to the button callback.
    """
    guarded_callback = handle_on_click(on_click)
    st.button(
        label,
        key=key,
        type="secondary",
        on_click=guarded_callback,
        args=args,
        kwargs=kwargs,
    )
    awaiting_confirmation = st.session_state.get(HAS_CONFIRMED)
    if not awaiting_confirmation:
        return
    st.error(f"Do you really want to {label.lower()}? Click again to confirm.")
|
components/tabs/frontend/build/asset-manifest.json
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
{
|
2 |
"files": {
|
3 |
-
"main.js": "./static/js/main.
|
4 |
"index.html": "./index.html",
|
5 |
-
"main.
|
6 |
},
|
7 |
"entrypoints": [
|
8 |
-
"static/js/main.
|
9 |
]
|
10 |
}
|
|
|
1 |
{
|
2 |
"files": {
|
3 |
+
"main.js": "./static/js/main.e6b754d8.js",
|
4 |
"index.html": "./index.html",
|
5 |
+
"main.e6b754d8.js.map": "./static/js/main.e6b754d8.js.map"
|
6 |
},
|
7 |
"entrypoints": [
|
8 |
+
"static/js/main.e6b754d8.js"
|
9 |
]
|
10 |
}
|
components/tabs/frontend/build/index.html
CHANGED
@@ -1 +1 @@
|
|
1 |
-
<!doctype html><html lang="en"><head><title>Streamlit Tabs Component</title><meta charset="UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Streamlit Tree Component"/><script defer="defer" src="./static/js/main.
|
|
|
1 |
+
<!doctype html><html lang="en"><head><title>Streamlit Tabs Component</title><meta charset="UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Streamlit Tree Component"/><script defer="defer" src="./static/js/main.e6b754d8.js"></script></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>
|
components/tabs/frontend/build/static/js/main.e6b754d8.js
ADDED
The diff for this file is too large to render.
See raw diff
|
|
components/tabs/frontend/build/static/js/main.e6b754d8.js.LICENSE.txt
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/*
|
2 |
+
object-assign
|
3 |
+
(c) Sindre Sorhus
|
4 |
+
@license MIT
|
5 |
+
*/
|
6 |
+
|
7 |
+
/**
|
8 |
+
* @license React
|
9 |
+
* react-dom.production.min.js
|
10 |
+
*
|
11 |
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
12 |
+
*
|
13 |
+
* This source code is licensed under the MIT license found in the
|
14 |
+
* LICENSE file in the root directory of this source tree.
|
15 |
+
*/
|
16 |
+
|
17 |
+
/**
|
18 |
+
* @license React
|
19 |
+
* react-is.production.min.js
|
20 |
+
*
|
21 |
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
22 |
+
*
|
23 |
+
* This source code is licensed under the MIT license found in the
|
24 |
+
* LICENSE file in the root directory of this source tree.
|
25 |
+
*/
|
26 |
+
|
27 |
+
/**
|
28 |
+
* @license React
|
29 |
+
* react-jsx-runtime.production.min.js
|
30 |
+
*
|
31 |
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
32 |
+
*
|
33 |
+
* This source code is licensed under the MIT license found in the
|
34 |
+
* LICENSE file in the root directory of this source tree.
|
35 |
+
*/
|
36 |
+
|
37 |
+
/**
|
38 |
+
* @license React
|
39 |
+
* react.production.min.js
|
40 |
+
*
|
41 |
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
42 |
+
*
|
43 |
+
* This source code is licensed under the MIT license found in the
|
44 |
+
* LICENSE file in the root directory of this source tree.
|
45 |
+
*/
|
46 |
+
|
47 |
+
/**
|
48 |
+
* @license React
|
49 |
+
* scheduler.production.min.js
|
50 |
+
*
|
51 |
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
52 |
+
*
|
53 |
+
* This source code is licensed under the MIT license found in the
|
54 |
+
* LICENSE file in the root directory of this source tree.
|
55 |
+
*/
|
56 |
+
|
57 |
+
/** @license React v16.13.1
|
58 |
+
* react-is.production.min.js
|
59 |
+
*
|
60 |
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
61 |
+
*
|
62 |
+
* This source code is licensed under the MIT license found in the
|
63 |
+
* LICENSE file in the root directory of this source tree.
|
64 |
+
*/
|
65 |
+
|
66 |
+
/** @license React v16.14.0
|
67 |
+
* react.production.min.js
|
68 |
+
*
|
69 |
+
* Copyright (c) Facebook, Inc. and its affiliates.
|
70 |
+
*
|
71 |
+
* This source code is licensed under the MIT license found in the
|
72 |
+
* LICENSE file in the root directory of this source tree.
|
73 |
+
*/
|
components/tabs/frontend/build/static/js/main.e6b754d8.js.map
ADDED
The diff for this file is too large to render.
See raw diff
|
|
components/tabs/frontend/src/Tabs.tsx
CHANGED
@@ -10,6 +10,7 @@ import Tab from "@mui/material/Tab"
|
|
10 |
import Box from "@mui/material/Box"
|
11 |
import { ThemeProvider, createTheme } from "@mui/material"
|
12 |
import { orange } from "@mui/material/colors"
|
|
|
13 |
|
14 |
const theme = createTheme({
|
15 |
palette: {
|
@@ -55,18 +56,38 @@ function BasicTabs({
|
|
55 |
</Tabs>
|
56 |
</Box>
|
57 |
</Box>
|
58 |
-
<
|
59 |
-
|
60 |
-
variant="outlined"
|
61 |
-
href={
|
62 |
json
|
63 |
-
?
|
64 |
-
: ""
|
65 |
}
|
66 |
-
|
67 |
>
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
</div>
|
71 |
)
|
72 |
}
|
|
|
10 |
import Box from "@mui/material/Box"
|
11 |
import { ThemeProvider, createTheme } from "@mui/material"
|
12 |
import { orange } from "@mui/material/colors"
|
13 |
+
import Tooltip from "@mui/material/Tooltip"
|
14 |
|
15 |
const theme = createTheme({
|
16 |
palette: {
|
|
|
56 |
</Tabs>
|
57 |
</Box>
|
58 |
</Box>
|
59 |
+
<Tooltip
|
60 |
+
title={
|
|
|
|
|
61 |
json
|
62 |
+
? "Download the Croissant JSON-LD file."
|
63 |
+
: "Go to the overview to understand why the Croissant JSON-LD file cannot be generated."
|
64 |
}
|
65 |
+
placement="left"
|
66 |
>
|
67 |
+
<span>
|
68 |
+
<Button
|
69 |
+
disabled={!json}
|
70 |
+
disableElevation
|
71 |
+
variant="contained"
|
72 |
+
href={
|
73 |
+
json
|
74 |
+
? `data:text/json;charset=utf-8,${encodeURIComponent(
|
75 |
+
json.content
|
76 |
+
)}`
|
77 |
+
: ""
|
78 |
+
}
|
79 |
+
download={json ? json.name : ""}
|
80 |
+
sx={{
|
81 |
+
color: "white",
|
82 |
+
padding: "6px 20px",
|
83 |
+
textAlign: "center",
|
84 |
+
whiteSpace: "nowrap",
|
85 |
+
}}
|
86 |
+
>
|
87 |
+
Download 🥐 file
|
88 |
+
</Button>
|
89 |
+
</span>
|
90 |
+
</Tooltip>
|
91 |
</div>
|
92 |
)
|
93 |
}
|
core/constants.py
CHANGED
@@ -35,3 +35,8 @@ METADATA = "Metadata"
|
|
35 |
RESOURCES = "Resources"
|
36 |
RECORD_SETS = "Record Sets"
|
37 |
TABS = [OVERVIEW, METADATA, RESOURCES, RECORD_SETS]
|
|
|
|
|
|
|
|
|
|
|
|
35 |
RESOURCES = "Resources"
|
36 |
RECORD_SETS = "Record Sets"
|
37 |
TABS = [OVERVIEW, METADATA, RESOURCES, RECORD_SETS]
|
38 |
+
|
39 |
+
NAMES_INFO = (
|
40 |
+
"Names are used as identifiers. They are unique and cannot contain special"
|
41 |
+
" characters. The interface will replace any special characters."
|
42 |
+
)
|
core/files.py
CHANGED
@@ -142,7 +142,10 @@ def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.Data
|
|
142 |
elif file_type == FileTypes.PARQUET:
|
143 |
df = pd.read_parquet(file)
|
144 |
else:
|
145 |
-
raise NotImplementedError(
|
|
|
|
|
|
|
146 |
return df.infer_objects()
|
147 |
|
148 |
|
|
|
142 |
elif file_type == FileTypes.PARQUET:
|
143 |
df = pd.read_parquet(file)
|
144 |
else:
|
145 |
+
raise NotImplementedError(
|
146 |
+
f"File type {file_type} is not supported. Please, open an issue on GitHub:"
|
147 |
+
" https://github.com/mlcommons/croissant/issues/new"
|
148 |
+
)
|
149 |
return df.infer_objects()
|
150 |
|
151 |
|
core/names.py
CHANGED
@@ -1,8 +1,13 @@
|
|
1 |
"""Module to handle naming of RecordSets and distribution."""
|
2 |
|
|
|
|
|
|
|
|
|
3 |
|
4 |
def find_unique_name(names: set[str], name: str):
|
5 |
"""Find a unique UID."""
|
|
|
6 |
while name in names:
|
7 |
name = f"{name}_0"
|
8 |
return name
|
|
|
1 |
"""Module to handle naming of RecordSets and distribution."""
|
2 |
|
3 |
+
import re
|
4 |
+
|
5 |
+
# Matches every character that is NOT allowed in a name, i.e. anything other
# than ASCII letters, digits, `-`, `_` and `.`.
NAME_PATTERN_REGEX = r"[^a-zA-Z0-9\-_\.]"


def find_unique_name(names: set[str], name: str) -> str:
    """Sanitizes `name` and makes it unique with respect to `names`.

    Every character matched by `NAME_PATTERN_REGEX` is replaced by `_`. Then
    the suffix `_0` is appended repeatedly until the result is not in `names`.

    Args:
      names: The names that are already taken. It is not modified.
      name: The candidate name.

    Returns:
      The sanitized name, possibly followed by one or more `_0` suffixes.
    """
    name = re.sub(NAME_PATTERN_REGEX, "_", name)
    # Uniqueness is checked on the sanitized form, so a name that only collides
    # after sanitization still gets a suffix.
    while name in names:
        name = f"{name}_0"
    return name
|
core/names_test.py
CHANGED
@@ -5,6 +5,7 @@ from .names import find_unique_name
|
|
5 |
|
6 |
def test_find_unique_name():
|
7 |
names = set(["first", "second", "first_0"])
|
|
|
8 |
assert find_unique_name(names, "first") == "first_0_0"
|
9 |
assert find_unique_name(names, "second") == "second_0"
|
10 |
assert find_unique_name(names, "third") == "third"
|
|
|
5 |
|
6 |
def test_find_unique_name():
    """Checks both the sanitization and the `_0` suffixing of names."""
    names = {"first", "second", "first_0"}
    # Special characters (here spaces) are replaced by underscores.
    assert find_unique_name(names, "are there spaces") == "are_there_spaces"
    # A name that collides only once sanitized still gets a suffix.
    assert find_unique_name(names, "first 0") == "first_0_0"
    # "first" and "first_0" are both taken, so the suffix is appended twice.
    assert find_unique_name(names, "first") == "first_0_0"
    assert find_unique_name(names, "second") == "second_0"
    assert find_unique_name(names, "third") == "third"
|
events/metadata.py
CHANGED
@@ -2,6 +2,7 @@ import enum
|
|
2 |
|
3 |
import streamlit as st
|
4 |
|
|
|
5 |
from core.state import Metadata
|
6 |
|
7 |
# List from:
|
@@ -97,7 +98,7 @@ class MetadataEvent(enum.Enum):
|
|
97 |
|
98 |
def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
|
99 |
if event == MetadataEvent.NAME:
|
100 |
-
metadata.name = st.session_state[key]
|
101 |
elif event == MetadataEvent.DESCRIPTION:
|
102 |
metadata.description = st.session_state[key]
|
103 |
elif event == MetadataEvent.LICENSE:
|
|
|
2 |
|
3 |
import streamlit as st
|
4 |
|
5 |
+
from core.names import find_unique_name
|
6 |
from core.state import Metadata
|
7 |
|
8 |
# List from:
|
|
|
98 |
|
99 |
def handle_metadata_change(event: MetadataEvent, metadata: Metadata, key: str):
|
100 |
if event == MetadataEvent.NAME:
|
101 |
+
metadata.name = find_unique_name(set(), st.session_state[key])
|
102 |
elif event == MetadataEvent.DESCRIPTION:
|
103 |
metadata.description = st.session_state[key]
|
104 |
elif event == MetadataEvent.LICENSE:
|
views/files.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1 |
-
from etils import epath
|
2 |
import streamlit as st
|
3 |
|
|
|
4 |
from components.tree import render_tree
|
5 |
from core.constants import DF_HEIGHT
|
|
|
6 |
from core.constants import OAUTH_CLIENT_ID
|
7 |
from core.files import code_to_index
|
8 |
from core.files import file_from_form
|
@@ -188,9 +189,10 @@ def _render_resource_details(selected_file: Resource):
|
|
188 |
|
189 |
col1, col2 = st.columns([1, 1])
|
190 |
col1.button("Close", key=f"{i}_close", on_click=close, type="primary")
|
191 |
-
col2
|
192 |
-
|
193 |
-
|
|
|
194 |
|
195 |
|
196 |
def _render_resource(prefix: int, file: Resource, is_file_object: bool):
|
@@ -201,6 +203,11 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
|
|
201 |
default=file.contained_in,
|
202 |
options=parent_options,
|
203 |
key=key,
|
|
|
|
|
|
|
|
|
|
|
204 |
on_change=handle_resource_change,
|
205 |
args=(ResourceEvent.CONTAINED_IN, file, key),
|
206 |
)
|
@@ -209,6 +216,7 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
|
|
209 |
needed_field("Name"),
|
210 |
value=file.name,
|
211 |
key=key,
|
|
|
212 |
on_change=handle_resource_change,
|
213 |
args=(ResourceEvent.NAME, file, key),
|
214 |
)
|
@@ -224,9 +232,10 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
|
|
224 |
if is_file_object:
|
225 |
key = f"{prefix}_content_url"
|
226 |
st.text_input(
|
227 |
-
needed_field("Content URL"),
|
228 |
value=file.content_url,
|
229 |
key=key,
|
|
|
230 |
on_change=handle_resource_change,
|
231 |
args=(ResourceEvent.CONTENT_URL, file, key),
|
232 |
)
|
@@ -243,6 +252,7 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
|
|
243 |
"Content size",
|
244 |
value=file.content_size,
|
245 |
key=key,
|
|
|
246 |
on_change=handle_resource_change,
|
247 |
args=(ResourceEvent.CONTENT_SIZE, file, key),
|
248 |
)
|
@@ -261,6 +271,10 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
|
|
261 |
index=code_to_index(file.encoding_format),
|
262 |
options=FILE_TYPES.keys(),
|
263 |
key=key,
|
|
|
|
|
|
|
|
|
264 |
on_change=handle_resource_change,
|
265 |
args=(ResourceEvent.ENCODING_FORMAT, file, key),
|
266 |
)
|
@@ -269,4 +283,9 @@ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
|
|
269 |
if file.df is not None:
|
270 |
st.dataframe(file.df, height=DF_HEIGHT)
|
271 |
else:
|
272 |
-
st.button(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
|
3 |
+
from components.safe_button import button_with_confirmation
|
4 |
from components.tree import render_tree
|
5 |
from core.constants import DF_HEIGHT
|
6 |
+
from core.constants import NAMES_INFO
|
7 |
from core.constants import OAUTH_CLIENT_ID
|
8 |
from core.files import code_to_index
|
9 |
from core.files import file_from_form
|
|
|
189 |
|
190 |
col1, col2 = st.columns([1, 1])
|
191 |
col1.button("Close", key=f"{i}_close", on_click=close, type="primary")
|
192 |
+
with col2:
|
193 |
+
button_with_confirmation(
|
194 |
+
"Remove", key=f"{i}_remove", on_click=delete_line
|
195 |
+
)
|
196 |
|
197 |
|
198 |
def _render_resource(prefix: int, file: Resource, is_file_object: bool):
|
|
|
203 |
default=file.contained_in,
|
204 |
options=parent_options,
|
205 |
key=key,
|
206 |
+
help=(
|
207 |
+
"FileObjects and FileSets can be nested. Specifying `Parents` allows to"
|
208 |
+
" nest a FileObject/FileSet within another FileObject/FileSet. An example"
|
209 |
+
" of this is when images (FileSet) are nested within an archive (FileSet)."
|
210 |
+
),
|
211 |
on_change=handle_resource_change,
|
212 |
args=(ResourceEvent.CONTAINED_IN, file, key),
|
213 |
)
|
|
|
216 |
needed_field("Name"),
|
217 |
value=file.name,
|
218 |
key=key,
|
219 |
+
help=f"The name of the resource. {NAMES_INFO}",
|
220 |
on_change=handle_resource_change,
|
221 |
args=(ResourceEvent.NAME, file, key),
|
222 |
)
|
|
|
232 |
if is_file_object:
|
233 |
key = f"{prefix}_content_url"
|
234 |
st.text_input(
|
235 |
+
needed_field("Content URL or local path"),
|
236 |
value=file.content_url,
|
237 |
key=key,
|
238 |
+
help="The URL or local file path pointing to the original FileObject.",
|
239 |
on_change=handle_resource_change,
|
240 |
args=(ResourceEvent.CONTENT_URL, file, key),
|
241 |
)
|
|
|
252 |
"Content size",
|
253 |
value=file.content_size,
|
254 |
key=key,
|
255 |
+
help="The size of the original FileObject in bytes.",
|
256 |
on_change=handle_resource_change,
|
257 |
args=(ResourceEvent.CONTENT_SIZE, file, key),
|
258 |
)
|
|
|
271 |
index=code_to_index(file.encoding_format),
|
272 |
options=FILE_TYPES.keys(),
|
273 |
key=key,
|
274 |
+
help=(
|
275 |
+
"MIME type corresponding to"
|
276 |
+
" ([sc:encodingFormat](https://schema.org/encodingFormat))."
|
277 |
+
),
|
278 |
on_change=handle_resource_change,
|
279 |
args=(ResourceEvent.ENCODING_FORMAT, file, key),
|
280 |
)
|
|
|
283 |
if file.df is not None:
|
284 |
st.dataframe(file.df, height=DF_HEIGHT)
|
285 |
else:
|
286 |
+
st.button(
|
287 |
+
"Trigger download",
|
288 |
+
disabled=bool(file.content_url),
|
289 |
+
on_click=trigger_download,
|
290 |
+
args=(file,),
|
291 |
+
)
|
views/overview.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Any
|
|
3 |
|
4 |
import streamlit as st
|
5 |
|
|
|
6 |
from core.state import Metadata
|
7 |
import mlcroissant as mlc
|
8 |
from utils import needed_field
|
@@ -51,6 +52,7 @@ def render_overview():
|
|
51 |
label=needed_field("Name"),
|
52 |
key=key,
|
53 |
value=metadata.name,
|
|
|
54 |
placeholder="Dataset",
|
55 |
on_change=handle_metadata_change,
|
56 |
args=(MetadataEvent.NAME, metadata, key),
|
@@ -82,8 +84,15 @@ def render_overview():
|
|
82 |
* 100
|
83 |
/ (3 * metadata_weight)
|
84 |
)
|
85 |
-
col_a.metric(
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
col_c.metric("Number of resources", len(metadata.distribution))
|
88 |
col_d.metric("Number of RecordSets", len(metadata.record_sets))
|
89 |
with col2:
|
@@ -96,10 +105,6 @@ def render_overview():
|
|
96 |
warning += "**Errors**\n"
|
97 |
for error in issues.errors:
|
98 |
warning += f"{error}\n"
|
99 |
-
if issues.warnings:
|
100 |
-
warning += "**Warnings**\n"
|
101 |
-
for warning in issues.warnings:
|
102 |
-
warning += f"{warning}\n"
|
103 |
except mlc.ValidationError as exception:
|
104 |
warning += "**Errors**\n"
|
105 |
warning += f"{str(exception)}\n"
|
|
|
3 |
|
4 |
import streamlit as st
|
5 |
|
6 |
+
from core.constants import NAMES_INFO
|
7 |
from core.state import Metadata
|
8 |
import mlcroissant as mlc
|
9 |
from utils import needed_field
|
|
|
52 |
label=needed_field("Name"),
|
53 |
key=key,
|
54 |
value=metadata.name,
|
55 |
+
help=f"The name of the dataset. {NAMES_INFO}",
|
56 |
placeholder="Dataset",
|
57 |
on_change=handle_metadata_change,
|
58 |
args=(MetadataEvent.NAME, metadata, key),
|
|
|
84 |
* 100
|
85 |
/ (3 * metadata_weight)
|
86 |
)
|
87 |
+
col_a.metric(
|
88 |
+
"Completion",
|
89 |
+
f"{completion}%",
|
90 |
+
help=(
|
91 |
+
"Approximation of the total completion based on the number of fields"
|
92 |
+
" that are filled."
|
93 |
+
),
|
94 |
+
)
|
95 |
+
col_b.metric("Number of metadata fields", fields)
|
96 |
col_c.metric("Number of resources", len(metadata.distribution))
|
97 |
col_d.metric("Number of RecordSets", len(metadata.record_sets))
|
98 |
with col2:
|
|
|
105 |
warning += "**Errors**\n"
|
106 |
for error in issues.errors:
|
107 |
warning += f"{error}\n"
|
|
|
|
|
|
|
|
|
108 |
except mlc.ValidationError as exception:
|
109 |
warning += "**Errors**\n"
|
110 |
warning += f"{str(exception)}\n"
|
views/record_sets.py
CHANGED
@@ -9,6 +9,8 @@ import pandas as pd
|
|
9 |
from rdflib import term
|
10 |
import streamlit as st
|
11 |
|
|
|
|
|
12 |
from core.data_types import MLC_DATA_TYPES
|
13 |
from core.data_types import mlc_to_str_data_type
|
14 |
from core.data_types import STR_DATA_TYPES
|
@@ -239,6 +241,7 @@ def _render_left_panel():
|
|
239 |
needed_field("Name"),
|
240 |
placeholder="Name without special character.",
|
241 |
key=key,
|
|
|
242 |
value=record_set.name,
|
243 |
on_change=handle_record_set_change,
|
244 |
args=(RecordSetEvent.NAME, record_set, key),
|
@@ -254,16 +257,24 @@ def _render_left_panel():
|
|
254 |
)
|
255 |
key = f"{prefix}-is-enumeration"
|
256 |
st.checkbox(
|
257 |
-
"
|
258 |
key=key,
|
|
|
|
|
|
|
|
|
259 |
value=record_set.is_enumeration,
|
260 |
on_change=handle_record_set_change,
|
261 |
args=(RecordSetEvent.IS_ENUMERATION, record_set, key),
|
262 |
)
|
263 |
key = f"{prefix}-has-data"
|
264 |
st.checkbox(
|
265 |
-
"
|
266 |
key=key,
|
|
|
|
|
|
|
|
|
267 |
value=bool(record_set.data),
|
268 |
on_change=handle_record_set_change,
|
269 |
args=(RecordSetEvent.HAS_DATA, record_set, key),
|
@@ -271,7 +282,7 @@ def _render_left_panel():
|
|
271 |
|
272 |
joins = _find_joins(record_set.fields)
|
273 |
has_join = st.checkbox(
|
274 |
-
"
|
275 |
" with a source in `RecordSet`/`FileSet`/`FileObject` and a reference"
|
276 |
" to another `RecordSet`/`FileSet`/`FileObject`.",
|
277 |
key=f"{prefix}-has-joins",
|
@@ -323,8 +334,14 @@ def _render_left_panel():
|
|
323 |
)
|
324 |
data_editor_key = _data_editor_key(record_set_key, record_set)
|
325 |
st.markdown(
|
326 |
-
|
327 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
)
|
329 |
st.data_editor(
|
330 |
fields,
|
@@ -380,9 +397,8 @@ def _render_left_panel():
|
|
380 |
args=(record_set_key, record_set),
|
381 |
)
|
382 |
key = f"{prefix}-delete-record-set"
|
383 |
-
|
384 |
-
"
|
385 |
-
type="primary",
|
386 |
key=key,
|
387 |
on_click=_handle_remove_record_set,
|
388 |
args=(record_set_key,),
|
@@ -437,6 +453,7 @@ def _render_right_panel():
|
|
437 |
needed_field("Name"),
|
438 |
placeholder="Name without special character.",
|
439 |
key=key,
|
|
|
440 |
value=field.name,
|
441 |
on_change=handle_field_change,
|
442 |
args=(FieldEvent.NAME, field, key),
|
@@ -450,32 +467,29 @@ def _render_right_panel():
|
|
450 |
value=field.description,
|
451 |
args=(FieldEvent.DESCRIPTION, field, key),
|
452 |
)
|
|
|
453 |
if field.data_types:
|
454 |
data_type = field.data_types[0]
|
455 |
if isinstance(data_type, str):
|
456 |
data_type = term.URIRef(data_type)
|
457 |
if data_type in MLC_DATA_TYPES:
|
458 |
data_type_index = MLC_DATA_TYPES.index(data_type)
|
459 |
-
else:
|
460 |
-
data_type_index = None
|
461 |
-
else:
|
462 |
-
data_type_index = None
|
463 |
key = f"{prefix}-datatypes"
|
464 |
col3.selectbox(
|
465 |
needed_field("Data type"),
|
466 |
index=data_type_index,
|
467 |
options=STR_DATA_TYPES,
|
468 |
key=key,
|
|
|
|
|
|
|
|
|
469 |
on_change=handle_field_change,
|
470 |
args=(FieldEvent.DATA_TYPE, field, key),
|
471 |
)
|
472 |
possible_sources = _get_possible_sources(metadata)
|
473 |
-
render_source(
|
474 |
-
|
475 |
-
)
|
476 |
-
render_references(
|
477 |
-
record_set_key, record_set, field, field_key, possible_sources
|
478 |
-
)
|
479 |
|
480 |
st.divider()
|
481 |
|
|
|
9 |
from rdflib import term
|
10 |
import streamlit as st
|
11 |
|
12 |
+
from components.safe_button import button_with_confirmation
|
13 |
+
from core.constants import NAMES_INFO
|
14 |
from core.data_types import MLC_DATA_TYPES
|
15 |
from core.data_types import mlc_to_str_data_type
|
16 |
from core.data_types import STR_DATA_TYPES
|
|
|
241 |
needed_field("Name"),
|
242 |
placeholder="Name without special character.",
|
243 |
key=key,
|
244 |
+
help=f"The name of the RecordSet. {NAMES_INFO}",
|
245 |
value=record_set.name,
|
246 |
on_change=handle_record_set_change,
|
247 |
args=(RecordSetEvent.NAME, record_set, key),
|
|
|
257 |
)
|
258 |
key = f"{prefix}-is-enumeration"
|
259 |
st.checkbox(
|
260 |
+
"The RecordSet is an enumeration",
|
261 |
key=key,
|
262 |
+
help=(
|
263 |
+
"Enumerations indicate that the RecordSet takes its values in a"
|
264 |
+
" finite set. Similar to `ClassLabel` in TFDS or Hugging Face."
|
265 |
+
),
|
266 |
value=record_set.is_enumeration,
|
267 |
on_change=handle_record_set_change,
|
268 |
args=(RecordSetEvent.IS_ENUMERATION, record_set, key),
|
269 |
)
|
270 |
key = f"{prefix}-has-data"
|
271 |
st.checkbox(
|
272 |
+
"The RecordSet has in-line data",
|
273 |
key=key,
|
274 |
+
help=(
|
275 |
+
"In-line data allows to embed data directly within the JSON-LD"
|
276 |
+
" without referencing another data source."
|
277 |
+
),
|
278 |
value=bool(record_set.data),
|
279 |
on_change=handle_record_set_change,
|
280 |
args=(RecordSetEvent.HAS_DATA, record_set, key),
|
|
|
282 |
|
283 |
joins = _find_joins(record_set.fields)
|
284 |
has_join = st.checkbox(
|
285 |
+
"The RecordSet contains joins. To add a new join, add a field"
|
286 |
" with a source in `RecordSet`/`FileSet`/`FileObject` and a reference"
|
287 |
" to another `RecordSet`/`FileSet`/`FileObject`.",
|
288 |
key=f"{prefix}-has-joins",
|
|
|
334 |
)
|
335 |
data_editor_key = _data_editor_key(record_set_key, record_set)
|
336 |
st.markdown(
|
337 |
+
needed_field("Fields"),
|
338 |
+
help=(
|
339 |
+
"Add/delete fields by directly editing the table. Warning: the"
|
340 |
+
" table contains information about the fields--not the data"
|
341 |
+
" directly. If you wish to embed data, select `The RecordSet is an"
|
342 |
+
" enumeration` above. To edit fields details, click the button"
|
343 |
+
" `Edit fields details` below."
|
344 |
+
),
|
345 |
)
|
346 |
st.data_editor(
|
347 |
fields,
|
|
|
397 |
args=(record_set_key, record_set),
|
398 |
)
|
399 |
key = f"{prefix}-delete-record-set"
|
400 |
+
button_with_confirmation(
|
401 |
+
"Delete RecordSet",
|
|
|
402 |
key=key,
|
403 |
on_click=_handle_remove_record_set,
|
404 |
args=(record_set_key,),
|
|
|
453 |
needed_field("Name"),
|
454 |
placeholder="Name without special character.",
|
455 |
key=key,
|
456 |
+
help=f"The name of the field. {NAMES_INFO}",
|
457 |
value=field.name,
|
458 |
on_change=handle_field_change,
|
459 |
args=(FieldEvent.NAME, field, key),
|
|
|
467 |
value=field.description,
|
468 |
args=(FieldEvent.DESCRIPTION, field, key),
|
469 |
)
|
470 |
+
data_type_index = None
|
471 |
if field.data_types:
|
472 |
data_type = field.data_types[0]
|
473 |
if isinstance(data_type, str):
|
474 |
data_type = term.URIRef(data_type)
|
475 |
if data_type in MLC_DATA_TYPES:
|
476 |
data_type_index = MLC_DATA_TYPES.index(data_type)
|
|
|
|
|
|
|
|
|
477 |
key = f"{prefix}-datatypes"
|
478 |
col3.selectbox(
|
479 |
needed_field("Data type"),
|
480 |
index=data_type_index,
|
481 |
options=STR_DATA_TYPES,
|
482 |
key=key,
|
483 |
+
help=(
|
484 |
+
"The type of the data. `Text` corresponds to"
|
485 |
+
" https://schema.org/Text, etc."
|
486 |
+
),
|
487 |
on_change=handle_field_change,
|
488 |
args=(FieldEvent.DATA_TYPE, field, key),
|
489 |
)
|
490 |
possible_sources = _get_possible_sources(metadata)
|
491 |
+
render_source(record_set, field, possible_sources)
|
492 |
+
render_references(record_set, field, possible_sources)
|
|
|
|
|
|
|
|
|
493 |
|
494 |
st.divider()
|
495 |
|
views/source.py
CHANGED
@@ -12,6 +12,15 @@ from events.fields import TransformType
|
|
12 |
import mlcroissant as mlc
|
13 |
from utils import needed_field
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
class SourceType:
|
17 |
"""The type of the source (distribution or field)."""
|
@@ -105,10 +114,8 @@ def _handle_remove_reference(field):
|
|
105 |
|
106 |
|
107 |
def render_source(
|
108 |
-
record_set_key: int,
|
109 |
record_set: RecordSet,
|
110 |
field: Field,
|
111 |
-
field_key: int,
|
112 |
possible_sources: list[str],
|
113 |
):
|
114 |
"""Renders the form for the source."""
|
@@ -123,10 +130,13 @@ def render_source(
|
|
123 |
index = None
|
124 |
key = f"{prefix}-source"
|
125 |
col1.selectbox(
|
126 |
-
needed_field("
|
127 |
index=index,
|
128 |
options=options,
|
129 |
key=key,
|
|
|
|
|
|
|
130 |
on_change=handle_field_change,
|
131 |
args=(FieldEvent.SOURCE, field, key),
|
132 |
)
|
@@ -135,6 +145,7 @@ def render_source(
|
|
135 |
needed_field("Extract"),
|
136 |
index=_get_extract_index(source),
|
137 |
key=f"{prefix}-extract",
|
|
|
138 |
options=EXTRACT_TYPES,
|
139 |
on_change=handle_field_change,
|
140 |
args=(FieldEvent.SOURCE_EXTRACT, field, key),
|
@@ -145,6 +156,7 @@ def render_source(
|
|
145 |
needed_field("Column name"),
|
146 |
value=source.extract.column,
|
147 |
key=key,
|
|
|
148 |
on_change=handle_field_change,
|
149 |
args=(FieldEvent.SOURCE_EXTRACT_COLUMN, field, key),
|
150 |
)
|
@@ -154,6 +166,7 @@ def render_source(
|
|
154 |
needed_field("JSON path"),
|
155 |
value=source.extract.json_path,
|
156 |
key=key,
|
|
|
157 |
on_change=handle_field_change,
|
158 |
args=(FieldEvent.SOURCE_EXTRACT_JSON_PATH, field, key),
|
159 |
)
|
@@ -170,18 +183,23 @@ def render_source(
|
|
170 |
key=key,
|
171 |
options=TRANSFORM_TYPES,
|
172 |
on_change=handle_field_change,
|
|
|
173 |
args=(FieldEvent.TRANSFORM, field, key),
|
174 |
kwargs={"number": number},
|
175 |
)
|
176 |
if selected == TransformType.FORMAT:
|
177 |
key = f"{prefix}-{number}-transform-format"
|
178 |
col3.text_input(
|
179 |
-
needed_field("Format"),
|
180 |
value=transform.format,
|
181 |
key=key,
|
182 |
on_change=handle_field_change,
|
|
|
|
|
|
|
|
|
183 |
args=(selected, field, key),
|
184 |
-
kwargs={"number": number
|
185 |
)
|
186 |
elif selected == TransformType.JSON_PATH:
|
187 |
key = f"{prefix}-{number}-jsonpath"
|
@@ -190,8 +208,9 @@ def render_source(
|
|
190 |
value=transform.json_path,
|
191 |
key=key,
|
192 |
on_change=handle_field_change,
|
|
|
193 |
args=(selected, field, key),
|
194 |
-
kwargs={"number": number
|
195 |
)
|
196 |
elif selected == TransformType.REGEX:
|
197 |
key = f"{prefix}-{number}-regex"
|
@@ -200,8 +219,14 @@ def render_source(
|
|
200 |
value=transform.regex,
|
201 |
key=key,
|
202 |
on_change=handle_field_change,
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
args=(selected, field, key),
|
204 |
-
kwargs={"number": number
|
205 |
)
|
206 |
elif selected == TransformType.REPLACE:
|
207 |
key = f"{prefix}-{number}-replace"
|
@@ -210,8 +235,13 @@ def render_source(
|
|
210 |
value=transform.replace,
|
211 |
key=key,
|
212 |
on_change=handle_field_change,
|
|
|
|
|
|
|
|
|
|
|
213 |
args=(selected, field, key),
|
214 |
-
kwargs={"number": number
|
215 |
)
|
216 |
elif selected == TransformType.SEPARATOR:
|
217 |
key = f"{prefix}-{number}-separator"
|
@@ -220,8 +250,9 @@ def render_source(
|
|
220 |
value=transform.separator,
|
221 |
key=key,
|
222 |
on_change=handle_field_change,
|
|
|
223 |
args=(selected, field, key),
|
224 |
-
kwargs={"number": number
|
225 |
)
|
226 |
|
227 |
def _handle_remove_transform(field, number):
|
@@ -230,6 +261,7 @@ def render_source(
|
|
230 |
col4.button(
|
231 |
"✖️",
|
232 |
key=f"{prefix}-{number}-remove-transform",
|
|
|
233 |
on_click=_handle_remove_transform,
|
234 |
args=(field, number),
|
235 |
)
|
@@ -243,16 +275,15 @@ def render_source(
|
|
243 |
col1.button(
|
244 |
"Add transform on data",
|
245 |
key=f"{prefix}-close-fields",
|
|
|
246 |
on_click=_handle_add_transform,
|
247 |
args=(field,),
|
248 |
)
|
249 |
|
250 |
|
251 |
def render_references(
|
252 |
-
record_set_key: int,
|
253 |
record_set: RecordSet,
|
254 |
field: Field,
|
255 |
-
field_key: int,
|
256 |
possible_sources: list[str],
|
257 |
):
|
258 |
"""Renders the form for references."""
|
@@ -286,6 +317,7 @@ def render_references(
|
|
286 |
index=_get_extract_index(references),
|
287 |
key=key,
|
288 |
options=EXTRACT_TYPES,
|
|
|
289 |
on_change=handle_field_change,
|
290 |
args=(FieldEvent.REFERENCE_EXTRACT, field, key),
|
291 |
)
|
@@ -295,6 +327,7 @@ def render_references(
|
|
295 |
needed_field("Column name"),
|
296 |
value=references.extract.column,
|
297 |
key=key,
|
|
|
298 |
on_change=handle_field_change,
|
299 |
args=(FieldEvent.REFERENCE_EXTRACT_COLUMN, field, key),
|
300 |
)
|
@@ -304,12 +337,14 @@ def render_references(
|
|
304 |
needed_field("JSON path"),
|
305 |
value=references.extract.json_path,
|
306 |
key=key,
|
|
|
307 |
on_change=handle_field_change,
|
308 |
args=(FieldEvent.REFERENCE_EXTRACT_JSON_PATH, field, key),
|
309 |
)
|
310 |
col4.button(
|
311 |
"✖️",
|
312 |
key=f"{key}-remove-reference",
|
|
|
313 |
on_click=_handle_remove_reference,
|
314 |
args=(field,),
|
315 |
)
|
|
|
12 |
import mlcroissant as mlc
|
13 |
from utils import needed_field
|
14 |
|
15 |
+
_JSON_PATH_DOCUMENTATION = (
|
16 |
+
"The JSON path if the data source is a JSON (see"
|
17 |
+
" [documentation](https://www.ietf.org/archive/id/draft-goessner-dispatch-jsonpath-00.html))."
|
18 |
+
)
|
19 |
+
_EXTRACT_DOCUMENTATION = (
|
20 |
+
"The extraction method to get the value of the field (column in a CSV, etc)."
|
21 |
+
)
|
22 |
+
_COLUMN_NAME_DOCUMENTATION = "The name of the column if the data source is a CSV."
|
23 |
+
|
24 |
|
25 |
class SourceType:
|
26 |
"""The type of the source (distribution or field)."""
|
|
|
114 |
|
115 |
|
116 |
def render_source(
|
|
|
117 |
record_set: RecordSet,
|
118 |
field: Field,
|
|
|
119 |
possible_sources: list[str],
|
120 |
):
|
121 |
"""Renders the form for the source."""
|
|
|
130 |
index = None
|
131 |
key = f"{prefix}-source"
|
132 |
col1.selectbox(
|
133 |
+
needed_field("Data source"),
|
134 |
index=index,
|
135 |
options=options,
|
136 |
key=key,
|
137 |
+
help=(
|
138 |
+
"Data sources can be other resources (FileObject, FileSet) or other fields."
|
139 |
+
),
|
140 |
on_change=handle_field_change,
|
141 |
args=(FieldEvent.SOURCE, field, key),
|
142 |
)
|
|
|
145 |
needed_field("Extract"),
|
146 |
index=_get_extract_index(source),
|
147 |
key=f"{prefix}-extract",
|
148 |
+
help=_EXTRACT_DOCUMENTATION,
|
149 |
options=EXTRACT_TYPES,
|
150 |
on_change=handle_field_change,
|
151 |
args=(FieldEvent.SOURCE_EXTRACT, field, key),
|
|
|
156 |
needed_field("Column name"),
|
157 |
value=source.extract.column,
|
158 |
key=key,
|
159 |
+
help=_COLUMN_NAME_DOCUMENTATION,
|
160 |
on_change=handle_field_change,
|
161 |
args=(FieldEvent.SOURCE_EXTRACT_COLUMN, field, key),
|
162 |
)
|
|
|
166 |
needed_field("JSON path"),
|
167 |
value=source.extract.json_path,
|
168 |
key=key,
|
169 |
+
help=_JSON_PATH_DOCUMENTATION,
|
170 |
on_change=handle_field_change,
|
171 |
args=(FieldEvent.SOURCE_EXTRACT_JSON_PATH, field, key),
|
172 |
)
|
|
|
183 |
key=key,
|
184 |
options=TRANSFORM_TYPES,
|
185 |
on_change=handle_field_change,
|
186 |
+
help="One or more transformations to apply after extracting the field.",
|
187 |
args=(FieldEvent.TRANSFORM, field, key),
|
188 |
kwargs={"number": number},
|
189 |
)
|
190 |
if selected == TransformType.FORMAT:
|
191 |
key = f"{prefix}-{number}-transform-format"
|
192 |
col3.text_input(
|
193 |
+
needed_field("Format a date"),
|
194 |
value=transform.format,
|
195 |
key=key,
|
196 |
on_change=handle_field_change,
|
197 |
+
help=(
|
198 |
+
"For dates, use [`Python format"
|
199 |
+
" codes`](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)."
|
200 |
+
),
|
201 |
args=(selected, field, key),
|
202 |
+
kwargs={"number": number},
|
203 |
)
|
204 |
elif selected == TransformType.JSON_PATH:
|
205 |
key = f"{prefix}-{number}-jsonpath"
|
|
|
208 |
value=transform.json_path,
|
209 |
key=key,
|
210 |
on_change=handle_field_change,
|
211 |
+
help=_JSON_PATH_DOCUMENTATION,
|
212 |
args=(selected, field, key),
|
213 |
+
kwargs={"number": number},
|
214 |
)
|
215 |
elif selected == TransformType.REGEX:
|
216 |
key = f"{prefix}-{number}-regex"
|
|
|
219 |
value=transform.regex,
|
220 |
key=key,
|
221 |
on_change=handle_field_change,
|
222 |
+
help=(
|
223 |
+
"A regular expression following [`re` Python"
|
224 |
+
" convention](https://docs.python.org/3/library/re.html#regular-expression-syntax)"
|
225 |
+
" with one capturing group. The result of the operation will be"
|
226 |
+
" the last captured group."
|
227 |
+
),
|
228 |
args=(selected, field, key),
|
229 |
+
kwargs={"number": number},
|
230 |
)
|
231 |
elif selected == TransformType.REPLACE:
|
232 |
key = f"{prefix}-{number}-replace"
|
|
|
235 |
value=transform.replace,
|
236 |
key=key,
|
237 |
on_change=handle_field_change,
|
238 |
+
help=(
|
239 |
+
"A replace pattern separated by a `/`, i.e."
|
240 |
+
" `string_to_replace/string_to_substitute` in order to replace"
|
241 |
+
" `string_to_replace` by `string_to_substitute`."
|
242 |
+
),
|
243 |
args=(selected, field, key),
|
244 |
+
kwargs={"number": number},
|
245 |
)
|
246 |
elif selected == TransformType.SEPARATOR:
|
247 |
key = f"{prefix}-{number}-separator"
|
|
|
250 |
value=transform.separator,
|
251 |
key=key,
|
252 |
on_change=handle_field_change,
|
253 |
+
help="A separator to split strings on, e.g. `|` to split `a|b|c`.",
|
254 |
args=(selected, field, key),
|
255 |
+
kwargs={"number": number},
|
256 |
)
|
257 |
|
258 |
def _handle_remove_transform(field, number):
|
|
|
261 |
col4.button(
|
262 |
"✖️",
|
263 |
key=f"{prefix}-{number}-remove-transform",
|
264 |
+
help="Remove the transformation.",
|
265 |
on_click=_handle_remove_transform,
|
266 |
args=(field, number),
|
267 |
)
|
|
|
275 |
col1.button(
|
276 |
"Add transform on data",
|
277 |
key=f"{prefix}-close-fields",
|
278 |
+
help="Add a transformation.",
|
279 |
on_click=_handle_add_transform,
|
280 |
args=(field,),
|
281 |
)
|
282 |
|
283 |
|
284 |
def render_references(
|
|
|
285 |
record_set: RecordSet,
|
286 |
field: Field,
|
|
|
287 |
possible_sources: list[str],
|
288 |
):
|
289 |
"""Renders the form for references."""
|
|
|
317 |
index=_get_extract_index(references),
|
318 |
key=key,
|
319 |
options=EXTRACT_TYPES,
|
320 |
+
help=_EXTRACT_DOCUMENTATION,
|
321 |
on_change=handle_field_change,
|
322 |
args=(FieldEvent.REFERENCE_EXTRACT, field, key),
|
323 |
)
|
|
|
327 |
needed_field("Column name"),
|
328 |
value=references.extract.column,
|
329 |
key=key,
|
330 |
+
help=_COLUMN_NAME_DOCUMENTATION,
|
331 |
on_change=handle_field_change,
|
332 |
args=(FieldEvent.REFERENCE_EXTRACT_COLUMN, field, key),
|
333 |
)
|
|
|
337 |
needed_field("JSON path"),
|
338 |
value=references.extract.json_path,
|
339 |
key=key,
|
340 |
+
help=_JSON_PATH_DOCUMENTATION,
|
341 |
on_change=handle_field_change,
|
342 |
args=(FieldEvent.REFERENCE_EXTRACT_JSON_PATH, field, key),
|
343 |
)
|
344 |
col4.button(
|
345 |
"✖️",
|
346 |
key=f"{key}-remove-reference",
|
347 |
+
help="Remove the join.",
|
348 |
on_click=_handle_remove_reference,
|
349 |
args=(field,),
|
350 |
)
|
views/splash.py
CHANGED
@@ -13,6 +13,8 @@ import mlcroissant as mlc
|
|
13 |
from views.load import render_load
|
14 |
from views.previous_files import render_previous_files
|
15 |
|
|
|
|
|
16 |
_DATASETS = {
|
17 |
"Titanic": ["data/embarkation_ports.csv", "data/genders.csv"],
|
18 |
"FLORES-200": [],
|
@@ -34,8 +36,6 @@ def render_splash():
|
|
34 |
)
|
35 |
col1, col2 = st.columns([1, 1], gap="large")
|
36 |
with col1:
|
37 |
-
with st.expander("**Load an existing Croissant JSON-LD file**", expanded=True):
|
38 |
-
render_load()
|
39 |
with st.expander("**Create from scratch**", expanded=True):
|
40 |
|
41 |
def create_new_croissant():
|
@@ -81,6 +81,36 @@ def render_splash():
|
|
81 |
type="primary",
|
82 |
args=(dataset,),
|
83 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
with col2:
|
85 |
with st.expander("**Past projects**", expanded=True):
|
86 |
render_previous_files()
|
|
|
13 |
from views.load import render_load
|
14 |
from views.previous_files import render_previous_files
|
15 |
|
16 |
+
_HUGGING_FACE_URL = "https://huggingface.co/datasets/"
|
17 |
+
|
18 |
_DATASETS = {
|
19 |
"Titanic": ["data/embarkation_ports.csv", "data/genders.csv"],
|
20 |
"FLORES-200": [],
|
|
|
36 |
)
|
37 |
col1, col2 = st.columns([1, 1], gap="large")
|
38 |
with col1:
|
|
|
|
|
39 |
with st.expander("**Create from scratch**", expanded=True):
|
40 |
|
41 |
def create_new_croissant():
|
|
|
81 |
type="primary",
|
82 |
args=(dataset,),
|
83 |
)
|
84 |
+
with st.expander("**Load a dataset from Hugging Face**", expanded=True):
|
85 |
+
url = st.text_input(
|
86 |
+
label="Hugging Face URL",
|
87 |
+
)
|
88 |
+
if url.startswith(_HUGGING_FACE_URL):
|
89 |
+
url = url.replace(_HUGGING_FACE_URL, "")
|
90 |
+
|
91 |
+
def download_huggingface_json(name: str):
    """Downloads the Croissant JSON of a Hugging Face dataset and loads it.

    On success the parsed metadata is stored in `st.session_state[Metadata]`
    and the current project is saved. On failure an error is displayed with
    `st.error` instead of raising out of the button callback.

    Args:
        name: The dataset name, i.e. the part of the Hugging Face URL that
            follows `https://huggingface.co/datasets/`.
    """
    api_url = f"https://datasets-server.huggingface.co/croissant?dataset={name}"
    try:
        # A timeout keeps an unresponsive server from blocking the app forever.
        response = requests.get(api_url, headers=None, timeout=30)
        payload = response.json()
    except (requests.RequestException, ValueError) as error:
        # Connection problems and timeouts are RequestException subclasses;
        # a body that is not JSON raises ValueError (or a subclass of it).
        st.error(f"Could not download the Croissant JSON from {api_url}: {error}")
        return
    try:
        metadata = mlc.Metadata.from_json(mlc.Issues(), payload, None)
        st.session_state[Metadata] = Metadata.from_canonical(metadata)
        save_current_project()
    except Exception:
        # Deliberately broad, as in the original: any failure while building
        # the metadata is reported to the user as malformed input.
        st.error(f"Malformed JSON: {payload}")
|
100 |
+
|
101 |
+
st.button(
|
102 |
+
f'Download "{url}"',
|
103 |
+
on_click=download_huggingface_json,
|
104 |
+
type="primary",
|
105 |
+
args=(url,),
|
106 |
+
)
|
107 |
+
elif url:
|
108 |
+
st.error(
|
109 |
+
f"Unknown URL {url}. Hugging Face URLS should look like"
|
110 |
+
f" {_HUGGING_FACE_URL}somedataset."
|
111 |
+
)
|
112 |
+
with st.expander("**Load an existing Croissant JSON-LD file**", expanded=True):
|
113 |
+
render_load()
|
114 |
with col2:
|
115 |
with st.expander("**Past projects**", expanded=True):
|
116 |
render_previous_files()
|