Spaces:

MLCommons
/

croissant-editor

Runtime error

App Files Community

marcenacp committed on Dec 1, 2023

Commit

fe3ba5f

1 Parent(s): e92e659

Deploy (see actual commits on https://github.com/mlcommons/croissant).

Browse files

Files changed (8) hide show

Dockerfile +1 -0
core/files.py +40 -12
core/files_test.py +15 -6
core/past_projects.py +6 -1
requirements.txt +1 -0
views/files.py +10 -11
views/record_sets.py +4 -1
views/splash.py +20 -10

Dockerfile CHANGED Viewed

@@ -10,6 +10,7 @@ RUN apt-get update && apt-get install -y \
     software-properties-common \
     git \
     python3-pip \
     && rm -rf /var/lib/apt/lists/*
 COPY ./ /app/

     software-properties-common \
     git \
     python3-pip \
+    libmagic1 \
     && rm -rf /var/lib/apt/lists/*
 COPY ./ /app/

core/files.py CHANGED Viewed

@@ -4,6 +4,7 @@ import io
 import tempfile
 from etils import epath
 import pandas as pd
 import requests
@@ -83,6 +84,10 @@ FILE_TYPES: dict[str, FileType] = {
     ]
 }
 def name_to_code(file_type_name: str) -> str | None:
     """Maps names to the encoding format: Text => plain/text."""
@@ -127,29 +132,34 @@ def download_file(url: str, file_path: epath.Path):
 def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
     """Gets the df associated to the file."""
     if file_type == FileTypes.CSV:
-        return pd.read_csv(file)
     elif file_type == FileTypes.EXCEL:
-        return pd.read_excel(file)
     elif file_type == FileTypes.JSON:
-        return pd.read_json(file)
     elif file_type == FileTypes.JSONL:
-        return pd.read_json(file, lines=True)
     elif file_type == FileTypes.PARQUET:
-        return pd.read_parquet(file)
     else:
         raise NotImplementedError()
-def file_from_url(
-    file_type: FileType, url: str, names: set[str], folder: epath.Path
-) -> FileObject:
     """Downloads locally and extracts the file information."""
     file_path = hash_file_path(url)
     if not file_path.exists():
         download_file(url, file_path)
     with file_path.open("rb") as file:
         sha256 = _sha256(file.read())
-    df = get_dataframe(file_type, file_path).infer_objects()
     return FileObject(
         name=find_unique_name(names, url.split("/")[-1]),
         description="",
@@ -162,15 +172,17 @@ def file_from_url(
 def file_from_upload(
-    file_type: FileType, file: io.BytesIO, names: set[str], folder: epath.Path
 ) -> FileObject:
     """Uploads locally and extracts the file information."""
     value = file.getvalue()
     content_url = f"data/{file.name}"
     sha256 = _sha256(value)
-    with get_resource_path(content_url).open("wb") as f:
         f.write(value)
-    df = get_dataframe(file_type, file).infer_objects()
     return FileObject(
         name=find_unique_name(names, file.name),
         description="",
@@ -192,3 +204,19 @@ def file_from_form(
         return FileSet(name=find_unique_name(names, "file_set"), folder=folder)
     else:
         raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")

 import tempfile
 from etils import epath
+import magic
 import pandas as pd
 import requests
     ]
 }
+ENCODING_FORMATS: dict[str, FileType] = {
+    file_type.encoding_format: file_type for file_type in FILE_TYPES.values()
+}
 def name_to_code(file_type_name: str) -> str | None:
     """Maps names to the encoding format: Text => plain/text."""
 def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
     """Gets the df associated to the file."""
     if file_type == FileTypes.CSV:
+        df = pd.read_csv(file)
     elif file_type == FileTypes.EXCEL:
+        df = pd.read_excel(file)
     elif file_type == FileTypes.JSON:
+        df = pd.read_json(file)
     elif file_type == FileTypes.JSONL:
+        df = pd.read_json(file, lines=True)
     elif file_type == FileTypes.PARQUET:
+        df = pd.read_parquet(file)
     else:
         raise NotImplementedError()
+    return df.infer_objects()
+def guess_file_type(path: epath.Path) -> FileType | None:
+    mime = magic.from_file(path, mime=True)
+    return ENCODING_FORMATS.get(mime)
+def file_from_url(url: str, names: set[str], folder: epath.Path) -> FileObject:
     """Downloads locally and extracts the file information."""
     file_path = hash_file_path(url)
     if not file_path.exists():
         download_file(url, file_path)
     with file_path.open("rb") as file:
         sha256 = _sha256(file.read())
+    file_type = guess_file_type(file_path)
+    df = get_dataframe(file_type, file_path)
     return FileObject(
         name=find_unique_name(names, url.split("/")[-1]),
         description="",
 def file_from_upload(
+    file: io.BytesIO, names: set[str], folder: epath.Path
 ) -> FileObject:
     """Uploads locally and extracts the file information."""
     value = file.getvalue()
     content_url = f"data/{file.name}"
     sha256 = _sha256(value)
+    file_path = get_resource_path(content_url)
+    with file_path.open("wb") as f:
         f.write(value)
+    file_type = guess_file_type(file_path)
+    df = get_dataframe(file_type, file)
     return FileObject(
         name=find_unique_name(names, file.name),
         description="",
         return FileSet(name=find_unique_name(names, "file_set"), folder=folder)
     else:
         raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
+def is_url(file: FileObject) -> bool:
+    return file.content_url and file.content_url.startswith("http")
+def trigger_download(file: FileObject):
+    if is_url(file):
+        file_path = hash_file_path(file.content_url)
+        if not file_path.exists():
+            download_file(file.content_url, file_path)
+    else:
+        file_path = get_resource_path(file.content_url)
+    file_type = guess_file_type(file_path)
+    df = get_dataframe(file_type, file_path)
+    file.df = df

core/files_test.py CHANGED Viewed

@@ -1,12 +1,17 @@
 from etils import epath
 import pandas as pd
 import pytest
-from .files import file_from_url
-from .files import FileTypes
-def test_check_file_csv():
     csv = epath.Path(
         # This is the hash path for "https://my.url".
         "/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
@@ -18,10 +23,14 @@ def test_check_file_csv():
         f.write("a,1\n")
         f.write("b,2\n")
         f.write("c,3\n")
-    file = file_from_url(FileTypes.CSV, "https://my.url", set(), epath.Path())
     pd.testing.assert_frame_equal(
         file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
     )
-    # Fails with unknown encoding_format:
     with pytest.raises(NotImplementedError):
-        file_from_url("unknown", "https://my.url", set(), epath.Path())

+from unittest import mock
 from etils import epath
 import pandas as pd
 import pytest
+from core import files as files_module
+FileTypes = files_module.FileTypes
+@mock.patch.object(files_module, "guess_file_type", return_value=FileTypes.CSV)
+def test_check_file_csv(guess_file_type):
+    del guess_file_type
     csv = epath.Path(
         # This is the hash path for "https://my.url".
         "/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
         f.write("a,1\n")
         f.write("b,2\n")
         f.write("c,3\n")
+    file = files_module.file_from_url("https://my.url", set(), epath.Path())
     pd.testing.assert_frame_equal(
         file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
     )
+@mock.patch.object(files_module, "guess_file_type", return_value="unknown")
+def test_check_file_unknown(guess_file_type):
+    del guess_file_type
     with pytest.raises(NotImplementedError):
+        files_module.file_from_url("https://my.url", set(), epath.Path())

core/past_projects.py CHANGED Viewed

@@ -7,6 +7,7 @@ import streamlit as st
 from core.constants import PAST_PROJECTS_PATH
 from core.query_params import set_project
 from core.state import CurrentProject
 from core.state import get_cached_user
 from core.state import Metadata
@@ -23,13 +24,17 @@ def _pickle_file(path: epath.Path) -> epath.Path:
 def save_current_project():
-    metadata = st.session_state[Metadata]
     project = st.session_state.get(CurrentProject)
     if not project:
         project = CurrentProject.create_new()
         st.session_state[CurrentProject] = project
     project.path.mkdir(parents=True, exist_ok=True)
     set_project(project)
     try:
         pickled = pickle.dumps(metadata)
         _pickle_file(project.path).write_bytes(pickled)

 from core.constants import PAST_PROJECTS_PATH
 from core.query_params import set_project
 from core.state import CurrentProject
+from core.state import FileObject
 from core.state import get_cached_user
 from core.state import Metadata
 def save_current_project():
+    metadata: Metadata = st.session_state[Metadata]
     project = st.session_state.get(CurrentProject)
     if not project:
         project = CurrentProject.create_new()
         st.session_state[CurrentProject] = project
     project.path.mkdir(parents=True, exist_ok=True)
     set_project(project)
+    # FileObjects should have a folder.
+    for resource in metadata.distribution:
+        if isinstance(resource, FileObject):
+            resource.folder = project.path
     try:
         pickled = pickle.dumps(metadata)
         _pickle_file(project.path).write_bytes(pickled)

requirements.txt CHANGED Viewed

@@ -3,6 +3,7 @@ mlcroissant
 numpy
 pandas
 pytest
 rdflib
 requests
 streamlit

 numpy
 pandas
 pytest
+python-magic
 rdflib
 requests
 streamlit

views/files.py CHANGED Viewed

@@ -11,7 +11,9 @@ from core.files import file_from_url
 from core.files import FILE_OBJECT
 from core.files import FILE_SET
 from core.files import FILE_TYPES
 from core.files import RESOURCE_TYPES
 from core.path import get_resource_path
 from core.record_sets import infer_record_sets
 from core.state import CurrentProject
@@ -55,19 +57,21 @@ def _render_warnings():
     metadata: Metadata = st.session_state[Metadata]
     warning = ""
     for resource in metadata.distribution:
         content_url = resource.content_url
         if content_url and not content_url.startswith("http"):
             path = get_resource_path(content_url)
             if not path.exists():
                 if OAUTH_CLIENT_ID:
                     warning += (
-                        f'⚠️ Resource "{resource.name}" points to a local file, but'
                         " doesn't exist on the disk. Fix this by changing the content"
                         " URL.\n\n"
                     )
                 else:
                     warning += (
-                        f'⚠️ Resource "{resource.name}" points to a local file, but'
                         " doesn't exist on the disk. Fix this by either downloading"
                         f" it to {path} or changing the content URL.\n\n"
                     )
@@ -107,7 +111,6 @@ def _render_resources_panel(files: list[Resource]) -> Resource | None:
 def _render_upload_panel():
     """Renders the form to upload from local or upload from URL."""
     with st.form(key="upload_form", clear_on_submit=True):
-        file_type_name = st.selectbox("Encoding format", options=FILE_TYPES.keys())
         tab1, tab2, tab3 = st.tabs([
             "Import from a local file", "Import from a URL", "Add manually"
         ])
@@ -124,15 +127,14 @@ def _render_upload_panel():
         def handle_on_click():
             url = st.session_state[_DISTANT_URL_KEY]
             uploaded_file = st.session_state[_LOCAL_FILE_KEY]
-            file_type = FILE_TYPES[file_type_name]
             metadata: Metadata = st.session_state[Metadata]
             names = metadata.names()
             project: CurrentProject = st.session_state[CurrentProject]
             folder = project.path
             if url:
-                file = file_from_url(file_type, url, names, folder)
             elif uploaded_file:
-                file = file_from_upload(file_type, uploaded_file, names, folder)
             else:
                 resource_type = st.session_state[_MANUAL_RESOURCE_TYPE_KEY]
                 file = file_from_form(resource_type, names, folder)
@@ -191,7 +193,7 @@ def _render_resource_details(selected_file: Resource):
             )
-def _render_resource(prefix: int, file: FileObject | FileSet, is_file_object: bool):
     parent_options = [f.name for f in st.session_state[Metadata].distribution]
     key = f"{prefix}_parents"
     st.multiselect(
@@ -264,10 +266,7 @@ def _render_resource(prefix: int, file: FileObject | FileSet, is_file_object: bo
     )
     if is_file_object:
         st.markdown("First rows of data:")
-        is_url = file.content_url and file.content_url.startswith("http")
         if file.df is not None:
             st.dataframe(file.df, height=DF_HEIGHT)
-        elif is_url:
-            st.button("Trigger download")
         else:
-            st.markdown("No rendering possible.")

 from core.files import FILE_OBJECT
 from core.files import FILE_SET
 from core.files import FILE_TYPES
+from core.files import is_url
 from core.files import RESOURCE_TYPES
+from core.files import trigger_download
 from core.path import get_resource_path
 from core.record_sets import infer_record_sets
 from core.state import CurrentProject
     metadata: Metadata = st.session_state[Metadata]
     warning = ""
     for resource in metadata.distribution:
+        if not isinstance(resource, FileObject):
+            continue
         content_url = resource.content_url
         if content_url and not content_url.startswith("http"):
             path = get_resource_path(content_url)
             if not path.exists():
                 if OAUTH_CLIENT_ID:
                     warning += (
+                        f'⚠️ Resource "{resource.name}" points to a local file that'
                         " doesn't exist on the disk. Fix this by changing the content"
                         " URL.\n\n"
                     )
                 else:
                     warning += (
+                        f'⚠️ Resource "{resource.name}" points to a local file that'
                         " doesn't exist on the disk. Fix this by either downloading"
                         f" it to {path} or changing the content URL.\n\n"
                     )
 def _render_upload_panel():
     """Renders the form to upload from local or upload from URL."""
     with st.form(key="upload_form", clear_on_submit=True):
         tab1, tab2, tab3 = st.tabs([
             "Import from a local file", "Import from a URL", "Add manually"
         ])
         def handle_on_click():
             url = st.session_state[_DISTANT_URL_KEY]
             uploaded_file = st.session_state[_LOCAL_FILE_KEY]
             metadata: Metadata = st.session_state[Metadata]
             names = metadata.names()
             project: CurrentProject = st.session_state[CurrentProject]
             folder = project.path
             if url:
+                file = file_from_url(url, names, folder)
             elif uploaded_file:
+                file = file_from_upload(uploaded_file, names, folder)
             else:
                 resource_type = st.session_state[_MANUAL_RESOURCE_TYPE_KEY]
                 file = file_from_form(resource_type, names, folder)
             )
+def _render_resource(prefix: int, file: Resource, is_file_object: bool):
     parent_options = [f.name for f in st.session_state[Metadata].distribution]
     key = f"{prefix}_parents"
     st.multiselect(
     )
     if is_file_object:
         st.markdown("First rows of data:")
         if file.df is not None:
             st.dataframe(file.df, height=DF_HEIGHT)
         else:
+            st.button("Trigger download", on_click=trigger_download, args=(file,))

views/record_sets.py CHANGED Viewed

@@ -30,6 +30,8 @@ from views.source import render_source
 _NUM_RECORDS = 3
 _TIMEOUT_SECONDS = 1
 class _Result(TypedDict):
@@ -214,6 +216,7 @@ class FieldDataFrame:
 def render_record_sets():
     col1, col2 = st.columns([1, 1])
     with col1:
         with st.spinner("Generating the dataset..."):
@@ -361,7 +364,7 @@ def _render_left_panel():
                     left.button(
                         "⚠️",
                         key=f"idea-{prefix}",
-                        disabled=True,
                         help=textwrap.dedent(f"""**Error**:
 ```
 {exception}

 _NUM_RECORDS = 3
 _TIMEOUT_SECONDS = 1
+_INFO = """RecordSets describe sets of structured records obtained from resources or
+other RecordSets. You can think of RecordSets as tables with typed fields."""
 class _Result(TypedDict):
 def render_record_sets():
+    st.info(_INFO, icon="💡")
     col1, col2 = st.columns([1, 1])
     with col1:
         with st.spinner("Generating the dataset..."):
                     left.button(
                         "⚠️",
                         key=f"idea-{prefix}",
+                        on_click=lambda: _generate_data_with_timeout.clear(),
                         help=textwrap.dedent(f"""**Error**:
 ```
 {exception}

views/splash.py CHANGED Viewed

@@ -5,6 +5,7 @@ import streamlit as st
 from core.constants import OAUTH_CLIENT_ID
 from core.past_projects import save_current_project
 from core.query_params import set_project
 from core.state import CurrentProject
 from core.state import Metadata
@@ -12,6 +13,16 @@ import mlcroissant as mlc
 from views.load import render_load
 from views.previous_files import render_previous_files
 def render_splash():
     if OAUTH_CLIENT_ID:
@@ -39,12 +50,19 @@ def render_splash():
         with st.expander("**Try out an example!**", expanded=True):
             def create_example(dataset: str):
-                url = f"https://raw.githubusercontent.com/mlcommons/croissant/main/datasets/{dataset.lower()}/metadata.json"
                 try:
                     json = requests.get(url).json()
                     metadata = mlc.Metadata.from_json(mlc.Issues(), json, None)
                     st.session_state[Metadata] = Metadata.from_canonical(metadata)
                     save_current_project()
                 except Exception as exception:
                     logging.error(exception)
                     st.error(
@@ -55,15 +73,7 @@ def render_splash():
             dataset = st.selectbox(
                 label="Dataset",
-                options=[
-                    "Titanic",
-                    "FLORES-200",
-                    "GPT-3",
-                    "COCO2014",
-                    "PASS",
-                    "MovieLens",
-                    "Bigcode-The-Stack",
-                ],
             )
             st.button(
                 f"{dataset} dataset",

 from core.constants import OAUTH_CLIENT_ID
 from core.past_projects import save_current_project
+from core.path import get_resource_path
 from core.query_params import set_project
 from core.state import CurrentProject
 from core.state import Metadata
 from views.load import render_load
 from views.previous_files import render_previous_files
+_DATASETS = {
+    "Titanic": ["data/embarkation_ports.csv", "data/genders.csv"],
+    "FLORES-200": [],
+    "GPT-3": [],
+    "COCO2014": [],
+    "PASS": [],
+    "MovieLens": [],
+    "Bigcode-The-Stack": [],
+}
 def render_splash():
     if OAUTH_CLIENT_ID:
         with st.expander("**Try out an example!**", expanded=True):
             def create_example(dataset: str):
+                base = f"https://raw.githubusercontent.com/mlcommons/croissant/main/datasets/{dataset.lower()}"
+                url = f"{base}/metadata.json"
                 try:
                     json = requests.get(url).json()
                     metadata = mlc.Metadata.from_json(mlc.Issues(), json, None)
                     st.session_state[Metadata] = Metadata.from_canonical(metadata)
                     save_current_project()
+                    # Write supplementary files.
+                    files = _DATASETS.get(dataset, [])
+                    for file in files:
+                        path = get_resource_path(file)
+                        json = requests.get(f"{base}/{file}")
+                        path.write_bytes(json.content)
                 except Exception as exception:
                     logging.error(exception)
                     st.error(
             dataset = st.selectbox(
                 label="Dataset",
+                options=_DATASETS.keys(),
             )
             st.button(
                 f"{dataset} dataset",