marcenacp committed on
Commit
fe3ba5f
1 Parent(s): e92e659

Deploy (see actual commits on https://github.com/mlcommons/croissant).

Browse files
Dockerfile CHANGED
@@ -10,6 +10,7 @@ RUN apt-get update && apt-get install -y \
10
  software-properties-common \
11
  git \
12
  python3-pip \
 
13
  && rm -rf /var/lib/apt/lists/*
14
 
15
  COPY ./ /app/
 
10
  software-properties-common \
11
  git \
12
  python3-pip \
13
+ libmagic1 \
14
  && rm -rf /var/lib/apt/lists/*
15
 
16
  COPY ./ /app/
core/files.py CHANGED
@@ -4,6 +4,7 @@ import io
4
  import tempfile
5
 
6
  from etils import epath
 
7
  import pandas as pd
8
  import requests
9
 
@@ -83,6 +84,10 @@ FILE_TYPES: dict[str, FileType] = {
83
  ]
84
  }
85
 
 
 
 
 
86
 
87
  def name_to_code(file_type_name: str) -> str | None:
88
  """Maps names to the encoding format: Text => plain/text."""
@@ -127,29 +132,34 @@ def download_file(url: str, file_path: epath.Path):
127
  def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
128
  """Gets the df associated to the file."""
129
  if file_type == FileTypes.CSV:
130
- return pd.read_csv(file)
131
  elif file_type == FileTypes.EXCEL:
132
- return pd.read_excel(file)
133
  elif file_type == FileTypes.JSON:
134
- return pd.read_json(file)
135
  elif file_type == FileTypes.JSONL:
136
- return pd.read_json(file, lines=True)
137
  elif file_type == FileTypes.PARQUET:
138
- return pd.read_parquet(file)
139
  else:
140
  raise NotImplementedError()
 
141
 
142
 
143
- def file_from_url(
144
- file_type: FileType, url: str, names: set[str], folder: epath.Path
145
- ) -> FileObject:
 
 
 
146
  """Downloads locally and extracts the file information."""
147
  file_path = hash_file_path(url)
148
  if not file_path.exists():
149
  download_file(url, file_path)
150
  with file_path.open("rb") as file:
151
  sha256 = _sha256(file.read())
152
- df = get_dataframe(file_type, file_path).infer_objects()
 
153
  return FileObject(
154
  name=find_unique_name(names, url.split("/")[-1]),
155
  description="",
@@ -162,15 +172,17 @@ def file_from_url(
162
 
163
 
164
  def file_from_upload(
165
- file_type: FileType, file: io.BytesIO, names: set[str], folder: epath.Path
166
  ) -> FileObject:
167
  """Uploads locally and extracts the file information."""
168
  value = file.getvalue()
169
  content_url = f"data/{file.name}"
170
  sha256 = _sha256(value)
171
- with get_resource_path(content_url).open("wb") as f:
 
172
  f.write(value)
173
- df = get_dataframe(file_type, file).infer_objects()
 
174
  return FileObject(
175
  name=find_unique_name(names, file.name),
176
  description="",
@@ -192,3 +204,19 @@ def file_from_form(
192
  return FileSet(name=find_unique_name(names, "file_set"), folder=folder)
193
  else:
194
  raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import tempfile
5
 
6
  from etils import epath
7
+ import magic
8
  import pandas as pd
9
  import requests
10
 
 
84
  ]
85
  }
86
 
87
+ ENCODING_FORMATS: dict[str, FileType] = {
88
+ file_type.encoding_format: file_type for file_type in FILE_TYPES.values()
89
+ }
90
+
91
 
92
  def name_to_code(file_type_name: str) -> str | None:
93
  """Maps names to the encoding format: Text => plain/text."""
 
132
  def get_dataframe(file_type: FileType, file: io.BytesIO | epath.Path) -> pd.DataFrame:
133
  """Gets the df associated to the file."""
134
  if file_type == FileTypes.CSV:
135
+ df = pd.read_csv(file)
136
  elif file_type == FileTypes.EXCEL:
137
+ df = pd.read_excel(file)
138
  elif file_type == FileTypes.JSON:
139
+ df = pd.read_json(file)
140
  elif file_type == FileTypes.JSONL:
141
+ df = pd.read_json(file, lines=True)
142
  elif file_type == FileTypes.PARQUET:
143
+ df = pd.read_parquet(file)
144
  else:
145
  raise NotImplementedError()
146
+ return df.infer_objects()
147
 
148
 
149
def guess_file_type(path: epath.Path) -> FileType | None:
    """Guesses the file type of `path` from the MIME type detected by libmagic.

    Returns:
        The FileType registered in ENCODING_FORMATS for the detected MIME type,
        or None when no known encoding format matches.
    """
    return ENCODING_FORMATS.get(magic.from_file(path, mime=True))
152
+
153
+
154
+ def file_from_url(url: str, names: set[str], folder: epath.Path) -> FileObject:
155
  """Downloads locally and extracts the file information."""
156
  file_path = hash_file_path(url)
157
  if not file_path.exists():
158
  download_file(url, file_path)
159
  with file_path.open("rb") as file:
160
  sha256 = _sha256(file.read())
161
+ file_type = guess_file_type(file_path)
162
+ df = get_dataframe(file_type, file_path)
163
  return FileObject(
164
  name=find_unique_name(names, url.split("/")[-1]),
165
  description="",
 
172
 
173
 
174
  def file_from_upload(
175
+ file: io.BytesIO, names: set[str], folder: epath.Path
176
  ) -> FileObject:
177
  """Uploads locally and extracts the file information."""
178
  value = file.getvalue()
179
  content_url = f"data/{file.name}"
180
  sha256 = _sha256(value)
181
+ file_path = get_resource_path(content_url)
182
+ with file_path.open("wb") as f:
183
  f.write(value)
184
+ file_type = guess_file_type(file_path)
185
+ df = get_dataframe(file_type, file)
186
  return FileObject(
187
  name=find_unique_name(names, file.name),
188
  description="",
 
204
  return FileSet(name=find_unique_name(names, "file_set"), folder=folder)
205
  else:
206
  raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")
207
+
208
+
209
def is_url(file: FileObject) -> bool:
    """Returns True when `file.content_url` is set and starts with "http"."""
    content_url = file.content_url
    # Coerce explicitly: a bare `x and y` would hand None or "" back to the
    # caller when the content URL is unset, despite the `bool` annotation.
    return bool(content_url) and content_url.startswith("http")
211
+
212
+
213
def trigger_download(file: FileObject):
    """Loads the data behind `file.content_url` and stores it in `file.df`.

    Remote resources (as decided by `is_url`) are downloaded to their hashed
    local path unless that path already exists. Any other content URL is
    resolved with `get_resource_path`. The file type is then guessed from the
    file on disk and the resulting dataframe is attached to `file`.

    Does nothing when the resource has no content URL.

    Raises:
        NotImplementedError: if the guessed file type is not one that
            `get_dataframe` can read.
    """
    if not file.content_url:
        # No content URL means there is no path to derive, hence nothing to load.
        return
    if is_url(file):
        file_path = hash_file_path(file.content_url)
        # Reuse a previous download of the same URL when it is still on disk.
        if not file_path.exists():
            download_file(file.content_url, file_path)
    else:
        file_path = get_resource_path(file.content_url)
    file_type = guess_file_type(file_path)
    file.df = get_dataframe(file_type, file_path)
core/files_test.py CHANGED
@@ -1,12 +1,17 @@
 
 
1
  from etils import epath
2
  import pandas as pd
3
  import pytest
4
 
5
- from .files import file_from_url
6
- from .files import FileTypes
 
7
 
8
 
9
- def test_check_file_csv():
 
 
10
  csv = epath.Path(
11
  # This is the hash path for "https://my.url".
12
  "/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
@@ -18,10 +23,14 @@ def test_check_file_csv():
18
  f.write("a,1\n")
19
  f.write("b,2\n")
20
  f.write("c,3\n")
21
- file = file_from_url(FileTypes.CSV, "https://my.url", set(), epath.Path())
22
  pd.testing.assert_frame_equal(
23
  file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
24
  )
25
- # Fails with unknown encoding_format:
 
 
 
 
26
  with pytest.raises(NotImplementedError):
27
- file_from_url("unknown", "https://my.url", set(), epath.Path())
 
1
+ from unittest import mock
2
+
3
  from etils import epath
4
  import pandas as pd
5
  import pytest
6
 
7
+ from core import files as files_module
8
+
9
+ FileTypes = files_module.FileTypes
10
 
11
 
12
+ @mock.patch.object(files_module, "guess_file_type", return_value=FileTypes.CSV)
13
+ def test_check_file_csv(guess_file_type):
14
+ del guess_file_type
15
  csv = epath.Path(
16
  # This is the hash path for "https://my.url".
17
  "/tmp/croissant-editor-f76b4732c82d83daf858fae2cc0e590d352a4bceb781351243a03daab11f76bc"
 
23
  f.write("a,1\n")
24
  f.write("b,2\n")
25
  f.write("c,3\n")
26
+ file = files_module.file_from_url("https://my.url", set(), epath.Path())
27
  pd.testing.assert_frame_equal(
28
  file.df, pd.DataFrame({"column1": ["a", "b", "c"], "column2": [1, 2, 3]})
29
  )
30
+
31
+
32
+ @mock.patch.object(files_module, "guess_file_type", return_value="unknown")
33
+ def test_check_file_unknown(guess_file_type):
34
+ del guess_file_type
35
  with pytest.raises(NotImplementedError):
36
+ files_module.file_from_url("https://my.url", set(), epath.Path())
core/past_projects.py CHANGED
@@ -7,6 +7,7 @@ import streamlit as st
7
  from core.constants import PAST_PROJECTS_PATH
8
  from core.query_params import set_project
9
  from core.state import CurrentProject
 
10
  from core.state import get_cached_user
11
  from core.state import Metadata
12
 
@@ -23,13 +24,17 @@ def _pickle_file(path: epath.Path) -> epath.Path:
23
 
24
 
25
  def save_current_project():
26
- metadata = st.session_state[Metadata]
27
  project = st.session_state.get(CurrentProject)
28
  if not project:
29
  project = CurrentProject.create_new()
30
  st.session_state[CurrentProject] = project
31
  project.path.mkdir(parents=True, exist_ok=True)
32
  set_project(project)
 
 
 
 
33
  try:
34
  pickled = pickle.dumps(metadata)
35
  _pickle_file(project.path).write_bytes(pickled)
 
7
  from core.constants import PAST_PROJECTS_PATH
8
  from core.query_params import set_project
9
  from core.state import CurrentProject
10
+ from core.state import FileObject
11
  from core.state import get_cached_user
12
  from core.state import Metadata
13
 
 
24
 
25
 
26
  def save_current_project():
27
+ metadata: Metadata = st.session_state[Metadata]
28
  project = st.session_state.get(CurrentProject)
29
  if not project:
30
  project = CurrentProject.create_new()
31
  st.session_state[CurrentProject] = project
32
  project.path.mkdir(parents=True, exist_ok=True)
33
  set_project(project)
34
+ # FileObjects should have a folder.
35
+ for resource in metadata.distribution:
36
+ if isinstance(resource, FileObject):
37
+ resource.folder = project.path
38
  try:
39
  pickled = pickle.dumps(metadata)
40
  _pickle_file(project.path).write_bytes(pickled)
requirements.txt CHANGED
@@ -3,6 +3,7 @@ mlcroissant
3
  numpy
4
  pandas
5
  pytest
 
6
  rdflib
7
  requests
8
  streamlit
 
3
  numpy
4
  pandas
5
  pytest
6
+ python-magic
7
  rdflib
8
  requests
9
  streamlit
views/files.py CHANGED
@@ -11,7 +11,9 @@ from core.files import file_from_url
11
  from core.files import FILE_OBJECT
12
  from core.files import FILE_SET
13
  from core.files import FILE_TYPES
 
14
  from core.files import RESOURCE_TYPES
 
15
  from core.path import get_resource_path
16
  from core.record_sets import infer_record_sets
17
  from core.state import CurrentProject
@@ -55,19 +57,21 @@ def _render_warnings():
55
  metadata: Metadata = st.session_state[Metadata]
56
  warning = ""
57
  for resource in metadata.distribution:
 
 
58
  content_url = resource.content_url
59
  if content_url and not content_url.startswith("http"):
60
  path = get_resource_path(content_url)
61
  if not path.exists():
62
  if OAUTH_CLIENT_ID:
63
  warning += (
64
- f'⚠️ Resource "{resource.name}" points to a local file, but'
65
  " doesn't exist on the disk. Fix this by changing the content"
66
  " URL.\n\n"
67
  )
68
  else:
69
  warning += (
70
- f'⚠️ Resource "{resource.name}" points to a local file, but'
71
  " doesn't exist on the disk. Fix this by either downloading"
72
  f" it to {path} or changing the content URL.\n\n"
73
  )
@@ -107,7 +111,6 @@ def _render_resources_panel(files: list[Resource]) -> Resource | None:
107
  def _render_upload_panel():
108
  """Renders the form to upload from local or upload from URL."""
109
  with st.form(key="upload_form", clear_on_submit=True):
110
- file_type_name = st.selectbox("Encoding format", options=FILE_TYPES.keys())
111
  tab1, tab2, tab3 = st.tabs([
112
  "Import from a local file", "Import from a URL", "Add manually"
113
  ])
@@ -124,15 +127,14 @@ def _render_upload_panel():
124
  def handle_on_click():
125
  url = st.session_state[_DISTANT_URL_KEY]
126
  uploaded_file = st.session_state[_LOCAL_FILE_KEY]
127
- file_type = FILE_TYPES[file_type_name]
128
  metadata: Metadata = st.session_state[Metadata]
129
  names = metadata.names()
130
  project: CurrentProject = st.session_state[CurrentProject]
131
  folder = project.path
132
  if url:
133
- file = file_from_url(file_type, url, names, folder)
134
  elif uploaded_file:
135
- file = file_from_upload(file_type, uploaded_file, names, folder)
136
  else:
137
  resource_type = st.session_state[_MANUAL_RESOURCE_TYPE_KEY]
138
  file = file_from_form(resource_type, names, folder)
@@ -191,7 +193,7 @@ def _render_resource_details(selected_file: Resource):
191
  )
192
 
193
 
194
- def _render_resource(prefix: int, file: FileObject | FileSet, is_file_object: bool):
195
  parent_options = [f.name for f in st.session_state[Metadata].distribution]
196
  key = f"{prefix}_parents"
197
  st.multiselect(
@@ -264,10 +266,7 @@ def _render_resource(prefix: int, file: FileObject | FileSet, is_file_object: bo
264
  )
265
  if is_file_object:
266
  st.markdown("First rows of data:")
267
- is_url = file.content_url and file.content_url.startswith("http")
268
  if file.df is not None:
269
  st.dataframe(file.df, height=DF_HEIGHT)
270
- elif is_url:
271
- st.button("Trigger download")
272
  else:
273
- st.markdown("No rendering possible.")
 
11
  from core.files import FILE_OBJECT
12
  from core.files import FILE_SET
13
  from core.files import FILE_TYPES
14
+ from core.files import is_url
15
  from core.files import RESOURCE_TYPES
16
+ from core.files import trigger_download
17
  from core.path import get_resource_path
18
  from core.record_sets import infer_record_sets
19
  from core.state import CurrentProject
 
57
  metadata: Metadata = st.session_state[Metadata]
58
  warning = ""
59
  for resource in metadata.distribution:
60
+ if not isinstance(resource, FileObject):
61
+ continue
62
  content_url = resource.content_url
63
  if content_url and not content_url.startswith("http"):
64
  path = get_resource_path(content_url)
65
  if not path.exists():
66
  if OAUTH_CLIENT_ID:
67
  warning += (
68
+ f'⚠️ Resource "{resource.name}" points to a local file that'
69
  " doesn't exist on the disk. Fix this by changing the content"
70
  " URL.\n\n"
71
  )
72
  else:
73
  warning += (
74
+ f'⚠️ Resource "{resource.name}" points to a local file that'
75
  " doesn't exist on the disk. Fix this by either downloading"
76
  f" it to {path} or changing the content URL.\n\n"
77
  )
 
111
  def _render_upload_panel():
112
  """Renders the form to upload from local or upload from URL."""
113
  with st.form(key="upload_form", clear_on_submit=True):
 
114
  tab1, tab2, tab3 = st.tabs([
115
  "Import from a local file", "Import from a URL", "Add manually"
116
  ])
 
127
  def handle_on_click():
128
  url = st.session_state[_DISTANT_URL_KEY]
129
  uploaded_file = st.session_state[_LOCAL_FILE_KEY]
 
130
  metadata: Metadata = st.session_state[Metadata]
131
  names = metadata.names()
132
  project: CurrentProject = st.session_state[CurrentProject]
133
  folder = project.path
134
  if url:
135
+ file = file_from_url(url, names, folder)
136
  elif uploaded_file:
137
+ file = file_from_upload(uploaded_file, names, folder)
138
  else:
139
  resource_type = st.session_state[_MANUAL_RESOURCE_TYPE_KEY]
140
  file = file_from_form(resource_type, names, folder)
 
193
  )
194
 
195
 
196
+ def _render_resource(prefix: int, file: Resource, is_file_object: bool):
197
  parent_options = [f.name for f in st.session_state[Metadata].distribution]
198
  key = f"{prefix}_parents"
199
  st.multiselect(
 
266
  )
267
  if is_file_object:
268
  st.markdown("First rows of data:")
 
269
  if file.df is not None:
270
  st.dataframe(file.df, height=DF_HEIGHT)
 
 
271
  else:
272
+ st.button("Trigger download", on_click=trigger_download, args=(file,))
views/record_sets.py CHANGED
@@ -30,6 +30,8 @@ from views.source import render_source
30
 
31
  _NUM_RECORDS = 3
32
  _TIMEOUT_SECONDS = 1
 
 
33
 
34
 
35
  class _Result(TypedDict):
@@ -214,6 +216,7 @@ class FieldDataFrame:
214
 
215
 
216
  def render_record_sets():
 
217
  col1, col2 = st.columns([1, 1])
218
  with col1:
219
  with st.spinner("Generating the dataset..."):
@@ -361,7 +364,7 @@ def _render_left_panel():
361
  left.button(
362
  "⚠️",
363
  key=f"idea-{prefix}",
364
- disabled=True,
365
  help=textwrap.dedent(f"""**Error**:
366
  ```
367
  {exception}
 
30
 
31
  _NUM_RECORDS = 3
32
  _TIMEOUT_SECONDS = 1
33
+ _INFO = """RecordSets describe sets of structured records obtained from resources or
34
+ other RecordSets. You can think of RecordSets as tables with typed fields."""
35
 
36
 
37
  class _Result(TypedDict):
 
216
 
217
 
218
  def render_record_sets():
219
+ st.info(_INFO, icon="💡")
220
  col1, col2 = st.columns([1, 1])
221
  with col1:
222
  with st.spinner("Generating the dataset..."):
 
364
  left.button(
365
  "⚠️",
366
  key=f"idea-{prefix}",
367
+ on_click=lambda: _generate_data_with_timeout.clear(),
368
  help=textwrap.dedent(f"""**Error**:
369
  ```
370
  {exception}
views/splash.py CHANGED
@@ -5,6 +5,7 @@ import streamlit as st
5
 
6
  from core.constants import OAUTH_CLIENT_ID
7
  from core.past_projects import save_current_project
 
8
  from core.query_params import set_project
9
  from core.state import CurrentProject
10
  from core.state import Metadata
@@ -12,6 +13,16 @@ import mlcroissant as mlc
12
  from views.load import render_load
13
  from views.previous_files import render_previous_files
14
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  def render_splash():
17
  if OAUTH_CLIENT_ID:
@@ -39,12 +50,19 @@ def render_splash():
39
  with st.expander("**Try out an example!**", expanded=True):
40
 
41
  def create_example(dataset: str):
42
- url = f"https://raw.githubusercontent.com/mlcommons/croissant/main/datasets/{dataset.lower()}/metadata.json"
 
43
  try:
44
  json = requests.get(url).json()
45
  metadata = mlc.Metadata.from_json(mlc.Issues(), json, None)
46
  st.session_state[Metadata] = Metadata.from_canonical(metadata)
47
  save_current_project()
 
 
 
 
 
 
48
  except Exception as exception:
49
  logging.error(exception)
50
  st.error(
@@ -55,15 +73,7 @@ def render_splash():
55
 
56
  dataset = st.selectbox(
57
  label="Dataset",
58
- options=[
59
- "Titanic",
60
- "FLORES-200",
61
- "GPT-3",
62
- "COCO2014",
63
- "PASS",
64
- "MovieLens",
65
- "Bigcode-The-Stack",
66
- ],
67
  )
68
  st.button(
69
  f"{dataset} dataset",
 
5
 
6
  from core.constants import OAUTH_CLIENT_ID
7
  from core.past_projects import save_current_project
8
+ from core.path import get_resource_path
9
  from core.query_params import set_project
10
  from core.state import CurrentProject
11
  from core.state import Metadata
 
13
  from views.load import render_load
14
  from views.previous_files import render_previous_files
15
 
16
+ _DATASETS = {
17
+ "Titanic": ["data/embarkation_ports.csv", "data/genders.csv"],
18
+ "FLORES-200": [],
19
+ "GPT-3": [],
20
+ "COCO2014": [],
21
+ "PASS": [],
22
+ "MovieLens": [],
23
+ "Bigcode-The-Stack": [],
24
+ }
25
+
26
 
27
  def render_splash():
28
  if OAUTH_CLIENT_ID:
 
50
  with st.expander("**Try out an example!**", expanded=True):
51
 
52
  def create_example(dataset: str):
53
+ base = f"https://raw.githubusercontent.com/mlcommons/croissant/main/datasets/{dataset.lower()}"
54
+ url = f"{base}/metadata.json"
55
  try:
56
  json = requests.get(url).json()
57
  metadata = mlc.Metadata.from_json(mlc.Issues(), json, None)
58
  st.session_state[Metadata] = Metadata.from_canonical(metadata)
59
  save_current_project()
60
+ # Write supplementary files.
61
+ files = _DATASETS.get(dataset, [])
62
+ for file in files:
63
+ path = get_resource_path(file)
64
+ json = requests.get(f"{base}/{file}")
65
+ path.write_bytes(json.content)
66
  except Exception as exception:
67
  logging.error(exception)
68
  st.error(
 
73
 
74
  dataset = st.selectbox(
75
  label="Dataset",
76
+ options=_DATASETS.keys(),
 
 
 
 
 
 
 
 
77
  )
78
  st.button(
79
  f"{dataset} dataset",