croissant-editor / core /record_sets.py
marcenacp's picture
Deploy (see actual commits on https://github.com/mlcommons/croissant).
73ebcab
raw
history blame contribute delete
No virus
1.25 kB
from core.data_types import convert_dtype
from core.names import find_unique_name
from core.state import Field
from core.state import FileObject
from core.state import FileSet
from core.state import RecordSet
import mlcroissant as mlc
def infer_record_sets(file: FileObject | FileSet, names: set[str]) -> list[RecordSet]:
    """Infers the ml:RecordSets that can be derived from a FileObject/FileSet.

    Returns an empty list for FileSets (no inference support for the moment)
    and for FileObjects whose `df` is None. Otherwise returns a single
    RecordSet holding one Field per entry of `file.df.dtypes`; its name is
    `file.name + "_record_set"` passed through `find_unique_name` with `names`.
    """
    # Nothing can be inferred from a FileSet (unsupported for now) nor from a
    # FileObject whose underlying `pd.DataFrame` could not be built.
    if isinstance(file, FileSet) or file.df is None:
        return []

    def build_field(column, dtype) -> Field:
        # The field extracts its values from the same-named column of `file`.
        column_source = mlc.Source(
            distribution=file.id,
            extract=mlc.Extract(column=column),
        )
        return Field(
            id=column,
            name=column,
            data_types=[convert_dtype(dtype)],
            source=column_source,
            references=mlc.Source(),
        )

    inferred_fields = [
        build_field(column, dtype) for column, dtype in file.df.dtypes.items()
    ]
    record_set_name = find_unique_name(names, file.name + "_record_set")
    record_set = RecordSet(
        id=record_set_name,
        fields=inferred_fields,
        name=record_set_name,
        description="",
    )
    return [record_set]