|
""" |
|
This script is used to curate the data for the project.

Implement your functions to clean the data and prepare it for model training.

Note: the competition requires that you use FiftyOne for data curation and you are only allowed to
use the approved dataset from the hub, Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set, which can
be found here: https://huggingface.co/datasets/Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set
|
""" |
|
|
|
import fiftyone as fo |
|
import fiftyone.utils.huggingface as fouh |
|
|
|
|
|
|
|
def shuffle_data(dataset, seed=51):
    """Shuffle the dataset.

    Args:
        dataset: The collection to shuffle. Only its ``shuffle(seed=...)``
            method is used.
        seed (int): Random seed forwarded to ``dataset.shuffle``. Defaults
            to 51, the value this script previously hard-coded, so existing
            callers keep the same behavior.

    Returns:
        The result of ``dataset.shuffle(seed=seed)``.
    """
    return dataset.shuffle(seed=seed)
|
|
|
def take_random_sample(dataset, size=10, seed=51):
    """Take a random sample from the dataset.

    Args:
        dataset: The collection to sample from. Only its
            ``take(size=..., seed=...)`` method is used.
        size (int): Number of samples requested. Defaults to 10, the value
            this script previously hard-coded.
        seed (int): Random seed forwarded to ``dataset.take``. Defaults to
            51, the value this script previously hard-coded.

    Returns:
        The result of ``dataset.take(size=size, seed=seed)``.
    """
    return dataset.take(size=size, seed=seed)
|
|
|
def prepare_dataset(name):
    """
    Prepare the dataset for model training.

    Validates the dataset name, loads the ``train`` split from the hub,
    shuffles it, takes a random sample, and returns a clone of that sample.

    Args:
        name (str): The name of the dataset to load. Must be "Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set".

    Returns:
        fiftyone.core.dataset.Dataset: The curated dataset.

    Raises:
        ValueError: If the provided dataset name is not the approved one.

    Note:
        The following code block MUST NOT be removed from your submission:

            APPROVED_DATASET = "Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set"

            if name != APPROVED_DATASET:
                raise ValueError(f"Only the approved dataset '{APPROVED_DATASET}' is allowed for this competition.")

        This ensures that only the approved dataset is used for the competition.
    """
    APPROVED_DATASET = "Voxel51/Data-Centric-Visual-AI-Challenge-Train-Set"

    # Reject anything but the approved dataset before touching the network.
    if name != APPROVED_DATASET:
        raise ValueError(f"Only the approved dataset '{APPROVED_DATASET}' is allowed for this competition.")

    dataset = fouh.load_from_hub(name, split="train")

    # Curation steps: shuffle, then keep a random sample.
    dataset = shuffle_data(dataset)
    dataset = take_random_sample(dataset)

    # Clone the curated result so it is returned as its own dataset.
    curated_dataset = dataset.clone()
    return curated_dataset