# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
""" DatasetInfo and MetricInfo record information we know about a dataset and a metric.
This includes things that we know about the dataset statically, i.e.:
- description
- canonical location
- does it have validation and tests splits
- size
- etc.
This also includes the things that can and should be computed once we've
processed the dataset as well:
- number of examples (in each split)
- etc.
"""
import copy
import dataclasses
import json
import os
from dataclasses import asdict, dataclass, field
from typing import List, Optional, Union

from . import config
from .features import Features, Value
from .splits import SplitDict
from .tasks import TaskTemplate, task_template_from_dict
from .tasks.text_classification import TextClassification
from .utils import Version
from .utils.logging import get_logger


logger = get_logger(__name__)


@dataclass
class SupervisedKeysData:
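    """Names of the input feature and the output label for supervised learning (legacy from TFDS)."""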
input: str = ""
output: str = ""


@dataclass
class DownloadChecksumsEntryData:
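    """A single key/value entry of the download checksums mapping."""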
key: str = ""
value: str = ""


class MissingCachedSizesConfigError(Exception):
"""The expected cached sizes of the download file are missing."""


class NonMatchingCachedSizesError(Exception):
"""The prepared split doesn't have expected sizes."""


@dataclass
class PostProcessedInfo:
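    """Information about the resources of a post-processed dataset, e.g. the features
    and checksums of an index produced by post-processing."""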
features: Optional[Features] = None
resources_checksums: Optional[dict] = None
def __post_init__(self):
# Convert back to the correct classes when we reload from dict
if self.features is not None and not isinstance(self.features, Features):
self.features = Features.from_dict(self.features)
@classmethod
def from_dict(cls, post_processed_info_dict: dict) -> "PostProcessedInfo":
field_names = set(f.name for f in dataclasses.fields(cls))
return cls(**{k: v for k, v in post_processed_info_dict.items() if k in field_names})


@dataclass
class DatasetInfo:
"""Information about a dataset.
`DatasetInfo` documents datasets, including its name, version, and features.
See the constructor arguments and properties for a full list.
Note: Not all fields are known on construction and may be updated later.
Attributes:
description (str): A description of the dataset.
citation (str): A BibTeX citation of the dataset.
homepage (str): A URL to the official homepage for the dataset.
license (str): The dataset's license. It can be the name of the license or a paragraph containing the terms of the license.
features (Features, optional): The features used to specify the dataset's column types.
post_processed (PostProcessedInfo, optional): Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
supervised_keys (SupervisedKeysData, optional): Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).
builder_name (str, optional): The name of the :class:`GeneratorBasedBuilder` subclass used to create the dataset. Usually matched to the corresponding script name. It is also the snake_case version of the dataset builder class name.
config_name (str, optional): The name of the configuration derived from :class:`BuilderConfig`
version (str or Version, optional): The version of the dataset.
splits (dict, optional): The mapping between split name and metadata.
download_checksums (dict, optional): The mapping between the URL to download the dataset's checksums and corresponding metadata.
download_size (int, optional): The size of the files to download to generate the dataset, in bytes.
post_processing_size (int, optional): Size of the dataset in bytes after post-processing, if any.
dataset_size (int, optional): The combined size in bytes of the Arrow tables for all splits.
size_in_bytes (int, optional): The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files).
task_templates (List[TaskTemplate], optional): The task templates to prepare the dataset for during training and evaluation. Each template casts the dataset's :class:`Features` to standardized column names and types as detailed in :py:mod:`datasets.tasks`.
**config_kwargs: Keyword arguments to be passed to the :class:`BuilderConfig` and used in the :class:`DatasetBuilder`.
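
    Example (the dataset name is illustrative; any loaded dataset exposes its info)::

        from datasets import load_dataset

        ds = load_dataset("squad", split="train")
        print(ds.info.description)
        print(ds.info.features)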
"""
# Set in the dataset scripts
description: str = field(default_factory=str)
citation: str = field(default_factory=str)
homepage: str = field(default_factory=str)
license: str = field(default_factory=str)
features: Optional[Features] = None
post_processed: Optional[PostProcessedInfo] = None
supervised_keys: Optional[SupervisedKeysData] = None
task_templates: Optional[List[TaskTemplate]] = None
# Set later by the builder
builder_name: Optional[str] = None
config_name: Optional[str] = None
version: Optional[Union[str, Version]] = None
# Set later by `download_and_prepare`
splits: Optional[dict] = None
download_checksums: Optional[dict] = None
download_size: Optional[int] = None
post_processing_size: Optional[int] = None
dataset_size: Optional[int] = None
size_in_bytes: Optional[int] = None
def __post_init__(self):
# Convert back to the correct classes when we reload from dict
if self.features is not None and not isinstance(self.features, Features):
self.features = Features.from_dict(self.features)
if self.post_processed is not None and not isinstance(self.post_processed, PostProcessedInfo):
self.post_processed = PostProcessedInfo.from_dict(self.post_processed)
if self.version is not None and not isinstance(self.version, Version):
if isinstance(self.version, str):
self.version = Version(self.version)
else:
self.version = Version.from_dict(self.version)
if self.splits is not None and not isinstance(self.splits, SplitDict):
self.splits = SplitDict.from_split_dict(self.splits)
if self.supervised_keys is not None and not isinstance(self.supervised_keys, SupervisedKeysData):
if isinstance(self.supervised_keys, (tuple, list)):
self.supervised_keys = SupervisedKeysData(*self.supervised_keys)
else:
self.supervised_keys = SupervisedKeysData(**self.supervised_keys)
# Parse and make a list of templates
if self.task_templates is not None:
if isinstance(self.task_templates, (list, tuple)):
templates = [
template if isinstance(template, TaskTemplate) else task_template_from_dict(template)
for template in self.task_templates
]
self.task_templates = [template for template in templates if template is not None]
elif isinstance(self.task_templates, TaskTemplate):
self.task_templates = [self.task_templates]
else:
template = task_template_from_dict(self.task_templates)
self.task_templates = [template] if template is not None else []
# Insert labels and mappings for text classification
if self.task_templates is not None:
self.task_templates = list(self.task_templates)
if self.features is not None:
for idx, template in enumerate(self.task_templates):
if isinstance(template, TextClassification):
labels = self.features[template.label_column].names
self.task_templates[idx] = TextClassification(
text_column=template.text_column, label_column=template.label_column, labels=labels
)
def _license_path(self, dataset_info_dir):
return os.path.join(dataset_info_dir, config.LICENSE_FILENAME)

    def write_to_directory(self, dataset_info_dir):
"""Write `DatasetInfo` as JSON to `dataset_info_dir`.
Also save the license separately in LICENCE.
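
        Example (the description and directory path are hypothetical)::

            info = DatasetInfo(description="A small example dataset")
            info.write_to_directory("/path/to/dataset_info_dir")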
"""
with open(os.path.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "wb") as f:
self._dump_info(f)
with open(os.path.join(dataset_info_dir, config.LICENSE_FILENAME), "wb") as f:
self._dump_license(f)
def _dump_info(self, file):
"""Dump info in `file` file-like object open in bytes mode (to support remote files)"""
file.write(json.dumps(asdict(self)).encode("utf-8"))
def _dump_license(self, file):
"""Dump license in `file` file-like object open in bytes mode (to support remote files)"""
file.write(self.license.encode("utf-8"))
@classmethod
def from_merge(cls, dataset_infos: List["DatasetInfo"]):
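        """Merge several :class:`DatasetInfo` objects into one, deduplicating the text fields
        and keeping only the task templates common to all of them."""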
def unique(values):
seen = set()
for value in values:
if value not in seen:
seen.add(value)
yield value
dataset_infos = [dset_info.copy() for dset_info in dataset_infos if dset_info is not None]
description = "\n\n".join(unique(info.description for info in dataset_infos))
citation = "\n\n".join(unique(info.citation for info in dataset_infos))
homepage = "\n\n".join(unique(info.homepage for info in dataset_infos))
license = "\n\n".join(unique(info.license for info in dataset_infos))
features = None
supervised_keys = None
task_templates = None
# Find common task templates across all dataset infos
all_task_templates = [info.task_templates for info in dataset_infos if info.task_templates is not None]
if len(all_task_templates) > 1:
task_templates = list(set(all_task_templates[0]).intersection(*all_task_templates[1:]))
elif len(all_task_templates):
task_templates = list(set(all_task_templates[0]))
# If no common task templates found, replace empty list with None
task_templates = task_templates if task_templates else None
return cls(
description=description,
citation=citation,
homepage=homepage,
license=license,
features=features,
supervised_keys=supervised_keys,
task_templates=task_templates,
)

    @classmethod
def from_directory(cls, dataset_info_dir: str) -> "DatasetInfo":
"""Create DatasetInfo from the JSON file in `dataset_info_dir`.
This function updates all the dynamically generated fields (num_examples,
hash, time of creation,...) of the DatasetInfo.
This will overwrite all previous metadata.
Args:
dataset_info_dir (`str`): The directory containing the metadata file. This
should be the root directory of a specific dataset version.
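
        Example (a hypothetical directory path)::

            info = DatasetInfo.from_directory("/path/to/dataset_info_dir")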
"""
logger.info("Loading Dataset info from %s", dataset_info_dir)
if not dataset_info_dir:
raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.")
with open(os.path.join(dataset_info_dir, config.DATASET_INFO_FILENAME), "r", encoding="utf-8") as f:
dataset_info_dict = json.load(f)
return cls.from_dict(dataset_info_dict)
@classmethod
def from_dict(cls, dataset_info_dict: dict) -> "DatasetInfo":
field_names = set(f.name for f in dataclasses.fields(cls))
return cls(**{k: v for k, v in dataset_info_dict.items() if k in field_names})
def update(self, other_dataset_info: "DatasetInfo", ignore_none=True):
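        """Update this :class:`DatasetInfo` in place with a deep copy of the fields of
        `other_dataset_info`, skipping its None-valued fields unless `ignore_none` is False."""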
self_dict = self.__dict__
self_dict.update(
**{
k: copy.deepcopy(v)
for k, v in other_dataset_info.__dict__.items()
if (v is not None or not ignore_none)
}
)
def copy(self) -> "DatasetInfo":
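        """Return a deep copy of this :class:`DatasetInfo`."""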
return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})


class DatasetInfosDict(dict):
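    """A mapping from configuration name to :class:`DatasetInfo`, with helpers to read
    and write all the infos of a dataset at once."""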
def write_to_directory(self, dataset_infos_dir, overwrite=False):
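        """Write the dataset infos of this dict as a single JSON file in `dataset_infos_dir`,
        completing any existing dataset infos file there unless `overwrite` is True."""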
total_dataset_infos = {}
dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)
if os.path.exists(dataset_infos_path) and not overwrite:
logger.info("Dataset Infos already exists in {}. Completing it with new infos.".format(dataset_infos_dir))
total_dataset_infos = self.from_directory(dataset_infos_dir)
else:
logger.info("Writing new Dataset Infos in {}".format(dataset_infos_dir))
total_dataset_infos.update(self)
with open(dataset_infos_path, "w", encoding="utf-8") as f:
json.dump({config_name: asdict(dset_info) for config_name, dset_info in total_dataset_infos.items()}, f)
@classmethod
def from_directory(cls, dataset_infos_dir):
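        """Read the dataset infos JSON file in `dataset_infos_dir` into a :class:`DatasetInfosDict`."""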
logger.info("Loading Dataset Infos from {}".format(dataset_infos_dir))
with open(os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME), "r", encoding="utf-8") as f:
dataset_infos_dict = {
config_name: DatasetInfo.from_dict(dataset_info_dict)
for config_name, dataset_info_dict in json.load(f).items()
}
return cls(**dataset_infos_dict)


@dataclass
class MetricInfo:
"""Information about a metric.
`MetricInfo` documents a metric, including its name, version, and features.
See the constructor arguments and properties for a full list.
Note: Not all fields are known on construction and may be updated later.
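
    Example (the metric name is illustrative; any loaded metric exposes its info)::

        from datasets import load_metric

        metric = load_metric("accuracy")
        print(metric.info.description)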
"""
# Set in the dataset scripts
description: str
citation: str
features: Features
inputs_description: str = field(default_factory=str)
homepage: str = field(default_factory=str)
license: str = field(default_factory=str)
codebase_urls: List[str] = field(default_factory=list)
reference_urls: List[str] = field(default_factory=list)
streamable: bool = False
format: Optional[str] = None
# Set later by the builder
metric_name: Optional[str] = None
config_name: Optional[str] = None
experiment_id: Optional[str] = None
def __post_init__(self):
assert "predictions" in self.features, "Need to have at least a 'predictions' field in 'features'."
if self.format is not None:
for key, value in self.features.items():
if not isinstance(value, Value):
                    raise ValueError(
                        f"When using '{self.format}' format, all features should be a `datasets.Value` feature. "
                        f"Here {key} is an instance of {value.__class__.__name__}"
                    )

    def write_to_directory(self, metric_info_dir):
"""Write `MetricInfo` as JSON to `metric_info_dir`.
Also save the license separately in LICENCE.
"""
with open(os.path.join(metric_info_dir, config.METRIC_INFO_FILENAME), "w", encoding="utf-8") as f:
json.dump(asdict(self), f)
with open(os.path.join(metric_info_dir, config.LICENSE_FILENAME), "w", encoding="utf-8") as f:
f.write(self.license)

    @classmethod
def from_directory(cls, metric_info_dir) -> "MetricInfo":
"""Create MetricInfo from the JSON file in `metric_info_dir`.
Args:
metric_info_dir: `str` The directory containing the metadata file. This
should be the root directory of a specific dataset version.
"""
logger.info("Loading Metric info from %s", metric_info_dir)
if not metric_info_dir:
raise ValueError("Calling MetricInfo.from_directory() with undefined metric_info_dir.")
with open(os.path.join(metric_info_dir, config.METRIC_INFO_FILENAME), "r", encoding="utf-8") as f:
metric_info_dict = json.load(f)
return cls.from_dict(metric_info_dict)
@classmethod
def from_dict(cls, metric_info_dict: dict) -> "MetricInfo":
field_names = set(f.name for f in dataclasses.fields(cls))
return cls(**{k: v for k, v in metric_info_dict.items() if k in field_names})