File size: 3,448 Bytes
6b2dcd4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# -*- coding: utf-8 -*-
# Copyright (c) Louis Brulé Naudet. All Rights Reserved.
# This software may be used and distributed according to the terms of the License Agreement.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datasets
import polars as pl


class Dataset:
    """
    Namespace of static helpers around Hugging Face `datasets` objects.

    The class holds no state; every method is a `staticmethod` that either
    reads a dataset from disk, writes one to disk, or converts one to a
    Polars DataFrame.
    """

    @staticmethod
    def load(
        dataset_path: str
    ):
        """
        Load a dataset from disk.

        Parameters
        ----------
        dataset_path : str
            The path to the dataset on disk.

        Returns
        -------
        datasets.Dataset or datasets.DatasetDict
            Whatever `datasets.load_from_disk` returns for the given path.

        Notes
        -----
        This method is a thin wrapper around `datasets.load_from_disk`. The
        directory is expected to have been produced by `save_to_disk`.

        Example
        -------
        >>> dataset_path = "/path/to/dataset"
        >>> dataset = Dataset.load(dataset_path)
        """
        return datasets.load_from_disk(
            dataset_path=dataset_path
        )

    @staticmethod
    def save(
        dataset: datasets.Dataset,
        dataset_path: str
    ) -> None:
        """
        Save a dataset to disk.

        Parameters
        ----------
        dataset : datasets.Dataset
            The dataset to be saved.

        dataset_path : str
            The path where the dataset will be saved on disk.

        Returns
        -------
        None

        Notes
        -----
        `save_to_disk` is a method of the dataset object, not a module-level
        function of the `datasets` package, so it is invoked on `dataset`
        itself. The result can be read back with `Dataset.load`.

        Example
        -------
        >>> dataset = datasets.load_dataset("my_dataset", split="train")
        >>> dataset_path = "/path/to/save/dataset"
        >>> Dataset.save(dataset, dataset_path)
        """
        dataset.save_to_disk(dataset_path)

    @staticmethod
    def convert_to_polars(
        dataset: datasets.Dataset
    ) -> pl.DataFrame:
        """
        Convert a dataset to a Polars DataFrame with a leading row index.

        Parameters
        ----------
        dataset : datasets.Dataset
            The dataset to be converted to a Polars DataFrame.

        Returns
        -------
        pl.DataFrame
            A Polars DataFrame built from the dataset's Arrow table, with an
            extra first column named "index" holding the row number.

        Notes
        -----
        The conversion goes through `pl.from_arrow` on `dataset.data.table`.
        The row-index column is added with `DataFrame.with_row_index` when the
        installed Polars provides it, and with the older
        `DataFrame.with_row_count(name="index")` otherwise, so the column is
        called "index" in both cases.

        NOTE(review): `dataset.data` appears to be the underlying Arrow table;
        presumably an indices mapping left by `select`/`shuffle`/`filter` is
        not applied here - confirm against callers.

        Raises
        ------
        Exception
            Any error raised by `pl.from_arrow` or by adding the index column
            (for instance a pre-existing "index" column) is propagated.

        Examples
        --------
        >>> dataset = Dataset.load("/path/to/dataset")
        >>> dataframe = Dataset.convert_to_polars(dataset)
        """
        # Convert once; only the choice of row-index method depends on the
        # installed Polars version.
        frame = pl.from_arrow(dataset.data.table)

        try:
            return frame.with_row_index()

        except AttributeError:
            # Older Polars releases lack `with_row_index`; `with_row_count`
            # defaults to a different column name, hence the explicit name.
            return frame.with_row_count(
                name="index"
            )