Spaces:
Running
Running
| from typing import List, Union | |
| import ray | |
| from ray.data import Dataset | |
| from graphgen.bases.base_reader import BaseReader | |
class CSVReader(BaseReader):
    """
    Reader that loads CSV files into a Ray Dataset.

    Expected columns:
    - type: The type of the document (e.g., "text", "image", etc.)
    - if type is "text", a "content" column must be present.

    NOTE(review): the column rules above are not checked in this class
    itself; they are presumably enforced by the ``_validate_batch`` and
    ``_should_keep_item`` helpers, which are not defined here (likely
    inherited from ``BaseReader``) -- confirm against the base class.
    """

    def read(self, input_path: Union[str, List[str]]) -> Dataset:
        """
        Load one or more CSV files as a Ray Dataset, then validate and filter it.

        :param input_path: Path to a CSV file, or a list of CSV file paths.
        :return: Ray Dataset after batch validation and row filtering.
        """
        # Stage 1: let Ray parse the CSV file(s) into a dataset.
        raw_rows = ray.data.read_csv(input_path)

        # Stage 2: run the batch validator; batches are handed to it as
        # pandas DataFrames because of batch_format="pandas".
        validated_rows = raw_rows.map_batches(
            self._validate_batch,
            batch_format="pandas",
        )

        # Stage 3: keep only the rows accepted by the keep-predicate.
        return validated_rows.filter(self._should_keep_item)