Spaces:
Sleeping
Sleeping
from __future__ import annotations | |
from typing import TYPE_CHECKING, Sequence | |
import joblib | |
from tqdm import tqdm | |
if TYPE_CHECKING: | |
from pathlib import Path | |
__all__ = ["serialize", "deserialize"] | |
def serialize(data: Sequence[str], path: Path, max_size: int = 100000, show_progress: bool = False) -> None: | |
"""Serialize data to a file | |
Args: | |
data: The data to serialize | |
path: The path to save the serialized data | |
max_size: The maximum size a chunk can be (in elements) | |
show_progress: Whether to show a progress bar | |
""" | |
# first file is path, next chunks have ".1", ".2", etc. appended | |
for i, chunk in enumerate( | |
tqdm( | |
[data[i : i + max_size] for i in range(0, len(data), max_size)], | |
unit="chunk", | |
disable=not show_progress, | |
), | |
): | |
fd = path.with_suffix(f".{i}.pkl" if i else ".pkl") | |
with fd.open("wb") as f: | |
joblib.dump(chunk, f, compress=3) | |
def deserialize(path: Path) -> Sequence[str]: | |
"""Deserialize data from a file | |
Args: | |
path: The path to the serialized data | |
Returns: | |
The deserialized data | |
""" | |
data = [] | |
i = 0 | |
while (fd := path.with_suffix(f".{i}.pkl" if i else ".pkl")).exists(): | |
with fd.open("rb") as f: | |
data.extend(joblib.load(f)) | |
i += 1 | |
return data | |