"""Helpers for building id -> path mappings and filesystem-safe file names."""

import json
import os
import re
from pathlib import Path
from typing import Dict, Union

MAX_FILE_NAME_LENGTH = 100

# Characters replaced by "_" in `sanitize_filename`; compiled once at import.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/*?:"<>|]')


def read_jsonl_to_mapping(
    jsonl_file: Union[str, Path],
    key_col: str,
    value_col: str,
    base_path=None,
    overwrite=True,
) -> Dict[str, str]:
    """Build a ``{key: value}`` dict from two fields of a JSON Lines file.

    Each non-blank line is parsed as one JSON object; ``key_col`` and
    ``value_col`` name the fields used as the mapping key and value.

    Args:
        jsonl_file: Path of the JSON Lines file to read (decoded as UTF-8).
        key_col: Field whose value becomes the mapping key.
        value_col: Field whose value becomes the mapping value.
        base_path: If truthy, every value is joined onto it with
            ``os.path.join(base_path, value)``.
        overwrite: Controls duplicate keys. When true, the last occurrence
            wins; when false, the first occurrence is kept.

    Returns:
        The mapping from ``key_col`` values to ``value_col`` values.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``key_col`` or ``value_col``.

    TODO: handle duplicate keys beyond the first-wins / last-wins choice
    offered by ``overwrite``.
    """
    mapping = {}
    with open(jsonl_file, "r", encoding="utf-8") as file:
        # Iterate lazily instead of loading every line with readlines().
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no
                # record; json.loads("") would raise on them.
                continue
            data = json.loads(line)
            key = data[key_col]
            value = data[value_col]
            if base_path:
                value = os.path.join(base_path, value)
            if overwrite or key not in mapping:
                mapping[key] = value
    return mapping


def sanitize_filename(name: str, max_len: int = MAX_FILE_NAME_LENGTH) -> str:
    """Replace unsafe characters in ``name`` and truncate it.

    Every occurrence of ``\\ / * ? : " < > |`` is replaced by ``_`` and the
    result is cut to at most ``max_len`` characters.

    Args:
        name: The raw string to turn into a file name.
        max_len: Maximum length of the result. Values below zero are
            treated as zero rather than trimming from the end of the name.

    Returns:
        The cleaned, truncated string (possibly empty).
    """
    cleaned = _UNSAFE_FILENAME_CHARS.sub("_", name)
    # Slicing already clamps to len(cleaned); only the lower bound needs care.
    return cleaned[: max(max_len, 0)]


def transform_gen_fn_to_id(audio_file: Path, task: str) -> str:
    """Derive an id string from a file's stem according to ``task``.

    The rules, applied to ``audio_file.stem``:

    * ``"svs"``: the part before the first ``_``.
    * ``"sr"``: the stem unchanged.
    * ``"tta"``, ``"sta_test"``, ``"tta_test"``: first 12 characters plus
      ``".wav"``.
    * ``"ttm"``: first 11 characters.
    * ``"v2a"``: everything before the last ``_`` plus ``".mp4"``.
    * ``"sta_base"``: ``"Y"`` plus the first 11 characters plus ``".wav"``.
    * any other task: the stem unchanged.

    Args:
        audio_file: Path whose ``stem`` is transformed.
        task: Task name selecting the rule.

    Returns:
        The derived id string.
    """
    stem = audio_file.stem
    if task == "svs":
        return stem.split("_")[0]
    if task in ("tta", "sta_test", "tta_test"):
        return stem[:12] + ".wav"
    if task == "ttm":
        return stem[:11]
    if task == "v2a":
        return stem.rsplit("_", 1)[0] + ".mp4"
    if task == "sta_base":
        return "Y" + stem[:11] + ".wav"
    # "sr" and any unrecognised task use the stem as-is.
    return stem


def audio_dir_to_mapping(audio_dir: Union[str, Path], task: str) -> dict:
    """Map derived ids to absolute paths for every entry in ``audio_dir``.

    Entries are visited in sorted order; each id is computed with
    ``transform_gen_fn_to_id(entry, task)``. If two entries yield the same
    id, the later one in sorted order wins.

    Args:
        audio_dir: Directory whose direct children are listed.
        task: Task name forwarded to ``transform_gen_fn_to_id``.

    Returns:
        Dict from id to ``str(entry.resolve())``.
    """
    # Union[...] rather than `str | Path`: the latter is evaluated when the
    # function is defined and fails on Python versions before 3.10.
    directory = Path(audio_dir)
    return {
        transform_gen_fn_to_id(entry, task): str(entry.resolve())
        for entry in sorted(directory.iterdir())
    }