Spaces:
Runtime error
Runtime error
| import json | |
| import re | |
| from typing import Union, Dict | |
| from pathlib import Path | |
| import os | |
# Default upper bound on the length of names returned by sanitize_filename().
MAX_FILE_NAME_LENGTH = 100
def read_jsonl_to_mapping(
    jsonl_file: Union[str, Path],
    key_col: str,
    value_col: str,
    base_path=None,
    overwrite=True,
) -> Dict[str, str]:
    """
    Read two columns, indicated by `key_col` and `value_col`, from the
    given jsonl file and return them as a mapping dict.

    Args:
        jsonl_file: Path of the JSON Lines file (one JSON object per line).
        key_col: Field whose value becomes the mapping key.
        value_col: Field whose value becomes the mapping value.
        base_path: If truthy, every value is joined onto it with
            `os.path.join` before being stored.
        overwrite: Duplicate-key policy. When true, the last occurrence of
            a key wins; when false, the first occurrence is kept.

    Returns:
        Dict mapping each `key_col` value to its `value_col` value.

    Raises:
        KeyError: If a record lacks `key_col` or `value_col`.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    mapping = {}
    # JSON Lines files are UTF-8; do not depend on the platform's locale default.
    with open(jsonl_file, 'r', encoding='utf-8') as file:
        # Stream line by line instead of loading the whole file with readlines().
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no record and
                # would otherwise make json.loads raise.
                continue
            data = json.loads(line)
            key = data[key_col]
            value = data[value_col]
            if base_path:
                value = os.path.join(base_path, value)
            if overwrite or key not in mapping:
                mapping[key] = value
    return mapping
def sanitize_filename(name: str, max_len: int = MAX_FILE_NAME_LENGTH) -> str:
    """
    Turn an arbitrary string into a filename-safe one of bounded length.

    Each of the characters \\ / * ? : " < > | is replaced by an underscore,
    and the result is cut to at most `max_len` characters.

    Args:
        name: The raw string to clean.
        max_len: Maximum length of the returned string. A negative value is
            passed straight to the slice and therefore trims from the end.

    Returns:
        The cleaned, truncated string.
    """
    # The character class already contains '/', so no separate slash pass is needed.
    cleaned = re.sub(r'[\\/*?:"<>|]', '_', name)
    # Clamp the cut-off to the string's own length before slicing.
    limit = max_len if max_len < len(cleaned) else len(cleaned)
    return cleaned[:limit]
def transform_gen_fn_to_id(audio_file: Path, task: str) -> str:
    """
    Derive the sample id for a generated audio file from its filename.

    The rule applied depends on `task`:
        - "svs": the part of the stem before the first underscore.
        - "tta", "sta_test", "tta_test": first 12 characters of the stem + ".wav".
        - "ttm": first 11 characters of the stem.
        - "v2a": the stem without its last underscore-separated part + ".mp4".
        - "sta_base": "Y" + first 11 characters of the stem + ".wav".
        - "sr" and any other task: the stem unchanged.

    Args:
        audio_file: Path of the generated audio file.
        task: Task identifier selecting the naming rule.

    Returns:
        The id string for the file.
    """
    stem = audio_file.stem

    if task == "svs":
        return stem.split("_")[0]
    if task in ("tta", "sta_test", "tta_test"):
        return stem[:12] + '.wav'
    if task == "ttm":
        return stem[:11]
    if task == "v2a":
        return stem.rsplit("_", 1)[0] + ".mp4"
    if task == "sta_base":
        return 'Y' + stem[:11] + '.wav'
    # "sr" and every unrecognised task use the bare stem.
    return stem
def audio_dir_to_mapping(audio_dir: Union[str, Path], task: str) -> Dict[str, str]:
    """
    Map the audio id of every entry directly inside `audio_dir` to that
    entry's resolved absolute path.

    Ids are derived from the filenames with `transform_gen_fn_to_id` using
    the naming rule of `task`. Every directory entry is included; entries
    are not filtered by type or extension.

    Args:
        audio_dir: Directory containing the generated audio files.
        task: Task identifier forwarded to `transform_gen_fn_to_id`.

    Returns:
        Dict from audio id to the resolved path as a string.
    """
    # `Union[...]` rather than `str | Path` keeps the signature importable on
    # Python < 3.10 and matches the annotation style of read_jsonl_to_mapping.
    # Entries are visited in sorted order, so when two files collapse to the
    # same id the one that sorts last silently wins.
    return {
        transform_gen_fn_to_id(audio_file, task): str(audio_file.resolve())
        for audio_file in sorted(Path(audio_dir).iterdir())
    }