Spaces:

assasinatee
/

STAR

Runtime error

STAR / utils /general.py

Yixuan Li

first commit

4853fdc about 1 month ago

2.2 kB

	import json
	import re
	from typing import Union, Dict
	from pathlib import Path
	import os

	# Default value of `max_len` in `sanitize_filename`: sanitized names are
	# cut to at most this many characters unless the caller passes another cap.
	MAX_FILE_NAME_LENGTH = 100


	def read_jsonl_to_mapping(
	    jsonl_file: Union[str, Path],
	    key_col: str,
	    value_col: str,
	    base_path=None,
	    overwrite=True,
	) -> Dict[str, str]:
	    """
	    Build a dict from two fields of every record in a JSON Lines file.

	    Args:
	        jsonl_file: Path of the JSONL file (one JSON document per line).
	        key_col: Field whose value becomes the dict key.
	        value_col: Field whose value becomes the dict value.
	        base_path: When truthy, every value is joined onto this path with
	            `os.path.join` before being stored.
	        overwrite: Duplicate-key policy. True keeps the last occurrence of
	            a key, False keeps the first one.

	    Returns:
	        The key -> value mapping.

	    Raises:
	        json.JSONDecodeError: A non-blank line is not valid JSON.
	        KeyError: A JSON object lacks `key_col` or `value_col`.
	    """
	    mapping = {}
	    # JSON text is UTF-8; do not depend on the platform's locale encoding.
	    with open(jsonl_file, 'r', encoding='utf-8') as file:
	        # Stream the file instead of materialising every line up front.
	        for line in file:
	            line = line.strip()
	            if not line:
	                # Blank lines (e.g. a trailing newline) carry no record and
	                # would otherwise make json.loads raise.
	                continue
	            data = json.loads(line)
	            key = data[key_col]
	            value = data[value_col]
	            if base_path:
	                value = os.path.join(base_path, value)
	            if overwrite or key not in mapping:
	                mapping[key] = value
	    return mapping


	def sanitize_filename(name: str, max_len: int = MAX_FILE_NAME_LENGTH) -> str:
	    """
	    Return `name` with filename-hostile characters replaced and its length capped.

	    Each of the characters \\ / * ? : " < > | is replaced by an underscore,
	    then the result is sliced to `max_len` characters.
	    """
	    # The character class already covers '/', so one substitution suffices.
	    cleaned = re.sub(r'[\\/*?:"<>\|]', '_', name)
	    # Slicing tolerates a cap longer than the string; no clamping needed.
	    return cleaned[:max_len]


	def transform_gen_fn_to_id(audio_file: Path, task: str) -> str:
	    """
	    Derive the sample id for `task` from the name of a generated audio file.

	    The id is computed from the file stem (name without its last suffix):

	    - "svs": the part before the first underscore.
	    - "tta", "sta_test", "tta_test": first 12 characters plus ".wav".
	    - "ttm": first 11 characters.
	    - "v2a": everything before the last underscore plus ".mp4".
	    - "sta_base": "Y", then the first 11 characters, plus ".wav".
	    - "sr" and any other task: the stem unchanged.
	    """
	    stem = audio_file.stem

	    if task == "svs":
	        return stem.split("_")[0]
	    if task in ("tta", "sta_test", "tta_test"):
	        return stem[:12] + '.wav'
	    if task == "ttm":
	        return stem[:11]
	    if task == "v2a":
	        return stem.rsplit("_", 1)[0] + ".mp4"
	    if task == "sta_base":
	        return 'Y' + stem[:11] + '.wav'

	    # "sr" and every unrecognised task use the stem as-is.
	    return stem


	def audio_dir_to_mapping(audio_dir: Union[str, Path], task: str) -> Dict[str, str]:
	    """
	    Map the id of every entry in `audio_dir` to that entry's resolved path.

	    Args:
	        audio_dir: Directory to scan. Only its direct entries are visited.
	        task: Task name forwarded to `transform_gen_fn_to_id` to derive ids.

	    Returns:
	        Dict from id to the resolved path as a string. Entries are
	        processed in sorted order, so when two entries yield the same id
	        the one that sorts last wins.
	    """
	    audio_dir = Path(audio_dir)
	    return {
	        transform_gen_fn_to_id(audio_file, task): str(audio_file.resolve())
	        for audio_file in sorted(audio_dir.iterdir())
	    }