xc_expt

Sleeping

App Files Files Community

xc_expt / extract_list.py

isuneast

init

37ab3e3 5 months ago

raw history blame contribute delete

No virus

2.34 kB

	import argparse
	import os

	from tools.log import logger


	def extract_list(folder_path, language, name, transcript_txt_file):
	logger.info(f"正在提取文本: {folder_path}\|{name}\|{language}")
	current_dir = os.getcwd()
	relative_path = os.path.relpath(folder_path, current_dir)
	print(relative_path)
	os.makedirs(os.path.dirname(transcript_txt_file), exist_ok=True)
	with open(transcript_txt_file, "w", encoding="utf-8") as f:
	# 遍历 raw 文件夹下的所有子文件夹
	for root, _, files in os.walk(relative_path):
	for file in files:
	if file.endswith(".lab"):
	lab_file_path = os.path.join(root, file)
	# 读取转写文本
	with open(lab_file_path, "r", encoding="utf-8") as lab_file:
	transcription = lab_file.read().strip()
	if len(transcription) == 0:
	continue
	# 获取对应的 WAV 文件路径
	# ./Data/宵宫/audios/raw
	# ./Data/宵宫/audios/wavs
	wav_file_path = os.path.splitext(lab_file_path)[0] + ".wav"
	if os.path.isfile(wav_file_path):
	wav_file_path = wav_file_path.replace("\\", "/").replace(
	"/raw", "/wavs"
	)
	# 写入数据到总的转写文本文件
	line = f"{wav_file_path}\|{name}\|{language}\|{transcription}\n"
	f.write(line)
	else:
	print("不存在对应音频!")
	return f"转写文本 {transcript_txt_file} 生成完成"


	if __name__ == "__main__":
	parser = argparse.ArgumentParser()
	parser.add_argument(
	"-f",
	"--filepath",
	required=True,
	help="path of your rawaudios, e.g. ./Data/xxx/audios/raw",
	)
	parser.add_argument("-l", "--language", default="ZH", help="language")
	parser.add_argument("-n", "--name", required=True, help="name of the character")
	parser.add_argument("-o", "--outfile", required=True, help="outfile")
	args = parser.parse_args()

	status_str = extract_list(args.filepath, args.language, args.name, args.outfile)
	logger.critical(status_str)