# Sambert-UI / utils_label.py
# kevinwang676's picture
# Upload 4 files
# 39b6e8b
# Data-labelling utilities for Sambert training
import os
import shutil
import uuid
import librosa
import gradio as gr
from scipy.io import wavfile
import numpy as np
import whisper
from modelscope.tools import run_auto_label
from utils_base import ensure_empty_dir, datasets_dir, get_dataset_list
# Absolute-path helper
# Directory that contains this module; relative paths are anchored here.
curPath = os.path.dirname(os.path.abspath(__file__))


def getAbsPath(relativePath):
    """Return the normalised absolute path of ``relativePath``.

    The path is resolved against ``curPath`` (the directory holding this
    module), not against the process working directory.
    """
    anchored = os.path.join(curPath, relativePath)
    absolute = os.path.abspath(anchored)
    return os.path.normpath(absolute)
# Load the Whisper model once at import time: use the local checkpoint when
# it exists on disk, otherwise fall back to the model named 'medium'.
model_path = getAbsPath('../../models/whisper/medium.pt')
whisper_model = whisper.load_model(
    model_path if os.path.exists(model_path) else 'medium'
)
# Whisper-based audio splitting ----------------------------------------------
def split_long_audio(model, filepaths, save_path, out_sr=44100):
    """Split long recordings into one WAV clip per transcription segment.

    Every input file is transcribed with ``model.transcribe``. The audio is
    then loaded as mono, trimmed, resampled to ``out_sr``, peak-normalised
    and cut at the ``start``/``end`` times (seconds) of each returned segment.
    Clips are written as 16-bit WAV files named
    ``{file_index}_{segment_index}.wav`` inside ``save_path``.

    Args:
        model: Object with a Whisper-style ``transcribe`` method whose result
            contains a ``'segments'`` list of dicts with ``'start'``/``'end'``.
        filepaths: A single audio path or a list of audio paths.
        save_path: Output directory; created if it does not exist.
        out_sr: Sample rate of the written clips.
    """
    # Accept a single path as well as a list of paths.
    if isinstance(filepaths, str):
        filepaths = [filepaths]
    if not filepaths:
        return

    os.makedirs(save_path, exist_ok=True)
    int16_max = np.iinfo(np.int16).max

    for file_idx, filepath in enumerate(filepaths):
        print(f"Transcribing file {file_idx}: '{filepath}' to segments...")
        result = model.transcribe(filepath, word_timestamps=True, task="transcribe", beam_size=5, best_of=5)
        segments = result['segments']

        # Load at the native sample rate, then cut with librosa + scipy.
        wav, sr = librosa.load(filepath, sr=None, offset=0, duration=None, mono=True)
        trim_start = 0
        if wav.size:
            # Keep the trim interval: the segment timestamps refer to the
            # untrimmed file, so the removed leading samples must be
            # subtracted from every boundary below.
            wav, (trim_start, _) = librosa.effects.trim(wav, top_db=20)
        if not wav.size:
            print(f"Skipping file {file_idx}: '{filepath}' contains no audio samples")
            continue
        lead_offset = trim_start / sr  # seconds removed from the front

        peak = np.abs(wav).max()
        if peak > 1.0:
            wav = 0.98 * wav / peak
        wav2 = librosa.resample(wav, orig_sr=sr, target_sr=out_sr)
        # Normalise to full scale; an all-zero signal is left untouched so
        # it does not turn into NaN through a division by zero.
        peak2 = np.abs(wav2).max()
        if peak2 > 0:
            wav2 = wav2 / peak2

        # Write each segment as its own short clip in the target directory.
        for i, seg in enumerate(segments):
            # Clamp at 0: a negative index would wrap around to the end.
            start = max(0, int((seg['start'] - lead_offset) * out_sr))
            end = max(0, int((seg['end'] - lead_offset) * out_sr))
            wav_seg = wav2[start:end]
            if wav_seg.size == 0:
                # Nothing to write; avoid producing header-only WAV files.
                continue
            wav_seg_name = f"{file_idx}_{i}.wav"
            out_fpath = os.path.join(save_path, wav_seg_name)
            wavfile.write(out_fpath, rate=out_sr, data=(wav_seg * int16_max).astype(np.int16))
# Automatic labelling and packaging of the labelled files --------------------
def auto_label(audio, name):
    """Split ``audio`` into clips and auto-label them as dataset ``name``.

    Args:
        audio: Audio path (or list of paths) passed on to ``split_long_audio``.
        name: Dataset name; the output directory is ``datasets_dir/name``.

    Returns:
        A ``(message, update)`` tuple. ``message`` is ``''`` when an input is
        missing, an error text when ``name`` does not resolve to a directory
        inside ``datasets_dir``, and the success text otherwise. ``update``
        is a ``gr.update`` carrying the current ``get_dataset_list()``.
        Exceptions raised while splitting or labelling propagate to the caller.
    """
    if not audio or not name:
        return '', gr.update(choices=get_dataset_list())

    # The dataset directory is emptied below, so refuse names such as '.',
    # '..' or absolute paths that resolve to datasets_dir itself or outside it.
    datasets_root = os.path.abspath(datasets_dir)
    resolved_dir = os.path.abspath(os.path.join(datasets_root, name))
    inside_root = resolved_dir.startswith(os.path.join(datasets_root, ''))
    if resolved_dir == datasets_root or not inside_root:
        return '数据集名称无效', gr.update(choices=get_dataset_list())

    # Unique temporary directory that receives the split clips.
    input_wav = getAbsPath(f'./temp/input-{uuid.uuid4()}')
    work_dir = os.path.join(datasets_dir, name)
    try:
        ensure_empty_dir(input_wav)
        ensure_empty_dir(work_dir)
        # Split the audio into clips.
        split_long_audio(whisper_model, audio, input_wav)
        # Auto-label the clips (the original note says the required model is
        # downloaded automatically on first use).
        run_auto_label(
            input_wav=input_wav,
            work_dir=work_dir,
            resource_revision='v1.0.7'
        )
    finally:
        # Always remove the temporary clips, including when a step fails.
        shutil.rmtree(input_wav, ignore_errors=True)

    return '打标成功', gr.update(choices=get_dataset_list())
# Delete a dataset -----------------------------------------------------------
def delete_dataset(name):
    """Delete the dataset directory ``datasets_dir/name``.

    Deletion is best-effort: failures are printed, not raised.

    Args:
        name: Name of the dataset to delete.

    Returns:
        A ``gr.update`` carrying the current ``get_dataset_list()``. The
        selected value is reset to ``None`` unless ``name`` was empty.
    """
    if not name:
        return gr.update(choices=get_dataset_list())

    try:
        # Only delete directories strictly inside datasets_dir: names such as
        # '.' or '..' would otherwise remove the whole datasets directory or
        # something outside of it.
        datasets_root = os.path.abspath(datasets_dir)
        resolved_dir = os.path.abspath(os.path.join(datasets_root, name))
        inside_root = resolved_dir.startswith(os.path.join(datasets_root, ''))
        if inside_root and resolved_dir != datasets_root:
            shutil.rmtree(os.path.join(datasets_dir, name))
        else:
            print(f"Refusing to delete dataset outside the datasets directory: {name!r}")
    except Exception as err:
        # UI callback boundary: report the failure instead of hiding it, but
        # still return a refreshed list so the interface stays usable.
        print(f"Failed to delete dataset {name!r}: {err}")

    return gr.update(choices=get_dataset_list(), value=None)