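# Transcribe short audio clips with OpenAI Whisper and write one annotation line
# per clip to the configured transcription file.
# Example invocation (script filename assumed):
#   python short_audio_transcribe.py --languages CJE --whisper_size medium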
import whisper
import os
import json
import torchaudio
import argparse
import torch
from config import config
import yaml

with open('config.yml', mode="r", encoding="utf-8") as f:
    configyml = yaml.load(f, Loader=yaml.FullLoader)
model_name = configyml["dataset_path"].replace("Data/", "")

# Map Whisper language codes to the dataset's language tags.
lang2token = {
    'zh': "ZH|",
    'ja': "JP|",
    "en": "EN|",
}


def transcribe_one(audio_path):
    # `model` is the Whisper model loaded in the __main__ block below (module-level global)
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)
    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")
    lang = max(probs, key=probs.get)
    # decode the audio
    options = whisper.DecodingOptions(beam_size=5)
    result = whisper.decode(model, mel, options)
    # print the recognized text
    print(result.text)
    return lang, result.text
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--languages", default="CJ")
    parser.add_argument("--whisper_size", default="medium")
    args = parser.parse_args()
    if args.languages == "CJE":
        lang2token = {
            'zh': "ZH|",
            'ja': "JP|",
            "en": "EN|",
        }
    elif args.languages == "CJ":
        lang2token = {
            'zh': "ZH|",
            'ja': "JP|",
        }
    elif args.languages == "C":
        lang2token = {
            'zh': "ZH|",
        }
    assert torch.cuda.is_available(), "Please enable GPU in order to run Whisper!"
    model = whisper.load_model(args.whisper_size)
    # parent_dir = "./custom_character_voice/"
    parent_dir = config.resample_config.in_dir
    parent_dir = parent_dir.replace("/audios", "")
    print(parent_dir)
    speaker = model_name
    speaker_annos = []
    total_files = sum([len(files) for r, d, files in os.walk(parent_dir)])
    # resample audios
    # 2023/4/21: Get the target sampling rate
    with open(config.train_ms_config.config_path, 'r', encoding='utf-8') as f:
        hps = json.load(f)
    target_sr = hps['data']['sampling_rate']
    processed_files = 0
    for i, wavfile in enumerate(list(os.walk(parent_dir))[0][2]):
        # try to load file as audio
        # if wavfile.startswith("processed_"):
        #     continue
        try:
            # wav, sr = torchaudio.load(parent_dir + "/" + speaker + "/" + wavfile, frame_offset=0,
            #                           num_frames=-1, normalize=True, channels_first=True)
            # wav = wav.mean(dim=0).unsqueeze(0)
            # if sr != target_sr:
            #     wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)
            # if wav.shape[1] / sr > 20:
            #     print(f"{wavfile} too long, ignoring\n")
            # save_path = parent_dir + "/" + speaker + "/" + f"ada_{i}.wav"
            # torchaudio.save(save_path, wav, target_sr, channels_first=True)
            # transcribe text
            lang, text = transcribe_one(f"./Data/{speaker}/raw/{wavfile}")
            if lang not in lang2token:
                print(f"{lang} not supported, ignoring\n")
                continue
            # text = "ZH|" + text + "\n"
            text = f"./Data/{model_name}/wavs/{wavfile}|{model_name}|{lang2token[lang]}{text}\n"
            speaker_annos.append(text)
            processed_files += 1
            print(f"Processed: {processed_files}/{total_files}")
        except Exception as e:
            print(e)
            continue
    # # clean annotation
    # import argparse
    # import text
    # from utils import load_filepaths_and_text
    # for i, line in enumerate(speaker_annos):
    #     path, sid, txt = line.split("|")
    #     cleaned_text = text._clean_text(txt, ["cjke_cleaners2"])
    #     cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
    #     speaker_annos[i] = path + "|" + sid + "|" + cleaned_text
    # write into annotation
    if len(speaker_annos) == 0:
        print("Warning: no short audios found. This IS expected if you have only uploaded long audios, videos, or video links.")
        print("This IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")
    with open(config.preprocess_text_config.transcription_path, 'w', encoding='utf-8') as f:
        for line in speaker_annos:
            f.write(line)
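    # Each annotation line written above has the form:
    #   ./Data/<model_name>/wavs/<wavfile>|<model_name>|<LANG>|<transcribed text>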