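# Transcribe every denoised audio file under ./denoised_audio/ with OpenAI Whisper,
# split each recording into per-segment wavs under ./segmented_character_voice/<character>/,
# and collect "path|speaker|[LANG]text[LANG]" annotation lines in ./long_character_anno.txt.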
import whisper
import os
import json
import torchaudio
import torch
import argparse

parent_dir = "./denoised_audio/"
# os.walk yields (dirpath, dirnames, filenames); [0][2] is the list of files in the top directory
filelist = list(os.walk(parent_dir))[0][2]

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--languages", default="CJE")
    parser.add_argument("--whisper_size", default="medium")
    args = parser.parse_args()
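
    # --languages picks which detected languages to keep (C = Chinese, J = Japanese, E = English);
    # --whisper_size picks the Whisper checkpoint (e.g. tiny/base/small/medium/large).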

    if args.languages == "CJE":
        lang2token = {
            'zh': "[ZH]",
            'ja': "[JA]",
            "en": "[EN]",
        }
    elif args.languages == "CJ":
        lang2token = {
            'zh': "[ZH]",
            'ja': "[JA]",
        }
    elif args.languages == "C":
        lang2token = {
            'zh': "[ZH]",
        }
    else:
        # without this branch, lang2token would be undefined for any other value
        raise ValueError(f"Unsupported --languages value: {args.languages}")

    assert torch.cuda.is_available(), "Please enable GPU in order to run Whisper!"
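
    # The [ZH]/[JA]/[EN] tags wrap every transcript line; the downstream fine-tuning
    # text pipeline is assumed to use them to pick the per-language cleaner.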

    with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
        hps = json.load(f)
    target_sr = hps['data']['sampling_rate']
    model = whisper.load_model(args.whisper_size)
    speaker_annos = []
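
    # whisper.load_model downloads the checkpoint on first use (cached under
    # ~/.cache/whisper by default); target_sr is the sampling rate the fine-tuning
    # config expects every saved segment to have.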

    for file in filelist:
        print(f"transcribing {parent_dir + file}...\n")
        options = dict(beam_size=5, best_of=5)
        transcribe_options = dict(task="transcribe", **options)
        result = model.transcribe(parent_dir + file, word_timestamps=True, **transcribe_options)
        segments = result["segments"]
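
        # result["language"] is Whisper's auto-detected language code;
        # result["segments"] is a list of dicts with "start"/"end" timestamps
        # (in seconds) and the transcribed "text".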

        lang = result['language']
        if lang not in lang2token:
            print(f"{lang} not supported, ignoring...\n")
            continue

        # Filenames are expected to look like "<character>_<code>.wav".
        # (str.rstrip(".wav") strips trailing '.', 'w', 'a', 'v' characters rather
        # than the suffix, so use os.path.splitext instead.)
        filename = os.path.splitext(file)[0]
        character_name = filename.split("_")[0]
        code = filename.split("_")[1]
        os.makedirs("./segmented_character_voice/" + character_name, exist_ok=True)
        # torchaudio.load returns (waveform [channels, frames], sample_rate);
        # normalize=True decodes to float32 samples in [-1.0, 1.0]
        wav, sr = torchaudio.load(parent_dir + file, frame_offset=0, num_frames=-1, normalize=True,
                                  channels_first=True)
        if sr != target_sr:
            # torchaudio.save below stamps files with target_sr but does not resample,
            # so convert here in case the denoised audio is at a different rate
            wav = torchaudio.functional.resample(wav, sr, target_sr)
            sr = target_sr
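
        # Cut one wav per Whisper segment; start/end are in seconds, so multiply
        # by the sampling rate to get frame indices.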
        for i, seg in enumerate(segments):
            start_time = seg['start']
            end_time = seg['end']
            text = seg['text']
            # wrap the transcript in language tags, e.g. "[ZH]...[ZH]"
            text = lang2token[lang] + text.replace("\n", "") + lang2token[lang]
            text = text + "\n"
            wav_seg = wav[:, int(start_time * sr):int(end_time * sr)]
            wav_seg_name = f"{character_name}_{code}_{i}.wav"
            savepth = "./segmented_character_voice/" + character_name + "/" + wav_seg_name
            # annotation line format: path|speaker|[LANG]text[LANG]
            speaker_annos.append(savepth + "|" + character_name + "|" + text)
            print(f"Transcribed segment: {speaker_annos[-1]}")
            torchaudio.save(savepth, wav_seg, target_sr, channels_first=True)

    if len(speaker_annos) == 0:
        print("Warning: no long audios or videos were found. This IS expected if you have only uploaded short audios;")
        print("it IS NOT expected if you have uploaded any long audios, videos or video links. Please check your file structure and make sure your audio/video language is supported.")
    with open("./long_character_anno.txt", 'w', encoding='utf-8') as f:
        for line in speaker_annos:
            # each annotation line already ends with "\n"
            f.write(line)
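
# Typical invocation (the script filename is assumed here):
#   python long_audio_transcribe.py --languages CJE --whisper_size medium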
|