import gradio as gr
import numpy as np
import pytubefix as pt
import os, time, librosa, torch
from pyannote.audio import Pipeline
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
import spaces


def second_to_timecode(x: float) -> str:
    """Convert a float number of seconds to SRT-style HH:MM:SS,mmm format."""
    hour, x = divmod(x, 3600)
    minute, x = divmod(x, 60)
    second, x = divmod(x, 1)
    millisecond = int(x * 1000.)

    return '%.2d:%.2d:%.2d,%.3d' % (hour, minute, second, millisecond)


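# Helper for fetching audio from a YouTube link with pytubefix.
# It is not referenced by the Gradio UI below, which works on uploaded files.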
def download_from_youtube(youtube_link: str) -> str:
    yt = pt.YouTube(youtube_link)
    available_streams = yt.streams.filter(only_audio=True)
    print('available streams:')
    print(available_streams)
    stream = available_streams.first()
    # Note: the downloaded stream is an audio container (typically mp4/webm),
    # not a real WAV file; librosa can usually still decode it from this path.
    stream.download(filename="audio.wav")
    return "audio.wav"


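# Model and inference settings: chunk_length_s is the ASR chunk size in
# seconds, and speaker turns longer than vad_activation_min_duration seconds
# are re-segmented with voice activity detection before transcription.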
MODEL_NAME = 'Dorjzodovsuren/whisper-large-v2-mn'
lang = 'mn'

chunk_length_s = 9
vad_activation_min_duration = 9
device = 0 if torch.cuda.is_available() else "cpu"
SAMPLE_RATE = 16_000

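# Pyannote speaker-diarization and voice-activity-detection pipelines,
# authenticated with the Hugging Face token from the TOKEN environment variable.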
dia_model = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=os.environ['TOKEN'])
vad_model = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=os.environ['TOKEN'])

# Move the pyannote pipelines to the GPU only when one is available.
if torch.cuda.is_available():
    dia_model = dia_model.to(torch.device('cuda'))
    vad_model = vad_model.to(torch.device('cuda'))

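# The fine-tuned Mongolian checkpoint ships its own processor; otherwise fall
# back to the openai/whisper-large-v3-turbo processor.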
if MODEL_NAME == 'Dorjzodovsuren/whisper-large-v2-mn':
    processor = AutoProcessor.from_pretrained(MODEL_NAME)
else:
    processor = AutoProcessor.from_pretrained("openai/whisper-large-v3-turbo")

model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)

asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=chunk_length_s,
    device_map="auto",
)

# Force Whisper to transcribe in Mongolian instead of auto-detecting the language.
asr_pipeline.model.config.forced_decoder_ids = asr_pipeline.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")

print("----------> Loaded models <-----------")

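# Stream an SRT transcript for an uploaded audio file: diarize the audio into
# speaker turns, re-segment long turns with VAD, transcribe each segment with
# Whisper, yield partial results so the UI updates while running, and finally
# write the full transcript to transcript.srt.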
gpu_timeout = int(os.getenv("GPU_TIMEOUT", 60))

@spaces.GPU(duration=gpu_timeout)
def generator(file_upload, num_speakers, max_duration, history):
    history = history or ""

    path = file_upload
    waveform, sampling_rate = librosa.load(path, sr=SAMPLE_RATE, mono=True, duration=max_duration)

    print(waveform.shape, sampling_rate)
    waveform_tensor = torch.unsqueeze(torch.tensor(waveform), 0).to(device)

    # gr.Number returns a float by default; pyannote expects an integer speaker count.
    dia_result = dia_model({
        "waveform": waveform_tensor,
        "sample_rate": sampling_rate,
    }, num_speakers=int(num_speakers))

    counter = 1

    for speech_turn, track, speaker in dia_result.itertracks(yield_label=True):
        print(f"{speech_turn.start:4.1f} {speech_turn.end:4.1f} {speaker}")
        _start = int(sampling_rate * speech_turn.start)
        _end = int(sampling_rate * speech_turn.end)
        data = waveform[_start: _end]

        if speech_turn.end - speech_turn.start > vad_activation_min_duration:
            # Long turns are split into shorter segments with VAD so each ASR call stays short.
            print(f'audio duration {speech_turn.end - speech_turn.start} sec ----> activating VAD')
            vad_output = vad_model({
                'waveform': waveform_tensor[:, _start:_end],
                'sample_rate': sampling_rate})
            for vad_turn in vad_output.get_timeline().support():
                vad_start = _start + int(sampling_rate * vad_turn.start)
                vad_end = _start + int(sampling_rate * vad_turn.end)
                prediction = asr_pipeline(waveform[vad_start: vad_end])['text']
                history += f"{counter}\n" + \
                           f"{second_to_timecode(speech_turn.start + vad_turn.start)} --> {second_to_timecode(speech_turn.start + vad_turn.end)}\n" + \
                           f"{prediction}\n\n"

                yield history, history, None
                counter += 1

        else:
            prediction = asr_pipeline(data)['text']
            history += f"{counter}\n" + \
                       f"{second_to_timecode(speech_turn.start)} --> {second_to_timecode(speech_turn.end)}\n" + \
                       f"{prediction}\n\n"

            counter += 1
            yield history, history, None

    # Write the accumulated transcript to an SRT file and expose it for download.
    file_name = 'transcript.srt'
    with open(file_name, 'w', encoding='utf-8') as fp:
        fp.write(history)

    yield history, history, file_name


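# Gradio UI (labels are in Mongolian). Changing the uploaded audio triggers the
# streaming transcription; partial text fills the textbox and the finished SRT
# file appears in the file output.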
with gr.Blocks(title="Mongolian Whisper 🇲🇳") as demo:
    with gr.Column():
        # "Дуу" = "Audio"
        audio_input = gr.Audio(type="filepath", label="Дуу")

        # "Нэмэлт тохиргоо" = "Additional settings"
        with gr.Accordion("Нэмэлт тохиргоо", open=False):
            # "How many speakers are there in total?"
            speakers_input = gr.Number(value=1, label="Яриж буй нийт хүний тоо хэд вэ?")
            # "What is the maximum audio length (seconds)?"
            duration_input = gr.Slider(
                minimum=0,
                maximum=300,
                step=1,
                value=120,
                label="Дууны хамгийн урт хэмжээ (Seconds) хэд вэ?"
            )
        state_input = gr.State()

        # "Текст хөрвүүлгэ" = "Transcription"
        text_output = gr.Textbox(label="Текст хөрвүүлгэ")
        state_output = gr.State()
        # "Үр дүнгийн файл" = "Result file"
        file_output = gr.File(label="Үр дүнгийн файл")

    audio_input.change(
        fn=generator,
        inputs=[audio_input, speakers_input, duration_input, state_input],
        outputs=[text_output, state_output, file_output],
    )

demo.queue()
demo.launch(debug=True)