import spaces
import os
import gc
from functools import partial
import gradio as gr
import torch
from speechbrain.inference.interfaces import Pretrained, foreign_class
from transformers import T5Tokenizer, T5ForConditionalGeneration
import librosa
import whisper_timestamped as whisper
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, Wav2Vec2ForCTC, AutoProcessor
# Pick the compute device once at import time: CUDA when PyTorch reports it
# as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Allow TF32 in CUDA matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True
def clean_up_memory():
    """Run the garbage collector and release cached CUDA memory.

    NOTE(review): the body of this function was missing from the file (a bare
    ``def`` line, which is a syntax error). The implementation below is the
    conventional one suggested by the function name and by the otherwise
    unused ``gc`` import -- confirm it matches the intended behaviour.
    """
    gc.collect()
    # empty_cache() is only meaningful when a CUDA device is present.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
def recap_sentence(string):
    """Run the recapitalization model on *string* and return the decoded text.

    The input is prefixed with the task prompt, tokenized, moved to the
    module-level ``device`` and passed to ``recap_model.generate`` with beam
    search (5 beams, at most 768 tokens).
    """
    prompt = "restore capitalization and punctuation: " + string
    encoded = recap_tokenizer([prompt], return_tensors="pt", padding=True).to(device)
    generated = recap_model.generate(
        **encoded,
        max_length=768,
        num_beams=5,
        early_stopping=True,
    )
    # A single prompt was submitted, so drop the leading batch dimension.
    token_ids = generated.squeeze(0)
    return recap_tokenizer.decode(token_ids, skip_special_tokens=True)
def return_prediction_w2v2(mic=None, file=None, device=device):
    """Transcribe a recording with the wav2vec2 classifier and restore casing.

    Args:
        mic: Audio source loadable by ``librosa.load`` (the Gradio inputs in
            this file use ``type="filepath"``), or ``None``.
        file: Alternative audio source, used only when ``mic`` is ``None``.
        device: Device handed to ``w2v2_classifier.classify_file_w2v2``.

    Returns:
        The recapitalized transcription, or an error message when neither
        ``mic`` nor ``file`` is provided.
    """
    # Both sources are processed identically; mic takes precedence.
    audio_source = mic if mic is not None else file
    if audio_source is None:
        # Guarded explicitly: an unconditional return here would make the
        # transcription post-processing below unreachable.
        return "You must either provide a mic recording or a file"

    waveform, sr = librosa.load(audio_source, sr=16000)
    waveform = waveform[:60 * sr]  # keep at most the first 60 seconds
    w2v2_result = w2v2_classifier.classify_file_w2v2(waveform, device)

    recap_result = recap_sentence(w2v2_result[0])
    # Upper-case any lowercase letter that sits two characters after a
    # sentence-ending mark (mark, one character, then the letter).
    for i, letter in enumerate(recap_result):
        if i > 1 and recap_result[i - 2] in (".", "!", "?") and letter.islower():
            recap_result = recap_result[:i] + letter.upper() + recap_result[i + 1:]
    return recap_result
def return_prediction_whisper_mic(mic=None, device=device):
    """Transcribe a microphone recording with the Whisper classifier.

    Args:
        mic: Audio source loadable by ``librosa.load`` (the Gradio inputs in
            this file use ``type="filepath"``), or ``None``.
        device: Device handed to ``whisper_classifier.classify_file_whisper_mkd``.

    Returns:
        The recapitalized transcription, or an error message when ``mic`` is
        ``None``.
    """
    if mic is None:
        # Guarded explicitly: an unconditional return here would make the
        # transcription post-processing below unreachable.
        return "You must provide a mic recording"

    waveform, sr = librosa.load(mic, sr=16000)
    waveform = waveform[:30 * sr]  # keep at most the first 30 seconds
    whisper_result = whisper_classifier.classify_file_whisper_mkd(waveform, device)

    recap_result = recap_sentence(whisper_result[0])
    # Upper-case any lowercase letter that sits two characters after a
    # sentence-ending mark (mark, one character, then the letter).
    for i, letter in enumerate(recap_result):
        if i > 1 and recap_result[i - 2] in (".", "!", "?") and letter.islower():
            recap_result = recap_result[:i] + letter.upper() + recap_result[i + 1:]
    return recap_result
def return_prediction_whisper_file(file=None, device=device):
    """Stream the transcription of an audio file, one segment at a time.

    This is a generator: after each segment produced by
    ``whisper_classifier.classify_file_whisper_mkd_streaming`` it yields the
    whole transcription accumulated so far.

    Args:
        file: Audio source loadable by ``librosa.load`` (the Gradio inputs in
            this file use ``type="filepath"``), or ``None``.
        device: Device handed to the classifier.

    Yields:
        The accumulated, recapitalized transcription, or a single error
        message when ``file`` is ``None``.
    """
    if file is None:
        yield "You must provide a file"
        return

    waveform, sr = librosa.load(file, sr=16000)
    waveform = waveform[:3600 * sr]  # keep at most the first hour
    whisper_result = whisper_classifier.classify_file_whisper_mkd_streaming(waveform, device)

    recap_result = ""
    prev_segment = ""
    for segment in whisper_result:
        if prev_segment == "":
            # First segment: nothing to use as left context.
            recap_segment = recap_sentence(segment[0])
        else:
            # Recapitalize with the previous segment prepended as context,
            # then drop as many leading words as the previous segment had so
            # that only the new segment's words are kept.
            prev_segment_len = len(prev_segment.split())
            recap_words = recap_sentence(prev_segment + " " + segment[0]).split()
            recap_segment = " ".join(recap_words[prev_segment_len:])
        prev_segment = segment[0]
        recap_result += recap_segment + " "

        # Upper-case any lowercase letter that sits two characters after a
        # sentence-ending mark. The whole accumulated text is rescanned so
        # that letters following a mark at a segment boundary are covered.
        for i, letter in enumerate(recap_result):
            if i > 1 and recap_result[i - 2] in (".", "!", "?") and letter.islower():
                recap_result = recap_result[:i] + letter.upper() + recap_result[i + 1:]
        yield recap_result
# Callables with the module-level ``device`` bound explicitly as the
# ``device`` keyword argument of each prediction function.
return_prediction_w2v2_with_device = partial(return_prediction_w2v2, device=device)
return_prediction_whisper_mic_with_device = partial(return_prediction_whisper_mic, device=device)
return_prediction_whisper_file_with_device = partial(return_prediction_whisper_file, device=device)
# Load the ASR models
# NOTE(review): ``pymodule_file`` is an empty string here; it looks like the
# name of the custom interface module was lost -- confirm the intended value.
# NOTE(review): the right-hand sides of the two re-assignments below were
# missing (a syntax error); moving each classifier to ``device`` is the
# presumed intent -- confirm.
whisper_classifier = foreign_class(source="Macedonian-ASR/whisper-large-v3-macedonian-asr", pymodule_file="", classname="ASR")
whisper_classifier = whisper_classifier.to(device)
w2v2_classifier = foreign_class(source="Macedonian-ASR/wav2vec2-aed-macedonian-asr", pymodule_file="", classname="ASR")
w2v2_classifier = w2v2_classifier.to(device)
# Load the T5 tokenizer and model
recap_model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
recap_tokenizer = T5Tokenizer.from_pretrained(recap_model_name)
recap_model = T5ForConditionalGeneration.from_pretrained(recap_model_name, torch_dtype=torch.float16)
# recap_sentence() moves its tokenized inputs to ``device``; the model has to
# live on the same device or generate() fails with a device mismatch.
recap_model = recap_model.to(device)
# Interface definitions
# NOTE(review): only the ``inputs=`` lines of these four calls were present;
# the calls were unclosed and had no ``fn``/``outputs``. ``fn`` is taken from
# the matching ``*_with_device`` callables defined in this file and
# ``outputs`` is a plain text box -- confirm against the intended UI
# (additional options may have been lost).
mic_transcribe_whisper = gr.Interface(
    fn=return_prediction_whisper_mic_with_device,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
)
file_transcribe_whisper = gr.Interface(
    fn=return_prediction_whisper_file_with_device,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs="text",
)
# The single audio input of both wav2vec2 interfaces is passed as the first
# positional parameter (``mic``) of return_prediction_w2v2, which treats
# ``mic`` and ``file`` identically.
mic_transcribe_w2v2 = gr.Interface(
    fn=return_prediction_w2v2_with_device,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
)
file_transcribe_w2v2 = gr.Interface(
    fn=return_prediction_w2v2_with_device,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs="text",
)
# Markdown/HTML shown at the top of the app.
# NOTE(review): the image ``src`` and the markdown link targets in this text
# are empty or truncated; the original URLs are not recoverable from this
# file and the text is left untouched. Only the missing closing quotes were
# added.
project_description = '''
<img src=""
alt="Bookie logo"
style="float: right; width: 130px; height: 110px; margin-left: 10px;" />
## Автори:
1. **Дејан Порјазовски**
2. **Илина Јакимовска**
3. **Ордан Чукалиев**
4. **Никола Стиков**
Оваа колаборација е дел од активностите на **Центарот за напредни интердисциплинарни истражувања ([ЦеНИИс](** при УКИМ.
## Во тренирањето на овој модел се употребени податоци од:
1. Дигитален архив за етнолошки и антрополошки ресурси ([ДАЕАР]( при Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
2. Аудио верзија на меѓународното списание [„ЕтноАнтропоЗум"]( на Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
3. Аудио подкастот [„Обични луѓе"]( на Илина Јакимовска
4. Научните видеа од серијалот [„Наука за деца"](, фондација [КАНТАРОТ](
5. Македонска верзија на [Mozilla Common Voice]( (верзија 18.0)
## Како да придонесете за подобрување на македонските модели за препознавање на говор?
На следниот [линк]( ќе најдете инструкции за тоа како да донирате македонски говор преку платформата Mozilla Common Voice.
'''
# Custom CSS
# Each rule block is closed explicitly and the string is terminated; without
# the closing braces the rules would nest into each other and be invalid CSS.
css = """
.gradio-container {
    background-color: #f0f0f0;
}
.custom-markdown p, .custom-markdown li, .custom-markdown h2, .custom-markdown a {
    font-size: 15px !important;
    font-family: Arial, sans-serif !important;
}
.gradio-container {
    background-color: #f3f3f3 !important;
}
"""
# Top-level app: the project description followed by one tab per interface.
transcriber_app = gr.Blocks(css=css, delete_cache=(60, 120))
with transcriber_app:
    state = gr.State()
    gr.Markdown(project_description, elem_classes="custom-markdown")
    # The interface list and the tab titles must be passed to
    # gr.TabbedInterface; as bare expressions they would be evaluated and
    # discarded, and no tabs would be rendered.
    gr.TabbedInterface(
        [mic_transcribe_whisper, file_transcribe_whisper, mic_transcribe_w2v2, file_transcribe_w2v2],
        ["Буки-Whisper микрофон", "Буки-Whisper датотека", "Буки-Wav2vec2 микрофон", "Буки-Wav2vec2 датотека"],
    )
    # Rebinds ``state`` to a second State component whose delete callback
    # prints a message.
    state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))
# Start the Gradio server (with a public share link) when run as a script.
if __name__ == "__main__":
    transcriber_app.launch(share=True)