# import gradio as gr
# gr.Interface.load("models/rohitp1/kkkh_whisper_small_distillation_att_loss_libri360_epochs_100_batch_4_concat_dataset").launch()

import gradio as gr
import os
import time

import torch
from transformers import pipeline, Wav2Vec2ForCTC, Wav2Vec2Processor

# def greet_from_secret(ignored_param):
#     name = os.environ.get('TOKEN')
#     return

# Read the Hugging Face access token from the Space's TOKEN secret.
auth_token = os.environ.get('TOKEN')

# Checkpoints: distilled student (M1), noisy-finetuned teacher (M2), clean-finetuned teacher (M3).
M1 = "rohitp1/dgx1_w2v2_base_teacher_student_distillation_mozilla_epochs_100_batch_16_concatenate_datasets"
M2 = "rohitp1/finetune_teacher_babble_noise_mozilla_200_epochs"
M3 = "rohitp1/finetune_teacher_clean_mozilla_200_epochs"

model1 = Wav2Vec2ForCTC.from_pretrained(M1, use_auth_token=auth_token)
processor1 = Wav2Vec2Processor.from_pretrained(M1, use_auth_token=auth_token)

model2 = Wav2Vec2ForCTC.from_pretrained(M2, use_auth_token=auth_token)
processor2 = Wav2Vec2Processor.from_pretrained(M2, use_auth_token=auth_token)

model3 = Wav2Vec2ForCTC.from_pretrained(M3, use_auth_token=auth_token)
processor3 = Wav2Vec2Processor.from_pretrained(M3, use_auth_token=auth_token)

# Make a quantized model: int8 dynamic quantization of the linear layers of model3.
quantized_model1 = torch.quantization.quantize_dynamic(
    model3, {torch.nn.Linear}, dtype=torch.qint8
)

# The ASR pipeline takes a tokenizer and a feature extractor rather than a combined
# processor, so unpack each Wav2Vec2Processor accordingly.
p1 = pipeline(
    'automatic-speech-recognition',
    model=model1,
    tokenizer=processor1.tokenizer,
    feature_extractor=processor1.feature_extractor,
)
p2 = pipeline(
    'automatic-speech-recognition',
    model=model2,
    tokenizer=processor2.tokenizer,
    feature_extractor=processor2.feature_extractor,
)
p3 = pipeline(
    'automatic-speech-recognition',
    model=model3,
    tokenizer=processor3.tokenizer,
    feature_extractor=processor3.feature_extractor,
)
p1_quant = pipeline(
    'automatic-speech-recognition',
    model=quantized_model1,
    tokenizer=processor1.tokenizer,
    feature_extractor=processor1.feature_extractor,
)


def transcribe(mic_input, upl_input, model_type):
    """Transcribe microphone or uploaded audio with the selected model."""
    # Prefer the microphone recording if one was provided.
    audio = mic_input if mic_input else upl_input
    time.sleep(3)
    st_time = time.time()
    if model_type == 'NoisyFinetuned':
        text = p2(audio)["text"]
    elif model_type == 'CleanFinetuned':
        text = p3(audio)["text"]
    elif model_type == 'DistilledQuantised':
        text = p1_quant(audio)["text"]
    else:  # default: distilled student model (M1)
        text = p1(audio)["text"]
    end_time = time.time()
    # state = text + " "
    # Inference time in minutes, rounded to four decimal places.
    time_taken = round((end_time - st_time) / 60, 4)
    return text, time_taken


# gr.Interface(
#     fn=transcribe,
#     inputs=[gr.inputs.Audio(source="microphone", type="filepath"), 'state'],
#     outputs=["textbox", "state"],
#     live=False).launch()

# demo = gr.load(
#     "huggingface/rohitp1/kkkh_whisper_small_distillation_att_loss_libri360_epochs_100_batch_4_concat_dataset",
#     title="Speech-to-text",
#     inputs="mic",
#     description="Let me try to guess what you're saying!",
#     api_key=auth_token,  # never commit a hard-coded token; read it from the TOKEN secret
# )
# demo.launch()


def clear_inputs_and_outputs():
    """Reset both audio inputs, the model selector, and both outputs."""
    return [None, None, "CleanFinetuned", None, None]


# Main entry point: build the Gradio Blocks UI.
if __name__ == "__main__":
    demo = gr.Blocks()
    with demo:
        gr.Markdown(
            """