import gradio as gr
import urllib.request
import re
import whisper
import os
import requests

# Whisper speech-to-text models.
# "base" is loaded only for cheap language detection; "medium" does the
# actual transcription/translation for better quality.
model = whisper.load_model("base")
model_med = whisper.load_model("medium")


def whisper_stt(audio):
    """Transcribe and translate a recorded clip with Whisper.

    Args:
        audio: Filesystem path to the audio recorded by the Gradio
            microphone component (``type="filepath"``).

    Returns:
        Tuple of (transcript in the detected language,
        English translation, detected language code).
    """
    print("Inside Whisper TTS")
    # Load audio and pad/trim it to fit Whisper's 30-second window.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Make a log-Mel spectrogram and move it to the same device as the model.
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language with the small model.
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    print(f"Detected language: {lang}")

    # Decode twice with the medium model: once in the detected language
    # (transcript) and once translated to English, which drives the
    # YouTube search query downstream.
    options_transc = whisper.DecodingOptions(fp16=False, language=lang, task='transcribe')
    options_transl = whisper.DecodingOptions(fp16=False, language='en', task='translate')
    result_transc = whisper.decode(model_med, mel, options_transc)
    result_transl = whisper.decode(model_med, mel, options_transl)

    print(f"transcript is : {result_transc.text}")
    print(f"translation is : {result_transl.text}")
    return result_transc.text, result_transl.text, lang


def engine(audio):
    """End-to-end pipeline: voice query -> text -> YouTube search -> embeds.

    Returns the two HTML embeds plus the English translation shown in the
    transcript textbox (matches the order of the click handler's outputs).
    """
    # Get the voice query as text; the English translation is the search query.
    transcribe, translation, lang = whisper_stt(audio)
    # Search YouTube for the translated query.
    video_url, video_ids = get_youtube_video(translation)
    # Build HTML embeds for the first two results.
    html_out1, html_out2 = display_vid(video_ids)
    return html_out1, html_out2, translation


def get_youtube_video(query):
    """Scrape YouTube search results for *query*.

    Args:
        query: Free-text search query.

    Returns:
        Tuple of (URL of the first result, list of all result video ids).

    Raises:
        IndexError: when the results page yields no video ids.
    """
    search_term = '+'.join(query.split())
    # sp=mAEB filters the results page; e.g.
    # https://www.youtube.com/results?sp=mAEB&search_query=openai+whisper+model
    html = urllib.request.urlopen(
        "https://www.youtube.com/results?sp=mAEB&search_query=" + search_term)
    # Video ids are the 11-character tokens in "watch?v=..." links, e.g.
    # https://www.youtube.com/watch?v=HbY51mVKrcE
    video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode())
    video_url = "https://www.youtube.com/watch?v=" + video_ids[0]
    return video_url, video_ids


def display_vid(video_ids):
    """Build HTML iframe embeds for the first two YouTube results.

    NOTE(review): the original embed markup was lost in this copy of the
    file (the strings were reduced to ''); the iframes below are a standard
    YouTube embed reconstruction — confirm against the deployed Space.

    Args:
        video_ids: List of YouTube video ids, best match first.

    Returns:
        Tuple of two HTML strings (empty string when a result is missing).
    """
    print("******** display_vid ********")

    def _embed(vid):
        # Standard YouTube privacy-agnostic embed snippet.
        return (
            "<iframe width='560' height='315' "
            f"src='https://www.youtube.com/embed/{vid}' "
            "frameborder='0' allowfullscreen></iframe>"
        )

    # Guard against fewer than two search results.
    html_out1 = _embed(video_ids[0]) if len(video_ids) > 0 else ''
    html_out2 = _embed(video_ids[1]) if len(video_ids) > 1 else ''
    print(f"html output 1 is : {html_out1}")
    print(f"html output 2 is : {html_out2}")
    return html_out1, html_out2


# Not wired into the UI currently.
def set_example_question(sample_question):
    """Copy a sample question from the examples widget into the Radio input."""
    print("******* Inside Sample Questions ********")
    print(f"Sample Question coming from Radio box is : {sample_question}")
    # Bug fix: the original print was missing the f-prefix, so it printed
    # the literal "{gr.Radio.update(...)}" instead of the value.
    print(f"What is the Return value : {gr.Radio.update(value=sample_question)}")
    return gr.Radio.update(value=sample_question)
    # input_ques.update(example)


demo = gr.Blocks()

with demo:
    # NOTE(review): the heading/intro HTML tags were stripped in this copy
    # of the file; reconstructed with a plain centered heading — confirm
    # against the deployed Space.
    gr.Markdown("<h1><center>Voice to Youtube Search</center></h1>")
    gr.Markdown(
        """<center>Whisper powered voice search app to search and display Youtube Videos on Spaces.<br><br>
        Best Part you don't have to step away from Spaces !</center>
        """
    )
    with gr.Column():
        with gr.Row():
            in_audio = gr.Audio(
                source="microphone",
                type="filepath",
                label='Record your voice search query here in English, Spanish or French for best results-',
            )
            b1 = gr.Button("Whisper powered Youtube search")
        # Typo fix: "Specch" -> "Speech".
        out_textbox = gr.Textbox(label="Whisper Speech to Text transcript")
        with gr.Row():
            output_vid1 = gr.HTML(label="First Video Result from Youtube", show_label=True)
            output_vid2 = gr.HTML(label="Second Video Result from Youtube", show_label=True)
        b1.click(engine, inputs=[in_audio], outputs=[output_vid1, output_vid2, out_textbox])
        with gr.Row():
            gr.Markdown("![visitor badge](https://visitor-badge.glitch.me/badge?page_id=ysharma_Voice-to-Youtube)")

demo.launch(enable_queue=True, debug=True)