import torch
# os.system("pip install git+https://github.com/openai/whisper.git")
import gradio as gr
import whisper
import librosa
import plotly.express as px
from threading import Thread
from statistics import mode, mean
import time

model = whisper.load_model("medium", device='cpu')
print('loaded whisper')

vad, vad_utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                model='silero_vad',
                                force_reload=False,
                                onnx=False)
print('loaded silero')
(get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = vad_utils
vad_iterator = VADIterator(vad)

global x, y, j, audio_vec, transcribe, STOP, languages, not_detected, main_lang, STARTED
x = []
y = []
j = 0
STOP = False
audio_vec = torch.tensor([])
transcribe = ''
languages = []
not_detected = True
main_lang = ''
STARTED = False

css = """
.gradio-container { font-family: 'IBM Plex Sans', sans-serif; }
.gr-button { color: white; border-color: black; background: black; }
input[type='range'] { accent-color: black; }
.dark input[type='range'] { accent-color: #dfdfdf; }
.container { max-width: 730px; margin: auto; padding-top: 1.5rem; }
.details:hover { text-decoration: underline; }
.gr-button { white-space: nowrap; }
.gr-button:focus {
    border-color: rgb(147 197 253 / var(--tw-border-opacity));
    outline: none;
    box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
    --tw-border-opacity: 1;
    --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
    --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color);
    --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
    --tw-ring-opacity: .5;
}
.footer { margin-bottom: 45px; margin-top: 35px; text-align: center; border-bottom: 1px solid #e5e5e5; }
.footer>p { font-size: .8rem; display: inline-block; padding: 0 10px; transform: translateY(10px); background: white; }
.dark .footer { border-color: #303030; }
.dark .footer>p { background: #0b0f19; }
.prompt h4 { margin: 1.25em 0 .25em 0; font-weight: bold; font-size: 115%; }
.animate-spin { animation: spin 1s linear infinite; }
@keyframes spin {
    from { transform: rotate(0deg); }
    to { transform: rotate(360deg); }
}
#share-btn-container {
    display: flex;
    margin-top: 1.5rem !important;
    padding-left: 0.5rem !important;
    padding-right: 0.5rem !important;
    background-color: #000000;
    justify-content: center;
    align-items: center;
    border-radius: 9999px !important;
    width: 13rem;
}
#share-btn {
    all: initial;
    color: #ffffff;
    font-weight: 600;
    cursor: pointer;
    font-family: 'IBM Plex Sans', sans-serif;
    margin-left: 0.5rem !important;
    padding-top: 0.25rem !important;
    padding-bottom: 0.25rem !important;
}
#share-btn * { all: unset; }
"""


# def transcribe_chunk():
#     print('********************************')
#     global audio_vec, transcribe, STOP
#     print('Enter trans chunk')
#     counter = 0
#     i = 0
#     while not STOP:
#         if audio_vec.size()[0] // 32000 > counter and audio_vec.size()[0] > 0:
#             print('audio_vec.size()[0] % 32000', audio_vec.size()[0] % 32000)
#             print('audio size', audio_vec.size()[0])
#             chunk = whisper.pad_or_trim(audio_vec[32000 * counter: 32000 * (counter + 1)])
#             mel_th = whisper.log_mel_spectrogram(chunk).to(model.device)
#             options = whisper.DecodingOptions(fp16=False)
#             result = whisper.decode(model, mel_th, options)
#             no_speech_prob = result.no_speech_prob
#             if no_speech_prob < 0.4:
#                 transcribe += result.text + ' '
#             counter += 1


def transcribe_chunk(audio, vad_prob):
    global languages
    trnscrb = ''
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    options = whisper.DecodingOptions(fp16=False, task='transcribe')
    result = whisper.decode(model, mel, options)
    no_speech_prob = result.no_speech_prob
    # Reuse the same mel spectrogram to detect the chunk's language.
    _, probs = model.detect_language(mel)
    temp_lang = max(probs, key=probs.get)
    print(result.text, "no_speech_prob: ", no_speech_prob, 1 - vad_prob)
    if no_speech_prob < 0.6:
        trnscrb = result.text + ' '
        languages.append(temp_lang)
        # Keep only the three most recent language guesses.
        if len(languages) > 3:
            languages.pop(0)
    return trnscrb


def inference(audio):
    # Streaming callback that plots VAD probabilities and detects the language.
    # Note: this function is not wired to any UI event below; inference_file is
    # used for streaming, recording and file upload alike.
    global x, y, j, audio_vec, transcribe, languages, not_detected, main_lang, STARTED
    print('enter inference')
    if j == 0:
        thread.start()
        STARTED = True
    wav2 = whisper.load_audio(audio, sr=16000)
    wav = torch.from_numpy(librosa.load(audio, sr=16000)[0])
    audio_vec = torch.cat((audio_vec, wav))
    speech_probs = []
    window_size_samples = 1600  # 100 ms windows at 16 kHz
    for i in range(0, len(wav), window_size_samples):
        chunk = wav[i: i + window_size_samples]
        if len(chunk) < window_size_samples:
            break
        speech_prob = vad(chunk, 16000).item()
        speech_probs.append(speech_prob)
    vad_iterator.reset_states()
    sample_per_sec = 16000 / window_size_samples
    x.extend([j + i / sample_per_sec for i in range(len(speech_probs))])
    y.extend(speech_probs)
    j = max(x)
    fig = px.line(x=x, y=y)
    whisper_audio = whisper.pad_or_trim(wav2)
    mel = whisper.log_mel_spectrogram(whisper_audio).to(model.device)
    _, probs = model.detect_language(mel)
    temp_lang = max(probs, key=probs.get)
    print(temp_lang)
    languages.append(temp_lang)
    if len(languages) > 5:
        languages.pop(0)
    curr_lang = mode(languages)
    print(curr_lang, languages)
    if curr_lang == 'iw':
        # Whisper reports Hebrew as 'iw'; normalize to 'he'.
        return 'he', fig, gr.update(visible=True), transcribe, gr.update(visible=True), gr.update(visible=True)
    return curr_lang, fig, gr.update(visible=True), transcribe, gr.update(visible=True), gr.update(visible=True)


def clear():
    # Reset all shared state and hide the result widgets.
    global x, y, j, audio_vec, transcribe, thread, STOP, languages, main_lang, not_detected, STARTED
    STOP = True
    if STARTED:
        thread.join()
        STARTED = False
    x = []
    y = []
    j = 0
    audio_vec = torch.tensor([])
    transcribe = ''
    STOP = False
    languages = []
    main_lang = ''
    not_detected = True
    # Recreate the background thread object (only started by the unused `inference` callback).
    thread = Thread(target=transcribe_chunk)
    print('clean:', x, y, j, transcribe, audio_vec)
    return '', gr.update(visible=False), gr.update(visible=False), '', gr.update(visible=False), gr.update(visible=False)


def inference_file(audio):
    # Run VAD over the new audio, extend the speech-probability plot, and
    # transcribe the waveform in 30-second chunks.
    global x, y, j, audio_vec, transcribe, languages, not_detected, main_lang
    time.sleep(0.8)
    wav = torch.from_numpy(librosa.load(audio, sr=16000)[0])
    audio_vec = torch.cat((audio_vec, wav))
    speech_probs = []
    window_size_samples = 1600  # 100 ms windows at 16 kHz
    for i in range(0, len(wav), window_size_samples):
        chunk = wav[i: i + window_size_samples]
        if len(chunk) < window_size_samples:
            break
        speech_prob = vad(chunk, 16000).item()
        speech_probs.append(speech_prob)
    vad_iterator.reset_states()
    sample_per_sec = 16000 / window_size_samples
    x.extend([j + i / sample_per_sec for i in range(len(speech_probs))])
    y.extend(speech_probs)
    j = max(x)
    fig = px.line(x=x, y=y)
    mean_speech_probs = mean(speech_probs)
    if wav.shape[0] > 16000 * 30:
        start = 0
        end = 16000 * 30
        chunk = wav[start:end]
        chunk_idx = 0
        while end < wav.shape[0]:
            transcribe += transcribe_chunk(chunk, mean_speech_probs)
            chunk_idx += 1
            start = chunk_idx * 30 * 16000
            if start >= wav.shape[0]:
                break
            end = (chunk_idx + 1) * 30 * 16000
            if end >= wav.shape[0]:
                end = wav.shape[0] - 1
            chunk = wav[start:end]
    else:
        transcribe += transcribe_chunk(wav, mean_speech_probs)
    curr_lang = ''
    if len(languages) > 0:
        curr_lang = mode(languages)
    print(curr_lang, languages)
    if curr_lang == 'iw':
        # Whisper reports Hebrew as 'iw'; normalize to 'he'.
        return 'he', fig, gr.update(visible=True), transcribe, gr.update(visible=True), gr.update(visible=True)
    return curr_lang, fig, gr.update(visible=True), transcribe, gr.update(visible=True), gr.update(visible=True)
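
# Optional sketch (not used by the app): `get_speech_timestamps` and
# `collect_chunks`, unpacked from vad_utils above but otherwise unused, could
# pre-trim silence before Whisper transcription. 'speech.wav' is a hypothetical
# path and the default VAD threshold is assumed, not a value tuned for this app.
#
#     wav_file = read_audio('speech.wav', sampling_rate=16000)
#     stamps = get_speech_timestamps(wav_file, vad, sampling_rate=16000)
#     if stamps:
#         speech_only = collect_chunks(stamps, wav_file)
#         mel_speech = whisper.log_mel_spectrogram(whisper.pad_or_trim(speech_only)).to(model.device)
#         print(whisper.decode(model, mel_speech, whisper.DecodingOptions(fp16=False, task='transcribe')).text)
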
block = gr.Blocks(css=css)


def play_sound():
    # Write the accumulated audio to disk and play it back with pygame.
    global audio_vec
    import soundfile as sf
    print(audio_vec)
    sf.write('uploaded.wav', data=audio_vec, samplerate=16000)
    from pygame import mixer
    mixer.init()
    mixer.music.load('uploaded.wav')
    mixer.music.play()


def change_audio(string):
    # Toggle which audio input and buttons are visible for the selected mode:
    # streaming ("סטרימינג"), recording ("הקלטה") or file upload ("קובץ").
    # if string == 'סטרימינג':
    #     return gr.Audio.update(source="microphone",), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
    # else:
    #     return gr.Audio.update(source='upload'), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
    if string == 'סטרימינג':
        return (gr.update(visible=True), gr.update(visible=False), gr.update(visible=False),
                gr.update(visible=False), gr.update(visible=False))
    elif string == 'הקלטה':
        print('in mesholav')
        return (gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),
                gr.update(visible=True), gr.update(visible=True))
    else:
        return (gr.update(visible=False), gr.update(visible=True), gr.update(visible=True),
                gr.update(visible=False), gr.update(visible=False))


with block:
    gr.HTML(
        """
        <h1 style="text-align: center;">Whisper</h1>
        """
    )
    with gr.Group():
        plot = gr.Plot(show_label=False, visible=False)
        with gr.Row(equal_height=True):
            with gr.Box():
                # Choices: streaming / recording / file
                # (label: "How would you like to provide the audio?").
                radio = gr.Radio(["סטרימינג", "הקלטה", "קובץ"], label="?איך תרצה לספק את האודיו")
                with gr.Row().style(mobile_collapse=False, equal_height=True):
                    audio = gr.Audio(
                        show_label=False,
                        source="microphone",
                        type="filepath",
                        visible=True,
                    )
                    audio2 = gr.Audio(
                        label="Input Audio",
                        show_label=False,
                        source="upload",
                        type="filepath",
                        visible=False,
                    )
                    audio3 = gr.Audio(
                        label="Input Audio",
                        show_label=False,
                        source="microphone",
                        type="filepath",
                        visible=False,
                    )
                    trans_btn = gr.Button("Transcribe", visible=False)
                    trans_btn3 = gr.Button("Transcribe", visible=False)
        text = gr.Textbox(show_label=False, elem_id="result-textarea")
        text2 = gr.Textbox(show_label=False, elem_id="result-textarea")
        with gr.Row():
            clear_btn = gr.Button("Clear", visible=False)
            play_btn = gr.Button('Play audio', visible=False)

    radio.change(fn=change_audio, inputs=radio, outputs=[audio, trans_btn, audio2, trans_btn3, audio3])
    trans_btn.click(inference_file, audio2, [text, plot, plot, text2, clear_btn, play_btn])
    trans_btn3.click(inference_file, audio3, [text, plot, plot, text2, clear_btn, play_btn])
    audio.stream(inference_file, audio, [text, plot, plot, text2, clear_btn, play_btn])
    play_btn.click(play_sound)
    clear_btn.click(clear, inputs=[], outputs=[text, plot, plot, text2, clear_btn, play_btn])

    gr.HTML(''' ''')
    gr.HTML(''' Girl in a jacket ''')

global thread
# Background thread from the earlier threaded transcribe_chunk design;
# only started by the unused `inference` callback.
thread = Thread(target=transcribe_chunk)

block.queue().launch()