Shiry committed on
Commit 050271c
1 Parent(s): fc9008d

Add application file

Files changed (2)
  1. app.py +390 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,390 @@
import torch
# os.system("pip install git+https://github.com/openai/whisper.git")
import gradio as gr
import whisper
import librosa
import plotly.express as px
from threading import Thread
from statistics import mode, mean
import time


# Load the Whisper speech-recognition model on CPU.
model = whisper.load_model("large", device='cpu')
print('loaded whisper')

# Load the Silero VAD model and its helper utilities from torch.hub.
vad, vad_utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                model='silero_vad',
                                force_reload=False,
                                onnx=False)
print('loaded silero')
(get_speech_timestamps,
 save_audio,
 read_audio,
 VADIterator,
 collect_chunks) = vad_utils
vad_iterator = VADIterator(vad)

# Shared state used across the Gradio callbacks.
global x, y, j, audio_vec, transcribe, STOP, languages, not_detected, main_lang, STARTED
x = []                        # plot x-axis: time in seconds
y = []                        # plot y-axis: VAD speech probabilities
j = 0                         # running time offset for the plot
STOP = False                  # stop flag for the background thread
audio_vec = torch.tensor([])  # all audio received so far
transcribe = ''               # accumulated transcription text
languages = []                # rolling window of recently detected languages
not_detected = True
main_lang = ''
STARTED = False

# CSS styling passed to the gr.Blocks UI below.
css = """
    .gradio-container {
        font-family: 'IBM Plex Sans', sans-serif;
    }
    .gr-button {
        color: white;
        border-color: black;
        background: black;
    }
    input[type='range'] {
        accent-color: black;
    }
    .dark input[type='range'] {
        accent-color: #dfdfdf;
    }
    .container {
        max-width: 730px;
        margin: auto;
        padding-top: 1.5rem;
    }
    .details:hover {
        text-decoration: underline;
    }
    .gr-button {
        white-space: nowrap;
    }
    .gr-button:focus {
        border-color: rgb(147 197 253 / var(--tw-border-opacity));
        outline: none;
        box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
        --tw-border-opacity: 1;
        --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
        --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px + var(--tw-ring-offset-width)) var(--tw-ring-color);
        --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
        --tw-ring-opacity: .5;
    }
    .footer {
        margin-bottom: 45px;
        margin-top: 35px;
        text-align: center;
        border-bottom: 1px solid #e5e5e5;
    }
    .footer>p {
        font-size: .8rem;
        display: inline-block;
        padding: 0 10px;
        transform: translateY(10px);
        background: white;
    }
    .dark .footer {
        border-color: #303030;
    }
    .dark .footer>p {
        background: #0b0f19;
    }
    .prompt h4 {
        margin: 1.25em 0 .25em 0;
        font-weight: bold;
        font-size: 115%;
    }
    .animate-spin {
        animation: spin 1s linear infinite;
    }
    @keyframes spin {
        from {
            transform: rotate(0deg);
        }
        to {
            transform: rotate(360deg);
        }
    }
    #share-btn-container {
        display: flex; margin-top: 1.5rem !important; padding-left: 0.5rem !important;
        padding-right: 0.5rem !important; background-color: #000000; justify-content: center;
        align-items: center; border-radius: 9999px !important; width: 13rem;
    }
    #share-btn {
        all: initial; color: #ffffff; font-weight: 600; cursor: pointer; font-family: 'IBM Plex Sans', sans-serif;
        margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;
    }
    #share-btn * {
        all: unset;
    }
"""
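

# An earlier, thread-based version of transcribe_chunk, left commented out: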
# def transcribe_chunk():
#     print('********************************')
#     global audio_vec, transcribe, STOP
#     print('Enter trans chunk')
#     counter = 0
#     i = 0
#     while not STOP:
#         if audio_vec.size()[0] // 32000 > counter and audio_vec.size()[0] > 0:
#             print('audio_vec.size()[0] % 32000', audio_vec.size()[0] % 32000)
#             print('audio size', audio_vec.size()[0])
#             chunk = whisper.pad_or_trim(audio_vec[32000 * counter: 32000 * (counter + 1)])
#             mel_th = whisper.log_mel_spectrogram(chunk).to(model.device)
#             options = whisper.DecodingOptions(fp16=False)
#             result = whisper.decode(model, mel_th, options)
#             no_speech_prob = result.no_speech_prob
#             if no_speech_prob < 0.4:
#                 transcribe += result.text + ' '
#             counter += 1
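

# transcribe_chunk: transcribe one audio chunk with Whisper.
#   - whisper.pad_or_trim fits the chunk to Whisper's 30-second window,
#   - the log-Mel spectrogram is decoded and language-identified,
#   - text is kept only when no_speech_prob < 0.6, and the detected language is
#     appended to a short rolling window used for majority voting.
#   The vad_prob argument is only used for logging.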
def transcribe_chunk(audio, vad_prob):
    global languages
    trnscrb = ''
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    options = whisper.DecodingOptions(fp16=False, task='transcribe')
    result = whisper.decode(model, mel, options)
    no_speech_prob = result.no_speech_prob

    # Language identification on the same log-Mel spectrogram.
    _, probs = model.detect_language(mel)
    temp_lang = max(probs, key=probs.get)

    print(result.text, "no_speech_prob: ", no_speech_prob, 1 - vad_prob)
    if no_speech_prob < 0.6:
        trnscrb = result.text + ' '
        languages.append(temp_lang)
        if len(languages) > 3:
            languages.pop(0)
    return trnscrb


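# inference: streaming callback (not wired to the UI below, which uses inference_file).
# It runs Silero VAD over 0.1-second windows (1600 samples at 16 kHz) to extend the
# speech-probability plot, lets Whisper identify the chunk's language, and reports the
# mode of the recent detections ('iw' is mapped to 'he'). On its first call it starts
# the background thread created at the bottom of the file.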
def inference(audio):
    global x, y, j, audio_vec, transcribe, languages, not_detected, main_lang, STARTED
    print('enter inference')
    if j == 0:
        # Starts the background thread defined at the bottom of the file. Its target,
        # transcribe_chunk, expects arguments, so this path only works with the
        # commented-out zero-argument version above.
        thread.start()
        STARTED = True
    wav2 = whisper.load_audio(audio, sr=16000)
    wav = torch.from_numpy(librosa.load(audio, sr=16000)[0])
    audio_vec = torch.cat((audio_vec, wav))

    # Silero VAD speech probability for every 0.1-second window.
    speech_probs = []
    window_size_samples = 1600
    for i in range(0, len(wav), window_size_samples):
        chunk = wav[i: i + window_size_samples]
        if len(chunk) < window_size_samples:
            break
        speech_prob = vad(chunk, 16000).item()
        speech_probs.append(speech_prob)
    vad_iterator.reset_states()
    sample_per_sec = 16000 / window_size_samples
    x.extend([j + i / sample_per_sec for i in range(len(speech_probs))])
    y.extend(speech_probs)
    j = max(x)
    fig = px.line(x=x, y=y)

    # Whisper language identification on the padded chunk.
    whisper_audio = whisper.pad_or_trim(wav2)
    mel = whisper.log_mel_spectrogram(whisper_audio).to(model.device)
    _, probs = model.detect_language(mel)
    temp_lang = max(probs, key=probs.get)
    print(temp_lang)

    languages.append(temp_lang)
    if len(languages) > 5:
        languages.pop(0)

    curr_lang = mode(languages)
    print(curr_lang, languages)

    if curr_lang == 'iw':
        return 'he', fig, gr.update(visible=True), transcribe, gr.update(visible=True), gr.update(visible=True)
    return curr_lang, fig, gr.update(visible=True), transcribe, gr.update(visible=True), gr.update(visible=True)


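# clear: reset all shared state and hide the result widgets; joins the background
# thread if it was started and creates a fresh one for the next session.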
def clear():
    global x, y, j, audio_vec, transcribe, thread, STOP, languages, main_lang, not_detected, STARTED
    STOP = True
    if STARTED:
        thread.join()
    STARTED = False
    x = []
    y = []
    j = 0
    audio_vec = torch.tensor([])
    transcribe = ''
    STOP = False
    languages = []
    main_lang = ''
    not_detected = True
    thread = Thread(target=transcribe_chunk)
    print('clean:', x, y, j, transcribe, audio_vec)
    return '', gr.update(visible=False), gr.update(visible=False), '', gr.update(visible=False), gr.update(visible=False)


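# inference_file: main transcription callback used by the UI (file upload, microphone
# recording, and streaming all route here). It appends the new audio to audio_vec,
# extends the Silero VAD speech-probability plot, and transcribes the audio in
# 30-second chunks, accumulating text in the global `transcribe` string. The reported
# language is the mode of the rolling `languages` window ('iw' is mapped to 'he').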
def inference_file(audio):
    time.sleep(0.8)
    global x, y, j, audio_vec, transcribe, languages, not_detected, main_lang
    wav = torch.from_numpy(librosa.load(audio, sr=16000)[0])
    audio_vec = torch.cat((audio_vec, wav))

    # Silero VAD speech probability for every 0.1-second window, appended to the running plot.
    speech_probs = []
    window_size_samples = 1600
    for i in range(0, len(wav), window_size_samples):
        chunk = wav[i: i + window_size_samples]
        if len(chunk) < window_size_samples:
            break
        speech_prob = vad(chunk, 16000).item()
        speech_probs.append(speech_prob)
    vad_iterator.reset_states()
    sample_per_sec = 16000 / window_size_samples
    x.extend([j + i / sample_per_sec for i in range(len(speech_probs))])
    y.extend(speech_probs)
    j = max(x)
    fig = px.line(x=x, y=y)

    mean_speech_probs = mean(speech_probs)

    if wav.shape[0] > 16000 * 30:
        # Long audio: transcribe in consecutive 30-second chunks.
        start = 0
        end = 16000 * 30
        chunk = wav[start:end]
        chunk_idx = 0
        while end < wav.shape[0]:
            # NOTE: transcribe_chunk also expects a VAD probability; the overall mean
            # is passed here (it is only used for logging).
            transcribe += transcribe_chunk(chunk, mean_speech_probs)
            chunk_idx += 1
            start = chunk_idx * 30 * 16000
            if start >= wav.shape[0]:
                break
            end = (chunk_idx + 1) * 30 * 16000
            if end >= wav.shape[0]:
                end = wav.shape[0] - 1
            chunk = wav[start:end]
    else:
        transcribe += transcribe_chunk(wav, mean_speech_probs)

    curr_lang = ''
    if len(languages) > 0:
        curr_lang = mode(languages)
    print(curr_lang, languages)

    if curr_lang == 'iw':
        return 'he', fig, gr.update(visible=True), transcribe, gr.update(visible=True), gr.update(visible=True)
    return curr_lang, fig, gr.update(visible=True), transcribe, gr.update(visible=True), gr.update(visible=True)


block = gr.Blocks(css=css)


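# play_sound: write all audio received so far to a WAV file and play it with pygame.
# Note that this plays on the machine running the app, not in the browser.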
def play_sound():
    global audio_vec
    import soundfile as sf
    print(audio_vec)
    sf.write('uploaded.wav', data=audio_vec, samplerate=16000)
    from pygame import mixer
    mixer.init()
    mixer.music.load('uploaded.wav')
    mixer.music.play()


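# change_audio: toggle the visibility of the audio inputs and Transcribe buttons
# according to the radio selection. The Hebrew options mean, in order:
# 'סטרימינג' = streaming, 'הקלטה' = recording, 'קובץ' = file upload.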
def change_audio(string):
    # if string == 'סטרימינג':
    #     return gr.Audio.update(source="microphone",), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
    # else:
    #     return gr.Audio.update(source='upload'), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
    if string == 'סטרימינג':  # streaming
        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), \
               gr.update(visible=False), gr.update(visible=False)
    elif string == 'הקלטה':  # recording
        print('in mesholav')
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
               gr.update(visible=True), gr.update(visible=True)
    else:  # file upload
        return gr.update(visible=False), gr.update(visible=True), gr.update(visible=True), \
               gr.update(visible=False), gr.update(visible=False)


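# Build the Gradio UI: a title, the hidden VAD plot, a radio selector for the input
# mode, three audio inputs (streaming microphone, file upload, one-shot recording),
# two result text boxes, and Clear / Play buttons; event handlers are wired below.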
with block:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 650px; margin: 0 auto;">
          <div
            style="
              display: inline-flex;
              align-items: center;
              gap: 0.8rem;
              font-size: 1.75rem;
            "
          >
            <h1 style="font-weight: 900; margin-bottom: 7px;">
              Whisper
            </h1>
          </div>
        </div>
        """
    )
    with gr.Group():
        plot = gr.Plot(show_label=False, visible=False)
        with gr.Row(equal_height=True):
            with gr.Box():
                # Label (Hebrew): "How would you like to provide the audio?"
                radio = gr.Radio(["סטרימינג", "הקלטה", "קובץ"], label="?איך תרצה לספק את האודיו")
                with gr.Row().style(mobile_collapse=False, equal_height=True):
                    # Streaming microphone input (visible by default).
                    audio = gr.Audio(
                        show_label=False,
                        source="microphone",
                        type="filepath",
                        visible=True
                    )
                    # File-upload input.
                    audio2 = gr.Audio(
                        label="Input Audio",
                        show_label=False,
                        source="upload",
                        type="filepath",
                        visible=False
                    )
                    # One-shot microphone recording input.
                    audio3 = gr.Audio(
                        label="Input Audio",
                        show_label=False,
                        source="microphone",
                        type="filepath",
                        visible=False
                    )

                trans_btn = gr.Button("Transcribe", visible=False)
                trans_btn3 = gr.Button("Transcribe", visible=False)

        text = gr.Textbox(show_label=False, elem_id="result-textarea")
        text2 = gr.Textbox(show_label=False, elem_id="result-textarea")
        with gr.Row():
            clear_btn = gr.Button("Clear", visible=False)
            play_btn = gr.Button('Play audio', visible=False)

        radio.change(fn=change_audio, inputs=radio, outputs=[audio, trans_btn, audio2, trans_btn3, audio3])
        trans_btn.click(inference_file, audio2, [text, plot, plot, text2, clear_btn, play_btn])
        trans_btn3.click(inference_file, audio3, [text, plot, plot, text2, clear_btn, play_btn])
        audio.stream(inference_file, audio, [text, plot, plot, text2, clear_btn, play_btn])
        play_btn.click(play_sound)
        clear_btn.click(clear, inputs=[], outputs=[text, plot, plot, text2, clear_btn, play_btn])

    gr.HTML('''
        <div class="footer">
            <p>Model by Moses team - Whisper Demo
            </p>
        </div>
    ''')
    gr.HTML('''
        <img style="text-align: center; max-width: 650px; margin: 0 auto;" src="https://geekflare.com/wp-content/uploads/2022/02/speechrecognitionapi.png" alt="Speech recognition illustration" width="500" height="600">
    ''')

# Background thread for the legacy streaming path (see inference); transcribe_chunk
# expects arguments, so this thread is not started by the current UI wiring.
global thread
thread = Thread(target=transcribe_chunk)
block.queue().launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
git+https://github.com/huggingface/transformers
torch
git+https://github.com/openai/whisper.git
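# Note: app.py also imports gradio, librosa, plotly, soundfile, and pygame, which are
# not pinned here; the hosting environment is assumed to provide them.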