Spaces:

pustozerov
/

poc_call_transcription

Build error

App Files Files Community

pustozerov committed on Oct 22, 2022

Commit

e053abc

•

1 Parent(s): cd64b5d

Created and tested an alternative gradio GUI for the app.

Browse files

Files changed (4) hide show

.gitignore +0 -1
.idea/PoCCallTranscription.iml +3 -0
app_gradio.py +102 -0
requirements.txt +9 -6

.gitignore CHANGED Viewed

@@ -2,5 +2,4 @@
 /data/database/
 /info/configs/manifests/
 /info/transcripts/
-/data/user_data/
 /data/user_data_wav/

 /data/database/
 /info/configs/manifests/
 /info/transcripts/
 /data/user_data_wav/

.idea/PoCCallTranscription.iml CHANGED Viewed

@@ -8,4 +8,7 @@
     <orderEntry type="inheritedJdk" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>

     <orderEntry type="inheritedJdk" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
+  <component name="PackageRequirementsSettings">
+    <option name="versionSpecifier" value="Don't specify version" />
+  </component>
 </module>

app_gradio.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import shutil
+import gradio as gr
+import random
+import os
+import numpy as np
+from pydub import AudioSegment
+from datasets import load_dataset
+from scipy.io.wavfile import write
+from modules.diarization.nemo_diarization import diarization
+from modules.nlp.nemo_ner import detect_ner
+from modules.nlp.nemo_punct_cap import punctuation_capitalization
+FOLDER_WAV_DB = "data/database/"
+FOLDER_USER_DATA = "data/user_data/"
+FOLDER_USER_DATA_WAV = "data/user_data_wav/"
+FOLDER_MANIFESTS = "info/configs/manifests/"
+SAMPLE_RATE = 16000
+dataset = load_dataset("pustozerov/crema_d_diarization", split='validation')
+os.makedirs(FOLDER_WAV_DB, exist_ok=True)
+os.makedirs(FOLDER_MANIFESTS, exist_ok=True)
+def process_audio(uploaded_file=None):
+    if uploaded_file:
+        secondary_audio = False
+        folder_wav = FOLDER_USER_DATA_WAV
+        os.makedirs(folder_wav, exist_ok=True)
+        print(uploaded_file)
+        shutil.move(uploaded_file, os.path.join(FOLDER_USER_DATA, os.path.basename(uploaded_file)))
+        uploaded_file = os.path.join(FOLDER_USER_DATA, os.path.basename(uploaded_file))
+        print(uploaded_file)
+        if ".mp3" in uploaded_file:
+            sound = AudioSegment.from_mp3(uploaded_file)
+        elif ".ogg" in uploaded_file:
+            sound = AudioSegment.from_ogg(uploaded_file)
+        else:
+            sound = AudioSegment.from_wav(uploaded_file)
+        save_path = folder_wav + os.path.basename(uploaded_file)
+        os.makedirs(folder_wav, exist_ok=True)
+        sound.export(save_path, format="wav", parameters=["-ac", "1"])
+        file_name = os.path.basename(save_path).split(".")[0]
+        result = diarization(save_path)
+    else:
+        secondary_audio = True
+        folder_wav = FOLDER_WAV_DB
+        os.makedirs(folder_wav, exist_ok=True)
+        shuffled_dataset = dataset.shuffle(seed=random.randint(0, 100))
+        file_name = str(shuffled_dataset["file"][0]).split(".")[0]
+        audio_bytes = np.array(shuffled_dataset["data"][0])
+        audio_bytes_scaled = np.int16(audio_bytes / np.max(np.abs(audio_bytes)) * 32767)
+        write(os.path.join(folder_wav, file_name + '.wav'), rate=SAMPLE_RATE, data=audio_bytes_scaled)
+        result = diarization(os.path.join(folder_wav, file_name + '.wav'))
+    transcript_path = "info/transcripts/pred_rttms/" + file_name + ".txt"
+    with open(transcript_path) as f:
+        transcript = f.read()
+    sentences = result[file_name]["sentences"]
+    all_strings = ""
+    for sentence in sentences:
+        all_strings = all_strings + sentence["sentence"] + "\n"
+    all_strings = punctuation_capitalization([all_strings])[0]
+    tagged_string, tags_summary = detect_ner(all_strings)
+    transcript = transcript + '\n' + tagged_string
+    with open(transcript_path, 'w') as f:
+        f.write(transcript)
+    output = "<p>Number of speakers: %s" % result[file_name]["speaker_count"] + "<br>" \
+             + "Sentences: %s" % len(result[file_name]["sentences"]) + "<br>" \
+             + "Words: %s" % len(result[file_name]["words"]) + "<br>" \
+             + "Found named entities: %s" % tags_summary + "</p>"
+    return [audio_output.update(os.path.join(folder_wav, file_name + '.wav'), visible=secondary_audio),
+            output, file_output.update(transcript_path, visible=True)]
+with gr.Blocks() as demo:
+    gr.HTML('<br><h1><font size="+4">Call Transcription demo</font></h1>')
+    gr.HTML('<p><font size="+1">This simple demo shows the possibilities of ASR and NLP in the task of automatic '
+            'speech recognition '
+            'and diarization. It works with mp3, ogg, and wav files. You can randomly pick an audio file with the '
+            'dialogue from the built-in database or try uploading your files.</font></p>')
+    gr.Markdown('<p><font size="+1">Note: this demo shows up a reduced-performance model. To get a full-performance '
+                'neural network or '
+                'develop a system adapted to your task – contact <a '
+                'href="mailto:kirill.lozovoi@exposit.com?subject=Request for '
+                'information">kirill.lozovoi@exposit.com</a>.</font></p>')
+    audio_input = gr.Audio(source="upload", type="filepath")
+    second_btn = gr.Button('Try uploaded audiofile')
+    gr.Markdown('<center><p>or</p></center>')
+    first_btn = gr.Button('Try a random sample from the database')
+    # Output zone
+    audio_output = gr.Audio(visible=False, interactive=True)
+    text_output = gr.HTML()
+    file_output = gr.File(label="Download audio transcript", visible=False)
+    # noinspection PyTypeChecker
+    first_btn.click(fn=process_audio, inputs=None,
+                    outputs=[audio_output, text_output, file_output])
+    # noinspection PyTypeChecker
+    second_btn.click(fn=process_audio, inputs=audio_input, outputs=[audio_output, text_output, file_output])
+demo.launch(share=True)

requirements.txt CHANGED Viewed

@@ -13,7 +13,7 @@ kenlm @ https://github.com/kpu/kenlm/archive/master.zip
 librosa==0.9.2
 mecab-python3==1.0.5
 nemo-toolkit @ git+https://github.com/NVIDIA/NeMo.git@6442e339a47d30a106d869d1ef29cc1294753b75
-omegaconf==2.1.2
 OpenCC
 pangu==4.0.6.1
 praat-parselmouth==0.4.1
@@ -25,21 +25,24 @@ pyannote.database==4.1.3
 pyannote.metrics==3.2
 pyannote.pipeline==2.3
 pyctcdecode==0.3.0
-pydub==0.25.1
 pynini
 pytorch-lightning==1.6.5
 sacrebleu==2.1.0
 sacremoses==0.0.53
 sentencepiece==0.1.96
-SoundFile==0.10.3.post1
 spacy==3.4.0
 speechbrain @ git+https://github.com/speechbrain/speechbrain.git
-streamlit~=1.11.1
 torch==1.12.0
 torchaudio==0.12.0
 transformers==4.20.0
 webdataset==0.1.62
 Cython==0.29.14
 youtokentome
-datasets~=2.4.0
-NEMO~=4.1.1

 librosa==0.9.2
 mecab-python3==1.0.5
 nemo-toolkit @ git+https://github.com/NVIDIA/NeMo.git@6442e339a47d30a106d869d1ef29cc1294753b75
+omegaconf
 OpenCC
 pangu==4.0.6.1
 praat-parselmouth==0.4.1
 pyannote.metrics==3.2
 pyannote.pipeline==2.3
 pyctcdecode==0.3.0
+pydub
 pynini
 pytorch-lightning==1.6.5
 sacrebleu==2.1.0
 sacremoses==0.0.53
 sentencepiece==0.1.96
+SoundFile
 spacy==3.4.0
 speechbrain @ git+https://github.com/speechbrain/speechbrain.git
+streamlit
 torch==1.12.0
 torchaudio==0.12.0
 transformers==4.20.0
 webdataset==0.1.62
 Cython==0.29.14
 youtokentome
+datasets
+NEMO
+numpy
+scipy
+gradio