pustozerov committed
Commit e053abc · 1 parent: cd64b5d

Created and tested an alternative Gradio GUI for the app.

Files changed:
- .gitignore (+0, -1)
- .idea/PoCCallTranscription.iml (+3, -0)
- app_gradio.py (+102, -0)
- requirements.txt (+9, -6)
.gitignore  CHANGED
@@ -2,5 +2,4 @@
 /data/database/
 /info/configs/manifests/
 /info/transcripts/
-/data/user_data/
 /data/user_data_wav/
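(This hunk stops ignoring /data/user_data/, the folder the new app_gradio.py moves uploaded files into; /data/user_data_wav/ stays ignored.)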
.idea/PoCCallTranscription.iml  CHANGED
@@ -8,4 +8,7 @@
     <orderEntry type="inheritedJdk" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
+  <component name="PackageRequirementsSettings">
+    <option name="versionSpecifier" value="Don't specify version" />
+  </component>
 </module>
app_gradio.py  ADDED
@@ -0,0 +1,102 @@
+import shutil
+
+import gradio as gr
+import random
+import os
+import numpy as np
+from pydub import AudioSegment
+from datasets import load_dataset
+from scipy.io.wavfile import write
+
+from modules.diarization.nemo_diarization import diarization
+from modules.nlp.nemo_ner import detect_ner
+from modules.nlp.nemo_punct_cap import punctuation_capitalization
+
+FOLDER_WAV_DB = "data/database/"
+FOLDER_USER_DATA = "data/user_data/"
+FOLDER_USER_DATA_WAV = "data/user_data_wav/"
+FOLDER_MANIFESTS = "info/configs/manifests/"
+SAMPLE_RATE = 16000
+dataset = load_dataset("pustozerov/crema_d_diarization", split='validation')
+os.makedirs(FOLDER_WAV_DB, exist_ok=True)
+os.makedirs(FOLDER_MANIFESTS, exist_ok=True)
+
+
+def process_audio(uploaded_file=None):
+    if uploaded_file:
+        secondary_audio = False
+        folder_wav = FOLDER_USER_DATA_WAV
+        os.makedirs(folder_wav, exist_ok=True)
+        print(uploaded_file)
+        shutil.move(uploaded_file, os.path.join(FOLDER_USER_DATA, os.path.basename(uploaded_file)))
+        uploaded_file = os.path.join(FOLDER_USER_DATA, os.path.basename(uploaded_file))
+        print(uploaded_file)
+        if ".mp3" in uploaded_file:
+            sound = AudioSegment.from_mp3(uploaded_file)
+        elif ".ogg" in uploaded_file:
+            sound = AudioSegment.from_ogg(uploaded_file)
+        else:
+            sound = AudioSegment.from_wav(uploaded_file)
+        save_path = folder_wav + os.path.basename(uploaded_file)
+        os.makedirs(folder_wav, exist_ok=True)
+        sound.export(save_path, format="wav", parameters=["-ac", "1"])
+        file_name = os.path.basename(save_path).split(".")[0]
+        result = diarization(save_path)
+    else:
+        secondary_audio = True
+        folder_wav = FOLDER_WAV_DB
+        os.makedirs(folder_wav, exist_ok=True)
+        shuffled_dataset = dataset.shuffle(seed=random.randint(0, 100))
+        file_name = str(shuffled_dataset["file"][0]).split(".")[0]
+        audio_bytes = np.array(shuffled_dataset["data"][0])
+        audio_bytes_scaled = np.int16(audio_bytes / np.max(np.abs(audio_bytes)) * 32767)
+        write(os.path.join(folder_wav, file_name + '.wav'), rate=SAMPLE_RATE, data=audio_bytes_scaled)
+        result = diarization(os.path.join(folder_wav, file_name + '.wav'))
+    transcript_path = "info/transcripts/pred_rttms/" + file_name + ".txt"
+    with open(transcript_path) as f:
+        transcript = f.read()
+    sentences = result[file_name]["sentences"]
+    all_strings = ""
+    for sentence in sentences:
+        all_strings = all_strings + sentence["sentence"] + "\n"
+    all_strings = punctuation_capitalization([all_strings])[0]
+    tagged_string, tags_summary = detect_ner(all_strings)
+    transcript = transcript + '\n' + tagged_string
+    with open(transcript_path, 'w') as f:
+        f.write(transcript)
+    output = "<p>Number of speakers: %s" % result[file_name]["speaker_count"] + "<br>" \
+             + "Sentences: %s" % len(result[file_name]["sentences"]) + "<br>" \
+             + "Words: %s" % len(result[file_name]["words"]) + "<br>" \
+             + "Found named entities: %s" % tags_summary + "</p>"
+    return [audio_output.update(os.path.join(folder_wav, file_name + '.wav'), visible=secondary_audio),
+            output, file_output.update(transcript_path, visible=True)]
+
+
+with gr.Blocks() as demo:
+    gr.HTML('<br><h1><font size="+4">Call Transcription demo</font></h1>')
+    gr.HTML('<p><font size="+1">This simple demo shows the possibilities of ASR and NLP in the task of automatic '
+            'speech recognition '
+            'and diarization. It works with mp3, ogg, and wav files. You can randomly pick an audio file with the '
+            'dialogue from the built-in database or try uploading your files.</font></p>')
+    gr.Markdown('<p><font size="+1">Note: this demo shows up a reduced-performance model. To get a full-performance '
+                'neural network or '
+                'develop a system adapted to your task – contact <a '
+                'href="mailto:kirill.lozovoi@exposit.com?subject=Request for '
+                'information">kirill.lozovoi@exposit.com</a>.</font></p>')
+    audio_input = gr.Audio(source="upload", type="filepath")
+    second_btn = gr.Button('Try uploaded audiofile')
+    gr.Markdown('<center><p>or</p></center>')
+    first_btn = gr.Button('Try a random sample from the database')
+
+    # Output zone
+    audio_output = gr.Audio(visible=False, interactive=True)
+    text_output = gr.HTML()
+    file_output = gr.File(label="Download audio transcript", visible=False)
+
+    # noinspection PyTypeChecker
+    first_btn.click(fn=process_audio, inputs=None,
+                    outputs=[audio_output, text_output, file_output])
+    # noinspection PyTypeChecker
+    second_btn.click(fn=process_audio, inputs=audio_input, outputs=[audio_output, text_output, file_output])
+
+demo.launch(share=True)
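For context, the random-sample branch of process_audio() converts the dataset's float waveform to 16-bit PCM before writing a WAV file. Below is a standalone sketch of that conversion, using a synthetic sine wave in place of a dataset row (the signal and the output name sample.wav are illustrative assumptions, not part of this commit):

import numpy as np
from scipy.io.wavfile import write

SAMPLE_RATE = 16000

# Synthetic 1-second, 440 Hz tone standing in for shuffled_dataset["data"][0].
t = np.linspace(0, 1, SAMPLE_RATE, endpoint=False)
audio = 0.5 * np.sin(2 * np.pi * 440 * t)

# Same scaling as in process_audio(): normalize to the full int16 range.
scaled = np.int16(audio / np.max(np.abs(audio)) * 32767)
write("sample.wav", rate=SAMPLE_RATE, data=scaled)

Running python app_gradio.py builds the Blocks layout and calls demo.launch(share=True), which serves the GUI locally and also creates a temporary public share link.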
requirements.txt  CHANGED
@@ -13,7 +13,7 @@ kenlm @ https://github.com/kpu/kenlm/archive/master.zip
 librosa==0.9.2
 mecab-python3==1.0.5
 nemo-toolkit @ git+https://github.com/NVIDIA/NeMo.git@6442e339a47d30a106d869d1ef29cc1294753b75
-omegaconf
+omegaconf
 OpenCC
 pangu==4.0.6.1
 praat-parselmouth==0.4.1
@@ -25,21 +25,24 @@ pyannote.database==4.1.3
 pyannote.metrics==3.2
 pyannote.pipeline==2.3
 pyctcdecode==0.3.0
-pydub
+pydub
 pynini
 pytorch-lightning==1.6.5
 sacrebleu==2.1.0
 sacremoses==0.0.53
 sentencepiece==0.1.96
-SoundFile
+SoundFile
 spacy==3.4.0
 speechbrain @ git+https://github.com/speechbrain/speechbrain.git
-streamlit
+streamlit
 torch==1.12.0
 torchaudio==0.12.0
 transformers==4.20.0
 webdataset==0.1.62
 Cython==0.29.14
 youtokentome
-datasets
-NEMO
+datasets
+NEMO
+numpy
+scipy
+gradio
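Note: the three new unpinned entries (numpy, scipy, gradio) match the direct imports introduced by app_gradio.py, so pip install -r requirements.txt now covers the alternative GUI as well.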