Mohannad committed on
Commit
5a3dbe8
1 Parent(s): c71de1b

Upload 2 files

Files changed (2)
  1. app.py +190 -0
  2. requirements.txt +24 -0
app.py ADDED
@@ -0,0 +1,190 @@
+ import streamlit as st
+ from faster_whisper import WhisperModel
+ import datetime
+ import subprocess
+ from pathlib import Path
+ import pandas as pd
+ import re
+ import time
+ import os
+ import numpy as np
+ from sklearn.cluster import AgglomerativeClustering
+ from sklearn.metrics import silhouette_score
+
+ import torch
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+ from pyannote.audio import Audio
+ from pyannote.core import Segment
+
+ import wave
+ import contextlib
+ from transformers import pipeline
+ from huggingface_hub import hf_hub_download
+ from transformers import AutoTokenizer
+ import onnxruntime
+ import librosa
+
+ whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
+ source_languages = {"en": "English"}
+
+ MODEL_NAME = "vumichien/whisper-medium-jp"
+ lang = "en"
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Speaker-embedding model used for diarization (192-dimensional ECAPA-TDNN vectors)
+ embedding_model = PretrainedSpeakerEmbedding(
+     "speechbrain/spkrec-ecapa-voxceleb",
+     device=device)
+
+ def segment_embedding(segment, duration, audio_file):
+     # Embed one transcript segment; clamp the end because Whisper
+     # overshoots the end timestamp in the last segment
+     audio = Audio()
+     start = segment["start"]
+     end = min(duration, segment["end"])
+     clip = Segment(start, end)
+     waveform, sample_rate = audio.crop(audio_file, clip)
+     return embedding_model(waveform[None])
+
+ def fast_whisper(audio_file, model):
+     # Transcribe audio with faster-whisper
+     options = dict(language=lang, beam_size=5, best_of=5)
+     transcribe_options = dict(task="transcribe", **options)
+     segments_raw, info = model.transcribe(audio_file, **transcribe_options)
+
+     # Convert back to the original openai-whisper segment format
+     segments = []
+     for segment_chunk in segments_raw:
+         segments.append({
+             "start": segment_chunk.start,
+             "end": segment_chunk.end,
+             "text": segment_chunk.text,
+         })
+     print("transcribe audio done with fast whisper")
+
+     return segments
+
+ def get_embeddings(segments, duration, audio_file):
+     # One speaker embedding per transcript segment
+     embeddings = np.zeros(shape=(len(segments), 192))
+     for i, segment in enumerate(segments):
+         embeddings[i] = segment_embedding(segment, duration, audio_file)
+     embeddings = np.nan_to_num(embeddings)
+
+     print("Got embeddings for segments")
+     return embeddings
+
+ def get_n_speakers(embeddings, num_speakers):
+     if num_speakers == 0:
+         # Find the best number of speakers by silhouette score
+         score_num_speakers = {}
+         for n in range(2, 10 + 1):
+             clustering = AgglomerativeClustering(n).fit(embeddings)
+             score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
+             score_num_speakers[n] = score
+         best_num_speaker = max(score_num_speakers, key=lambda x: score_num_speakers[x])
+         print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
+     else:
+         best_num_speaker = num_speakers
+
+     print(f"best num speakers is {best_num_speaker}")
+     return best_num_speaker
+
+ def assign_speaker(best_num_speaker, embeddings, segments):
+     # Cluster the embeddings and attach a speaker label to each segment
+     clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
+     labels = clustering.labels_
+     for i in range(len(segments)):
+         segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+     print("I know who said what now")
+     return segments
+
+ def convert_time(secs):
+     return datetime.timedelta(seconds=round(secs))
+
+ def segments2df(segments):
+     # Merge consecutive segments of the same speaker into one row of the output table
+     objects = {
+         'Start': [],
+         'End': [],
+         'Speaker': [],
+         'Text': []
+     }
+     text = ''
+     for i, segment in enumerate(segments):
+         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+             objects['Start'].append(str(convert_time(segment["start"])))
+             objects['Speaker'].append(segment["speaker"])
+             if i != 0:
+                 objects['End'].append(str(convert_time(segments[i - 1]["end"])))
+                 objects['Text'].append(text)
+                 text = ''
+         text += segment["text"] + ' '
+     objects['End'].append(str(convert_time(segments[-1]["end"])))
+     objects['Text'].append(text)
+
+     df_results = pd.DataFrame(objects)
+
+     return df_results
+
+
+ def speech_to_text(audio_file, whisper_model, num_speakers=0):
+     if audio_file is None:
+         raise ValueError("Error: no audio_file")
+
+     time_start = time.time()
+     model = WhisperModel(whisper_model, compute_type="int8")
+     y, sr = librosa.load(audio_file)
+     duration = len(y) / sr
+     segments = fast_whisper(audio_file, model)
+     embeddings = get_embeddings(segments, duration, audio_file)
+     best_num_speaker = get_n_speakers(embeddings, num_speakers)
+     segments = assign_speaker(best_num_speaker, embeddings, segments)
+     diary = segments2df(segments)
+
+     return diary
+
+ # Yes/no question answering over the transcript with an ONNX BoolQ model
+ onnx_path = hf_hub_download(repo_id='UKP-SQuARE/roberta-base-pf-boolq-onnx', filename='model.onnx')  # or model_quant.onnx for quantization
+ onnx_model = onnxruntime.InferenceSession(onnx_path, providers=['CPUExecutionProvider'])
+
+ question = 'Can she answer'
+ tokenizer = AutoTokenizer.from_pretrained('UKP-SQuARE/roberta-base-pf-boolq-onnx')
+
+ def answer(context, question):
+     inputs = tokenizer(question, context, padding=True, truncation=True, return_tensors='np')
+     inputs = {key: np.array(inputs[key], dtype=np.int64) for key in inputs}
+     outputs = onnx_model.run(input_feed=dict(inputs), output_names=None)
+
+     return outputs
+
+
+ # Streamlit UI
+ uploaded_file = st.sidebar.file_uploader("Choose a file")
+ num_speakers = st.sidebar.slider("num speakers (0 means auto detect)", 0, 10, 0)
+ diary = None
+ question = None
+ if uploaded_file is not None:
+     # Persist the upload to disk so librosa / faster-whisper can open it by path
+     filename = uploaded_file.name
+     with open(filename, "wb") as f:
+         f.write(uploaded_file.getvalue())
+
+     if st.sidebar.checkbox('Get conversation'):
+         torch.cuda.empty_cache()
+         whisper_model = "base"
+         diary = speech_to_text(filename, whisper_model, num_speakers=num_speakers)
+
+         st.dataframe(diary.style.highlight_max(axis=0))
+
+         question = st.sidebar.text_input('Question', 'Can she answer')
+         if st.sidebar.button('Answer'):
+             diary["text_all"] = diary["Speaker"] + ": " + diary["Text"]
+             context = " \n ".join(diary["text_all"].to_list())
+             outputs = answer(context, question)
+
+             # The app reads logit 0 as "yes" and logit 1 as "no"
+             outputs = outputs[0][0]
+             if outputs[0] > outputs[1]: st.sidebar.write("Answer is Yes")
+             if outputs[0] < outputs[1]: st.sidebar.write("Answer is No")
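
For reference, a minimal sketch of driving the two entry points without the Streamlit UI, assuming app.py is importable as a module and that example.wav is a local recording (both names are hypothetical, not part of this commit):

    from app import speech_to_text, answer

    # Diarized transcript as a DataFrame with Start, End, Speaker, Text columns
    diary = speech_to_text("example.wav", "base", num_speakers=0)  # "example.wav" is a placeholder path

    # Ask a yes/no question against the whole conversation
    context = " \n ".join((diary["Speaker"] + ": " + diary["Text"]).to_list())
    logits = answer(context, "Can she answer")[0][0]
    print("Yes" if logits[0] > logits[1] else "No")
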
requirements.txt ADDED
@@ -0,0 +1,24 @@
+
+ git+https://github.com/huggingface/transformers
+ git+https://github.com/pyannote/pyannote-audio
+ git+https://github.com/openai/whisper.git
+ gradio==3.12
+ ffmpeg-python
+ pandas==1.5.0
+ sacremoses
+ sentencepiece
+ tokenizers
+ torch
+ torchaudio
+ tqdm==4.64.1
+ EasyNMT==2.0.2
+ nltk
+ transformers
+ pysrt
+ psutil==5.9.2
+ requests
+ faster-whisper
+ huggingface_hub
+ onnxruntime
+ streamlit
+ librosa
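
Usage note: after installing these requirements with pip install -r requirements.txt, the app is started with the standard Streamlit CLI, streamlit run app.py. Note that app.py also imports scikit-learn (AgglomerativeClustering, silhouette_score), which is not listed above, so it may need to be installed separately.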