salmanmapkar committed on
Commit cb0113d
1 Parent(s): 85c1a67

Update app.py

Files changed (1)
  1. app.py +234 -233
app.py CHANGED
@@ -26,276 +26,277 @@ import numpy as np
 
 model = whisper.load_model("medium")
 embedding_model = PretrainedSpeakerEmbedding(
-    "speechbrain/spkrec-ecapa-voxceleb",
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 )
 
 
 __FILES = set()
-
 
 def CreateFile(filename):
-    __FILES.add(filename)
-    return filename
 
 def RemoveFile(filename):
-    if (os.path.isfile(filename)):
-        os.remove(filename)
 
 def RemoveAllFiles():
-    for file in __FILES:
-        if (os.path.isfile(file)):
-            os.remove(file)
-
 def Transcribe(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
-    SPEAKER_DICT = {}
-    SPEAKERS = []
-
-    def GetSpeaker(sp):
-        speaker = sp
-        if sp not in list(SPEAKER_DICT.keys()):
-            if len(SPEAKERS):
-                t = SPEAKERS.pop(0)
-                SPEAKER_DICT[sp] = t
-                speaker = SPEAKER_DICT[sp]
-        else:
-            speaker = SPEAKER_DICT[sp]
-        return speaker
-
-    def GenerateSpeakerDict(sp):
-        global SPEAKERS
-        SPEAKERS = [speaker.strip() for speaker in sp.split(',')]
-
-    def millisec(timeStr):
-        spl = timeStr.split(":")
-        s = (int)((int(spl[0]) * 60 * 60 + int(spl[1]) * 60 + float(spl[2]) )* 1000)
-        return s
-
-    def preprocess(audio):
-        t1 = 0 * 1000
-        t2 = 20 * 60 * 1000
-        newAudio = AudioSegment.from_wav(audio)
-        a = newAudio[t1:t2]
-        spacermilli = 2000
-        spacer = AudioSegment.silent(duration=spacermilli)
-        newAudio = spacer.append(a, crossfade=0)
-        newAudio.export(audio, format="wav")
-        return spacermilli, spacer
-
-    def diarization(audio):
-        as_audio = AudioSegment.from_wav(audio)
-        DEMO_FILE = {'uri': 'blabal', 'audio': audio}
-        if NumberOfSpeakers:
-            dz = pipeline(DEMO_FILE, num_speakers=NumberOfSpeakers)
-        else:
-            dz = pipeline(DEMO_FILE)
-        with open(CreateFile(f"diarization_{audio}.txt"), "w") as text_file:
-            text_file.write(str(dz))
-        dz = open(CreateFile(f"diarization_{audio}.txt")).read().splitlines()
-        dzList = []
-        for l in dz:
-            start, end = tuple(re.findall('[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=l))
-            start = millisec(start)
-            end = millisec(end)
-            lex = GetSpeaker(re.findall('(SPEAKER_[0-9][0-9])', string=l)[0])
-            dzList.append([start, end, lex])
-        sounds = spacer
-        segments = []
-        dz = open(CreateFile(f"diarization_{audio}.txt")).read().splitlines()
-        for l in dz:
-            start, end = tuple(re.findall('[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=l))
-            start = millisec(start)
-            end = millisec(end)
-            segments.append(len(sounds))
-            sounds = sounds.append(as_audio[start:end], crossfade=0)
-            sounds = sounds.append(spacer, crossfade=0)
-        sounds.export(CreateFile(f"dz_{audio}.wav"), format="wav")
-        return f"dz_{audio}.wav", dzList, segments
-
-    def transcribe(dz_audio):
-        model = whisper.load_model("base")
-        result = model.transcribe(dz_audio)
-        # for _ in result['segments']:
-        #     print(_['start'], _['end'], _['text'])
-        captions = [[((caption["start"]*1000)), ((caption["end"]*1000)), caption["text"]] for caption in result['segments']]
-        conversation = []
-        for i in range(len(segments)):
-            idx = 0
-            for idx in range(len(captions)):
-                if captions[idx][0] >= (segments[i] - spacermilli):
-                    break;
-
-            while (idx < (len(captions))) and ((i == len(segments) - 1) or (captions[idx][1] < segments[i+1])):
-                c = captions[idx]
-                start = dzList[i][0] + (c[0] -segments[i])
-                if start < 0:
-                    start = 0
-                idx += 1
-                if not len(conversation):
-                    conversation.append([dzList[i][2], c[2]])
-                elif conversation[-1][0] == dzList[i][2]:
-                    conversation[-1][1] += c[2]
-                else:
-                    conversation.append([dzList[i][2], c[2]])
-                #print(f"[{dzList[i][2]}] {c[2]}")
-        return conversation, ("".join([f"{speaker} --> {text}\n" for speaker, text in conversation]))
 
-    GenerateSpeakerDict(SpeakerNames)
-    spacermilli, spacer = preprocess(audio)
-    dz_audio, dzList, segments = diarization(audio)
-    conversation, t_text = transcribe(dz_audio)
-    RemoveAllFiles()
-    return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
 
 
 def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
-    SPEAKER_DICT = {}
-    SPEAKERS = []
-    def GetSpeaker(sp):
-        speaker = sp
-        if sp not in list(SPEAKER_DICT.keys()):
-            if len(SPEAKERS):
-                t = SPEAKERS.pop(0)
-                SPEAKER_DICT[sp] = t
-                speaker = SPEAKER_DICT[sp]
-        else:
-            speaker = SPEAKER_DICT[sp]
-        return speaker
-
-    def GenerateSpeakerDict(sp):
-        global SPEAKERS
-        SPEAKERS = [speaker.strip() for speaker in sp.split(',')]
 
-    # audio = Audio()
-    GenerateSpeakerDict(speaker_names)
-    def get_output(segments):
-        # print(segments)
-        conversation=[]
-        for (i, segment) in enumerate(segments):
-            if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
-                if i != 0:
-                    conversation.append([GetSpeaker(segment["speaker"]), segment["text"][1:]]) # segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
-            conversation[-1][1] += segment["text"][1:]
-        # return output
-        return conversation, ("".join([f"{speaker} --> {text}\n" for speaker, text in conversation]))
 
-    def get_duration(path):
-        with contextlib.closing(wave.open(path,'r')) as f:
-            frames = f.getnframes()
-            rate = f.getframerate()
-            return frames / float(rate)
 
-    def make_embeddings(path, segments, duration):
-        embeddings = np.zeros(shape=(len(segments), 192))
-        for i, segment in enumerate(segments):
-            embeddings[i] = segment_embedding(path, segment, duration)
-        return np.nan_to_num(embeddings)
 
-    def segment_embedding(path, segment, duration):
-        start = segment["start"]
-        # Whisper overshoots the end timestamp in the last segment
-        end = min(duration, segment["end"])
-        clip = Segment(start, end)
-        waveform, sample_rate = Audio().crop(path, clip)
-        return embedding_model(waveform[None])
 
-    def add_speaker_labels(segments, embeddings, num_speakers):
-        clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
-        labels = clustering.labels_
-        for i in range(len(segments)):
-            segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
 
-    def time(secs):
-        return datetime.timedelta(seconds=round(secs))
 
-    duration = get_duration(audio)
-    if duration > 4 * 60 * 60:
-        return "Audio duration too long"
 
-    result = model.transcribe(audio)
 
-    segments = result["segments"]
 
-    num_speakers = min(max(round(num_speakers), 1), len(segments))
-    if len(segments) == 1:
-        segments[0]['speaker'] = 'SPEAKER 1'
-    else:
-        embeddings = make_embeddings(audio, segments, duration)
-        add_speaker_labels(segments, embeddings, num_speakers)
-    return get_output(segments)
-    # return output
 
 def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
-    if retries:
-        # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
-        try:
-            subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
-        except Exception as ex:
-            traceback.print_exc()
-            return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
-        if not (os.path.isfile("temp_audio.wav")):
-            return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
-        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
-    else:
-        raise gr.Error("There is some issue ith Audio Transcriber. Please try again later!")
 
 def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5):
-    if retries:
-        try:
-            clip = mp.VideoFileClip(video)
-            clip.audio.write_audiofile("temp_audio.wav")
-            # command = f"ffmpeg -i {video} -ab 160k -ac 2 -ar 44100 -vn temp_audio.wav"
-            # subprocess.call(command, shell=True)
-        except Exception as ex:
-            traceback.print_exc()
-            return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
-        if not (os.path.isfile("temp_audio.wav")):
-            return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
-        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
-    else:
-        raise gr.Error("There is some issue ith Video Transcriber. Please try again later!")
-    return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
 
 def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
-    if retries:
-        if "youtu" not in URL.lower():
-            raise gr.Error(f"{URL} is not a valid youtube URL.")
-        else:
-            RemoveFile("temp_audio.wav")
-            ydl_opts = {
-                'format': 'bestaudio/best',
-                'outtmpl': 'temp_audio.%(ext)s',
-                'postprocessors': [{
-                    'key': 'FFmpegExtractAudio',
-                    'preferredcodec': 'wav',
-                }],
-            }
-            try:
-                with youtube_dl.YoutubeDL(ydl_opts) as ydl:
-                    ydl.download([URL])
-            except:
-                return YoutubeTranscribe(NumberOfSpeakers, SpeakerNames, URL, retries-1)
-            stream = ffmpeg.input('temp_audio.m4a')
-            stream = ffmpeg.output(stream, 'temp_audio.wav')
-            RemoveFile("temp_audio.m4a")
-            return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
-    else:
-        raise gr.Error(f"Unable to get video from {URL}")
 
 ut = gr.Interface(
-    fn=YoutubeTranscribe,
-    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w"),],
-    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
 )
 vt = gr.Interface(
-    fn=VideoTranscribe,
-    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'video'],
-    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
 )
 at = gr.Interface(
-    fn=AudioTranscribe,
-    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'audio'],
-    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
 )
 
 demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
 
 
 model = whisper.load_model("medium")
 embedding_model = PretrainedSpeakerEmbedding(
+    "speechbrain/spkrec-ecapa-voxceleb",
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 )
 
 
 __FILES = set()
+
 
 def CreateFile(filename):
+    __FILES.add(filename)
+    return filename
 
 def RemoveFile(filename):
+    if (os.path.isfile(filename)):
+        os.remove(filename)
 
 def RemoveAllFiles():
+    for file in __FILES:
+        if (os.path.isfile(file)):
+            os.remove(file)
+
 def Transcribe(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
+    SPEAKER_DICT = {}
+    SPEAKERS = []
+
+    def GetSpeaker(sp):
+        speaker = sp
+        if sp not in list(SPEAKER_DICT.keys()):
+            if len(SPEAKERS):
+                t = SPEAKERS.pop(0)
+                SPEAKER_DICT[sp] = t
+                speaker = SPEAKER_DICT[sp]
+        else:
+            speaker = SPEAKER_DICT[sp]
+        return speaker
+
+    def GenerateSpeakerDict(sp):
+        global SPEAKERS
+        SPEAKERS = [speaker.strip() for speaker in sp.split(',')]
+
+    def millisec(timeStr):
+        spl = timeStr.split(":")
+        s = (int)((int(spl[0]) * 60 * 60 + int(spl[1]) * 60 + float(spl[2]) )* 1000)
+        return s
+
+    def preprocess(audio):
+        t1 = 0 * 1000
+        t2 = 20 * 60 * 1000
+        newAudio = AudioSegment.from_wav(audio)
+        a = newAudio[t1:t2]
+        spacermilli = 2000
+        spacer = AudioSegment.silent(duration=spacermilli)
+        newAudio = spacer.append(a, crossfade=0)
+        newAudio.export(audio, format="wav")
+        return spacermilli, spacer
+
+    def diarization(audio):
+        as_audio = AudioSegment.from_wav(audio)
+        DEMO_FILE = {'uri': 'blabal', 'audio': audio}
+        if NumberOfSpeakers:
+            dz = pipeline(DEMO_FILE, num_speakers=NumberOfSpeakers)
+        else:
+            dz = pipeline(DEMO_FILE)
+        with open(CreateFile(f"diarization_{audio}.txt"), "w") as text_file:
+            text_file.write(str(dz))
+        dz = open(CreateFile(f"diarization_{audio}.txt")).read().splitlines()
+        dzList = []
+        for l in dz:
+            start, end = tuple(re.findall('[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=l))
+            start = millisec(start)
+            end = millisec(end)
+            lex = GetSpeaker(re.findall('(SPEAKER_[0-9][0-9])', string=l)[0])
+            dzList.append([start, end, lex])
+        sounds = spacer
+        segments = []
+        dz = open(CreateFile(f"diarization_{audio}.txt")).read().splitlines()
+        for l in dz:
+            start, end = tuple(re.findall('[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=l))
+            start = millisec(start)
+            end = millisec(end)
+            segments.append(len(sounds))
+            sounds = sounds.append(as_audio[start:end], crossfade=0)
+            sounds = sounds.append(spacer, crossfade=0)
+        sounds.export(CreateFile(f"dz_{audio}.wav"), format="wav")
+        return f"dz_{audio}.wav", dzList, segments
+
+    def transcribe(dz_audio):
+        model = whisper.load_model("base")
+        result = model.transcribe(dz_audio)
+        # for _ in result['segments']:
+        #     print(_['start'], _['end'], _['text'])
+        captions = [[((caption["start"]*1000)), ((caption["end"]*1000)), caption["text"]] for caption in result['segments']]
+        conversation = []
+        for i in range(len(segments)):
+            idx = 0
+            for idx in range(len(captions)):
+                if captions[idx][0] >= (segments[i] - spacermilli):
+                    break;
+
+            while (idx < (len(captions))) and ((i == len(segments) - 1) or (captions[idx][1] < segments[i+1])):
+                c = captions[idx]
+                start = dzList[i][0] + (c[0] -segments[i])
+                if start < 0:
+                    start = 0
+                idx += 1
+                if not len(conversation):
+                    conversation.append([dzList[i][2], c[2]])
+                elif conversation[-1][0] == dzList[i][2]:
+                    conversation[-1][1] += c[2]
+                else:
+                    conversation.append([dzList[i][2], c[2]])
+                #print(f"[{dzList[i][2]}] {c[2]}")
+        return conversation, ("".join([f"{speaker} --> {text}\n" for speaker, text in conversation]))
 
+    GenerateSpeakerDict(SpeakerNames)
+    spacermilli, spacer = preprocess(audio)
+    dz_audio, dzList, segments = diarization(audio)
+    conversation, t_text = transcribe(dz_audio)
+    RemoveAllFiles()
+    return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
 
 
 def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
+    SPEAKER_DICT = {}
+    SPEAKERS = []
+    def GetSpeaker(sp):
+        speaker = sp
+        if sp not in list(SPEAKER_DICT.keys()):
+            if len(SPEAKERS):
+                t = SPEAKERS.pop(0)
+                SPEAKER_DICT[sp] = t
+                speaker = SPEAKER_DICT[sp]
+        else:
+            speaker = SPEAKER_DICT[sp]
+        return speaker
+
+    def GenerateSpeakerDict(sp):
+        global SPEAKERS
+        SPEAKERS = [speaker.strip() for speaker in sp.split(',')]
 
+    # audio = Audio()
+    GenerateSpeakerDict(speaker_names)
+    def get_output(segments):
+        # print(segments)
+        conversation=[]
+        for (i, segment) in enumerate(segments):
+            print(f"{i}, {segment["speaker"]}, {segments[i - 1]["speaker"]}")
+            if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+                if i != 0:
+                    conversation.append([GetSpeaker(segment["speaker"]), segment["text"][1:]]) # segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
+            conversation[-1][1] += segment["text"][1:]
+        # return output
+        return conversation, ("".join([f"{speaker} --> {text}\n" for speaker, text in conversation]))
 
+    def get_duration(path):
+        with contextlib.closing(wave.open(path,'r')) as f:
+            frames = f.getnframes()
+            rate = f.getframerate()
+            return frames / float(rate)
 
+    def make_embeddings(path, segments, duration):
+        embeddings = np.zeros(shape=(len(segments), 192))
+        for i, segment in enumerate(segments):
+            embeddings[i] = segment_embedding(path, segment, duration)
+        return np.nan_to_num(embeddings)
 
+    def segment_embedding(path, segment, duration):
+        start = segment["start"]
+        # Whisper overshoots the end timestamp in the last segment
+        end = min(duration, segment["end"])
+        clip = Segment(start, end)
+        waveform, sample_rate = Audio().crop(path, clip)
+        return embedding_model(waveform[None])
 
+    def add_speaker_labels(segments, embeddings, num_speakers):
+        clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+        labels = clustering.labels_
+        for i in range(len(segments)):
+            segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
 
+    def time(secs):
+        return datetime.timedelta(seconds=round(secs))
 
+    duration = get_duration(audio)
+    if duration > 4 * 60 * 60:
+        return "Audio duration too long"
 
+    result = model.transcribe(audio)
 
+    segments = result["segments"]
 
+    num_speakers = min(max(round(num_speakers), 1), len(segments))
+    if len(segments) == 1:
+        segments[0]['speaker'] = 'SPEAKER 1'
+    else:
+        embeddings = make_embeddings(audio, segments, duration)
+        add_speaker_labels(segments, embeddings, num_speakers)
+    return get_output(segments)
+    # return output
 
 def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
+    if retries:
+        # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
+        try:
+            subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
+        except Exception as ex:
+            traceback.print_exc()
+            return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
+        if not (os.path.isfile("temp_audio.wav")):
+            return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
+        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
+    else:
+        raise gr.Error("There is some issue ith Audio Transcriber. Please try again later!")
 
 def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5):
+    if retries:
+        try:
+            clip = mp.VideoFileClip(video)
+            clip.audio.write_audiofile("temp_audio.wav")
+            # command = f"ffmpeg -i {video} -ab 160k -ac 2 -ar 44100 -vn temp_audio.wav"
+            # subprocess.call(command, shell=True)
+        except Exception as ex:
+            traceback.print_exc()
+            return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
+        if not (os.path.isfile("temp_audio.wav")):
+            return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
+        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
+    else:
+        raise gr.Error("There is some issue ith Video Transcriber. Please try again later!")
+    return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
 
 def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
+    if retries:
+        if "youtu" not in URL.lower():
+            raise gr.Error(f"{URL} is not a valid youtube URL.")
+        else:
+            RemoveFile("temp_audio.wav")
+            ydl_opts = {
+                'format': 'bestaudio/best',
+                'outtmpl': 'temp_audio.%(ext)s',
+                'postprocessors': [{
+                    'key': 'FFmpegExtractAudio',
+                    'preferredcodec': 'wav',
+                }],
+            }
+            try:
+                with youtube_dl.YoutubeDL(ydl_opts) as ydl:
+                    ydl.download([URL])
+            except:
+                return YoutubeTranscribe(NumberOfSpeakers, SpeakerNames, URL, retries-1)
+            stream = ffmpeg.input('temp_audio.m4a')
+            stream = ffmpeg.output(stream, 'temp_audio.wav')
+            RemoveFile("temp_audio.m4a")
+            return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
+    else:
+        raise gr.Error(f"Unable to get video from {URL}")
 
 ut = gr.Interface(
+    fn=YoutubeTranscribe,
+    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w"),],
+    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
 )
 vt = gr.Interface(
+    fn=VideoTranscribe,
+    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'video'],
+    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
 )
 at = gr.Interface(
+    fn=AudioTranscribe,
+    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'audio'],
+    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
 )
 
 demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
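
For context, the core of Transcribe_V2 above is a transcribe, embed, cluster pipeline: Whisper produces timestamped segments, each segment is embedded with the speechbrain/spkrec-ecapa-voxceleb model, and AgglomerativeClustering groups the embeddings into speakers. Below is a minimal, standalone sketch of that flow under the same dependencies; the function name label_speakers and the example file name are illustrative, while the library calls mirror the ones already present in app.py.

# Minimal sketch of the embedding + clustering step used by Transcribe_V2.
# label_speakers() and the example file name are illustrative; the whisper,
# pyannote.audio and scikit-learn calls are the same ones app.py relies on.
import contextlib
import wave

import numpy as np
import torch
import whisper
from pyannote.audio import Audio
from pyannote.core import Segment
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from sklearn.cluster import AgglomerativeClustering

embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

def label_speakers(path, num_speakers=2):
    # 1. Transcribe: Whisper returns segments with "start"/"end"/"text".
    segments = whisper.load_model("base").transcribe(path)["segments"]
    # Clamp against the real duration, since Whisper can overshoot the last segment.
    with contextlib.closing(wave.open(path, "r")) as f:
        duration = f.getnframes() / float(f.getframerate())
    # 2. Embed: one 192-dimensional speaker embedding per segment.
    audio = Audio()
    embeddings = np.zeros((len(segments), 192))
    for i, seg in enumerate(segments):
        clip = Segment(seg["start"], min(duration, seg["end"]))
        waveform, _ = audio.crop(path, clip)
        embeddings[i] = embedding_model(waveform[None])
    # 3. Cluster: agglomerative clustering assigns a speaker label to each segment.
    labels = AgglomerativeClustering(num_speakers).fit(np.nan_to_num(embeddings)).labels_
    for seg, label in zip(segments, labels):
        seg["speaker"] = f"SPEAKER {label + 1}"
    return segments

# Example usage (file name is illustrative):
# for seg in label_speakers("temp_audio.wav", num_speakers=2):
#     print(seg["speaker"], seg["text"])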