salmanmapkar committed on
Commit
7dc348e
1 Parent(s): 2e0ad83

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +189 -189
app.py CHANGED
@@ -32,152 +32,152 @@ embedding_model = PretrainedSpeakerEmbedding(
32
 
33
 
34
  __FILES = set()
35
-
36
 
37
  def CreateFile(filename):
38
- __FILES.add(filename)
39
- return filename
40
 
41
  def RemoveFile(filename):
42
- if (os.path.isfile(filename)):
43
- os.remove(filename)
44
 
45
  def RemoveAllFiles():
46
- for file in __FILES:
47
- if (os.path.isfile(file)):
48
- os.remove(file)
49
-
50
  def Transcribe(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
51
- SPEAKER_DICT = {}
52
- SPEAKERS = []
53
-
54
- def GetSpeaker(sp):
55
- speaker = sp
56
- if sp not in list(SPEAKER_DICT.keys()):
57
- if len(SPEAKERS):
58
- t = SPEAKERS.pop(0)
59
- SPEAKER_DICT[sp] = t
60
- speaker = SPEAKER_DICT[sp]
61
- else:
62
- speaker = SPEAKER_DICT[sp]
63
- return speaker
64
-
65
- def GenerateSpeakerDict(sp):
66
- global SPEAKERS
67
- SPEAKERS = [speaker.strip() for speaker in sp.split(',')]
68
-
69
- def millisec(timeStr):
70
- spl = timeStr.split(":")
71
- s = (int)((int(spl[0]) * 60 * 60 + int(spl[1]) * 60 + float(spl[2]) )* 1000)
72
- return s
73
-
74
- def preprocess(audio):
75
- t1 = 0 * 1000
76
- t2 = 20 * 60 * 1000
77
- newAudio = AudioSegment.from_wav(audio)
78
- a = newAudio[t1:t2]
79
- spacermilli = 2000
80
- spacer = AudioSegment.silent(duration=spacermilli)
81
- newAudio = spacer.append(a, crossfade=0)
82
- newAudio.export(audio, format="wav")
83
- return spacermilli, spacer
84
-
85
- def diarization(audio):
86
- as_audio = AudioSegment.from_wav(audio)
87
- DEMO_FILE = {'uri': 'blabal', 'audio': audio}
88
- if NumberOfSpeakers:
89
- dz = pipeline(DEMO_FILE, num_speakers=NumberOfSpeakers)
90
- else:
91
- dz = pipeline(DEMO_FILE)
92
- with open(CreateFile(f"diarization_{audio}.txt"), "w") as text_file:
93
- text_file.write(str(dz))
94
- dz = open(CreateFile(f"diarization_{audio}.txt")).read().splitlines()
95
- dzList = []
96
- for l in dz:
97
- start, end = tuple(re.findall('[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=l))
98
- start = millisec(start)
99
- end = millisec(end)
100
- lex = GetSpeaker(re.findall('(SPEAKER_[0-9][0-9])', string=l)[0])
101
- dzList.append([start, end, lex])
102
- sounds = spacer
103
- segments = []
104
- dz = open(CreateFile(f"diarization_{audio}.txt")).read().splitlines()
105
- for l in dz:
106
- start, end = tuple(re.findall('[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=l))
107
- start = millisec(start)
108
- end = millisec(end)
109
- segments.append(len(sounds))
110
- sounds = sounds.append(as_audio[start:end], crossfade=0)
111
- sounds = sounds.append(spacer, crossfade=0)
112
- sounds.export(CreateFile(f"dz_{audio}.wav"), format="wav")
113
- return f"dz_{audio}.wav", dzList, segments
114
-
115
- def transcribe(dz_audio):
116
- model = whisper.load_model("base")
117
- result = model.transcribe(dz_audio)
118
- # for _ in result['segments']:
119
- # print(_['start'], _['end'], _['text'])
120
- captions = [[((caption["start"]*1000)), ((caption["end"]*1000)), caption["text"]] for caption in result['segments']]
121
- conversation = []
122
- for i in range(len(segments)):
123
- idx = 0
124
- for idx in range(len(captions)):
125
- if captions[idx][0] >= (segments[i] - spacermilli):
126
- break;
127
-
128
- while (idx < (len(captions))) and ((i == len(segments) - 1) or (captions[idx][1] < segments[i+1])):
129
- c = captions[idx]
130
- start = dzList[i][0] + (c[0] -segments[i])
131
- if start < 0:
132
- start = 0
133
- idx += 1
134
- if not len(conversation):
135
- conversation.append([dzList[i][2], c[2]])
136
- elif conversation[-1][0] == dzList[i][2]:
137
- conversation[-1][1] += c[2]
138
- else:
139
- conversation.append([dzList[i][2], c[2]])
140
- #print(f"[{dzList[i][2]}] {c[2]}")
141
- return conversation, ("".join([f"{speaker} --> {text}\n" for speaker, text in conversation]))
142
 
143
- GenerateSpeakerDict(SpeakerNames)
144
- spacermilli, spacer = preprocess(audio)
145
- dz_audio, dzList, segments = diarization(audio)
146
- conversation, t_text = transcribe(dz_audio)
147
- RemoveAllFiles()
148
- return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
149
 
150
 
151
  def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
152
- SPEAKER_DICT = {}
153
- SPEAKERS = []
154
- def GetSpeaker(sp):
155
- speaker = sp
156
- if sp not in list(SPEAKER_DICT.keys()):
157
- if len(SPEAKERS):
158
- t = SPEAKERS.pop(0)
159
- SPEAKER_DICT[sp] = t
160
- speaker = SPEAKER_DICT[sp]
161
- else:
162
- speaker = SPEAKER_DICT[sp]
163
- return speaker
164
-
165
- def GenerateSpeakerDict(sp):
166
- global SPEAKERS
167
- SPEAKERS = [speaker.strip() for speaker in sp.split(',')]
168
 
169
- audio = Audio()
170
- GenerateSpeakerDict(speaker_names)
171
  def get_output(segments):
172
  # print(segments)
173
  output = ''
174
  for (i, segment) in enumerate(segments):
175
- if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
176
- if i != 0:
177
- conversation.append([GetSpeaker(segment["speaker"]), segment["text"][1:]]) # segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
178
- conversation[-1][1] += segment["text"][1:]
179
  # return output
180
- return conversation, ("".join([f"{speaker} --> {text}\n" for speaker, text in conversation]))
181
 
182
  def get_duration(path):
183
  with contextlib.closing(wave.open(path,'r')) as f:
@@ -226,76 +226,76 @@ def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
226
  # return output
227
 
228
  def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
229
- if retries:
230
- # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
231
- try:
232
- subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
233
- except Exception as ex:
234
- traceback.print_exc()
235
- return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
236
- if not (os.path.isfile("temp_audio.wav")):
237
- return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
238
- return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
239
- else:
240
- raise gr.Error("There is some issue ith Audio Transcriber. Please try again later!")
241
 
242
  def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5):
243
- if retries:
244
- try:
245
- clip = mp.VideoFileClip(video)
246
- clip.audio.write_audiofile("temp_audio.wav")
247
- # command = f"ffmpeg -i {video} -ab 160k -ac 2 -ar 44100 -vn temp_audio.wav"
248
- # subprocess.call(command, shell=True)
249
- except Exception as ex:
250
- traceback.print_exc()
251
- return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
252
- if not (os.path.isfile("temp_audio.wav")):
253
- return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
254
- return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
255
- else:
256
- raise gr.Error("There is some issue ith Video Transcriber. Please try again later!")
257
- return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
258
 
259
  def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
260
- if retries:
261
- if "youtu" not in URL.lower():
262
- raise gr.Error(f"{URL} is not a valid youtube URL.")
263
- else:
264
- RemoveFile("temp_audio.wav")
265
- ydl_opts = {
266
- 'format': 'bestaudio/best',
267
- 'outtmpl': 'temp_audio.%(ext)s',
268
- 'postprocessors': [{
269
- 'key': 'FFmpegExtractAudio',
270
- 'preferredcodec': 'wav',
271
- }],
272
- }
273
- try:
274
- with youtube_dl.YoutubeDL(ydl_opts) as ydl:
275
- ydl.download([URL])
276
- except:
277
- return YoutubeTranscribe(NumberOfSpeakers, SpeakerNames, URL, retries-1)
278
- stream = ffmpeg.input('temp_audio.m4a')
279
- stream = ffmpeg.output(stream, 'temp_audio.wav')
280
- RemoveFile("temp_audio.m4a")
281
- return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
282
- else:
283
- raise gr.Error(f"Unable to get video from {URL}")
284
 
285
  ut = gr.Interface(
286
- fn=YoutubeTranscribe,
287
- inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w"),],
288
- outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
289
  )
290
  vt = gr.Interface(
291
- fn=VideoTranscribe,
292
- inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'video'],
293
- outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
294
  )
295
  at = gr.Interface(
296
- fn=AudioTranscribe,
297
- inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'audio'],
298
- outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
299
  )
300
 
301
  demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
 
32
 
33
 
34
# Registry of every temporary artifact created during a run;
# RemoveAllFiles() sweeps whatever is recorded here.
__FILES = set()


def CreateFile(filename):
    """Record *filename* in the temp-file registry and return it unchanged."""
    __FILES.add(filename)
    return filename
 
41
def RemoveFile(filename):
    """Delete *filename* if it exists as a regular file; silent no-op otherwise."""
    file_present = os.path.isfile(filename)
    if file_present:
        os.remove(filename)
 
45
def RemoveAllFiles():
    """Delete every registered temp file that is still present on disk."""
    for path in __FILES:
        if not os.path.isfile(path):
            continue
        os.remove(path)
+
50
def Transcribe(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
    """Diarize *audio* with the pyannote pipeline, transcribe the diarized
    audio with Whisper, and merge both into a per-speaker conversation.

    Args:
        NumberOfSpeakers: exact speaker count for the diarization pipeline,
            or a falsy value to let the pipeline estimate it.
        SpeakerNames: comma-separated display names, consumed in the order
            speakers first appear; unnamed speakers keep their SPEAKER_NN label.
        audio: path to a WAV file (overwritten in place by preprocessing).

    Returns:
        (text, payload) where text is "speaker --> utterance" lines and
        payload is {"data": [{"speaker": ..., "text": ...}, ...]}.
    """
    SPEAKER_DICT = {}
    SPEAKERS = []

    def GetSpeaker(sp):
        # Map a raw diarization label (SPEAKER_NN) to the next unused
        # user-supplied name; labels seen before reuse their mapping.
        speaker = sp
        if sp not in SPEAKER_DICT:
            if len(SPEAKERS):
                t = SPEAKERS.pop(0)
                SPEAKER_DICT[sp] = t
                speaker = SPEAKER_DICT[sp]
        else:
            speaker = SPEAKER_DICT[sp]
        return speaker

    def GenerateSpeakerDict(sp):
        # BUG FIX: the original declared `global SPEAKERS`, which rebound a
        # module-level name while GetSpeaker kept reading the (still empty)
        # closure variable, so user-supplied names were silently ignored.
        nonlocal SPEAKERS
        SPEAKERS = [speaker.strip() for speaker in sp.split(',')]

    def millisec(timeStr):
        # "HH:MM:SS.fff" -> integer milliseconds.
        spl = timeStr.split(":")
        s = int((int(spl[0]) * 60 * 60 + int(spl[1]) * 60 + float(spl[2])) * 1000)
        return s

    def preprocess(audio):
        # Keep at most the first 20 minutes and prepend 2s of silence so the
        # diarization timeline has a known offset; overwrites *audio* in place.
        t1 = 0 * 1000
        t2 = 20 * 60 * 1000
        newAudio = AudioSegment.from_wav(audio)
        a = newAudio[t1:t2]
        spacermilli = 2000
        spacer = AudioSegment.silent(duration=spacermilli)
        newAudio = spacer.append(a, crossfade=0)
        newAudio.export(audio, format="wav")
        return spacermilli, spacer

    def diarization(audio):
        # Run the pyannote pipeline, persist its output, and rebuild the audio
        # as silence-separated speaker turns for Whisper.
        as_audio = AudioSegment.from_wav(audio)
        DEMO_FILE = {'uri': 'blabal', 'audio': audio}
        if NumberOfSpeakers:
            dz = pipeline(DEMO_FILE, num_speakers=NumberOfSpeakers)
        else:
            dz = pipeline(DEMO_FILE)
        with open(CreateFile(f"diarization_{audio}.txt"), "w") as text_file:
            text_file.write(str(dz))
        # Read the serialized diarization once (the original opened this file
        # twice and never closed either handle).
        with open(f"diarization_{audio}.txt") as text_file:
            dz_lines = text_file.read().splitlines()
        dzList = []
        for l in dz_lines:
            # Raw strings so `\.` is a regex escape, not a Python one.
            start, end = tuple(re.findall(r'[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=l))
            start = millisec(start)
            end = millisec(end)
            lex = GetSpeaker(re.findall(r'(SPEAKER_[0-9][0-9])', string=l)[0])
            dzList.append([start, end, lex])
        sounds = spacer
        segments = []
        for l in dz_lines:
            start, end = tuple(re.findall(r'[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=l))
            start = millisec(start)
            end = millisec(end)
            # len(AudioSegment) is its duration in ms: each entry marks where
            # this turn starts in the rebuilt audio.
            segments.append(len(sounds))
            sounds = sounds.append(as_audio[start:end], crossfade=0)
            sounds = sounds.append(spacer, crossfade=0)
        sounds.export(CreateFile(f"dz_{audio}.wav"), format="wav")
        return f"dz_{audio}.wav", dzList, segments

    def transcribe(dz_audio):
        # Whisper-transcribe the rebuilt audio, then assign each caption to
        # the diarization segment it falls inside.
        model = whisper.load_model("base")
        result = model.transcribe(dz_audio)
        captions = [[caption["start"] * 1000, caption["end"] * 1000, caption["text"]] for caption in result['segments']]
        conversation = []
        for i in range(len(segments)):
            idx = 0
            # Find the first caption starting at or after this segment.
            for idx in range(len(captions)):
                if captions[idx][0] >= (segments[i] - spacermilli):
                    break
            while (idx < len(captions)) and ((i == len(segments) - 1) or (captions[idx][1] < segments[i + 1])):
                c = captions[idx]
                start = dzList[i][0] + (c[0] - segments[i])
                if start < 0:
                    start = 0
                idx += 1
                # Merge consecutive captions from the same speaker into one turn.
                if not len(conversation):
                    conversation.append([dzList[i][2], c[2]])
                elif conversation[-1][0] == dzList[i][2]:
                    conversation[-1][1] += c[2]
                else:
                    conversation.append([dzList[i][2], c[2]])
        return conversation, ("".join([f"{speaker} --> {text}\n" for speaker, text in conversation]))

    GenerateSpeakerDict(SpeakerNames)
    spacermilli, spacer = preprocess(audio)
    dz_audio, dzList, segments = diarization(audio)
    conversation, t_text = transcribe(dz_audio)
    RemoveAllFiles()
    return (t_text, {"data": [{"speaker": speaker, "text": text} for speaker, text in conversation]})
149
 
150
 
151
  def Transcribe_V2(num_speakers, speaker_names, audio="temp_audio.wav"):
152
+ SPEAKER_DICT = {}
153
+ SPEAKERS = []
154
+ def GetSpeaker(sp):
155
+ speaker = sp
156
+ if sp not in list(SPEAKER_DICT.keys()):
157
+ if len(SPEAKERS):
158
+ t = SPEAKERS.pop(0)
159
+ SPEAKER_DICT[sp] = t
160
+ speaker = SPEAKER_DICT[sp]
161
+ else:
162
+ speaker = SPEAKER_DICT[sp]
163
+ return speaker
164
+
165
+ def GenerateSpeakerDict(sp):
166
+ global SPEAKERS
167
+ SPEAKERS = [speaker.strip() for speaker in sp.split(',')]
168
 
169
+ audio = Audio()
170
+ GenerateSpeakerDict(speaker_names)
171
  def get_output(segments):
172
  # print(segments)
173
  output = ''
174
  for (i, segment) in enumerate(segments):
175
+ if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
176
+ if i != 0:
177
+ conversation.append([GetSpeaker(segment["speaker"]), segment["text"][1:]]) # segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
178
+ conversation[-1][1] += segment["text"][1:]
179
  # return output
180
+ return conversation, ("".join([f"{speaker} --> {text}\n" for speaker, text in conversation]))
181
 
182
  def get_duration(path):
183
  with contextlib.closing(wave.open(path,'r')) as f:
 
226
  # return output
227
 
228
def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
    """Convert *audio* to temp_audio.wav with ffmpeg, then run Transcribe_V2.

    Retries up to *retries* times on conversion failure; raises gr.Error once
    the retry budget is exhausted.
    """
    if retries:
        try:
            # -y: overwrite a stale temp_audio.wav from a previous run —
            # without it ffmpeg refuses to write an existing output file and
            # every retry fails the same way.
            subprocess.call(['ffmpeg', '-y', '-i', audio, 'temp_audio.wav'])
        except Exception:
            traceback.print_exc()
            return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries - 1)
        if not os.path.isfile("temp_audio.wav"):
            return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries - 1)
        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
    else:
        # Typo fixed: "ith" -> "with".
        raise gr.Error("There is some issue with Audio Transcriber. Please try again later!")
241
 
242
def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5):
    """Extract *video*'s audio track to temp_audio.wav, then run Transcribe_V2.

    Retries up to *retries* times on extraction failure; raises gr.Error once
    the retry budget is exhausted.
    """
    if retries:
        try:
            clip = mp.VideoFileClip(video)
            clip.audio.write_audiofile("temp_audio.wav")
        except Exception:
            traceback.print_exc()
            return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries - 1)
        if not os.path.isfile("temp_audio.wav"):
            return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries - 1)
        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
    else:
        # Typo fixed ("ith" -> "with"). The unreachable `return
        # Transcribe_V2(...)` that followed this raise has been removed.
        raise gr.Error("There is some issue with Video Transcriber. Please try again later!")
258
 
259
def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries=5):
    """Download a YouTube video's audio as temp_audio.wav and transcribe it.

    Retries up to *retries* times on download failure; raises gr.Error for a
    non-YouTube URL or once the retry budget is exhausted.
    """
    if retries:
        if "youtu" not in URL.lower():
            raise gr.Error(f"{URL} is not a valid youtube URL.")
        RemoveFile("temp_audio.wav")
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': 'temp_audio.%(ext)s',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
            }],
        }
        try:
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([URL])
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are not swallowed by the retry loop.
            return YoutubeTranscribe(NumberOfSpeakers, SpeakerNames, URL, retries - 1)
        # The FFmpegExtractAudio postprocessor already produced temp_audio.wav.
        # The original built ffmpeg.input/ffmpeg.output stream objects here but
        # never ran them — dead code, removed; the stale-file cleanup stays.
        RemoveFile("temp_audio.m4a")
        return Transcribe_V2(NumberOfSpeakers, SpeakerNames)
    else:
        raise gr.Error(f"Unable to get video from {URL}")
284
 
285
def _common_inputs(source_input):
    """Fresh copies of the two inputs shared by every tab, plus the
    tab-specific media input; gradio components must not be reused across
    Interfaces, so each call constructs new ones."""
    return [
        gr.Number(label="Number of Speakers", placeholder="2"),
        gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"),
        source_input,
    ]


def _common_outputs():
    """Fresh output components (plain text + JSON) for one tab."""
    return [gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]


# One tab per media source; all three share the speaker-count/name inputs.
ut = gr.Interface(
    fn=YoutubeTranscribe,
    inputs=_common_inputs(gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w")),
    outputs=_common_outputs(),
)
vt = gr.Interface(
    fn=VideoTranscribe,
    inputs=_common_inputs('video'),
    outputs=_common_outputs(),
)
at = gr.Interface(
    fn=AudioTranscribe,
    inputs=_common_inputs('audio'),
    outputs=_common_outputs(),
)

demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])