salmanmapkar committed on
Commit
541f2b9
1 Parent(s): 9c3a3b5

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +208 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import unicode_literals
2
+ import youtube_dl
3
+ from pydub import AudioSegment
4
+ from pyannote.audio import Pipeline
5
+ import re
6
+ import whisper
7
+ import os
8
+ import ffmpeg
9
+ import subprocess
10
+ import gradio as gr
11
+ import traceback
12
+ import json
13
+ pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token="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")
14
+ from pydub.effects import speedup
15
+ import moviepy.editor as mp
16
+
17
+
18
+ __FILES = set()
19
+
20
+
21
+ def CreateFile(filename):
22
+ __FILES.add(filename)
23
+ return filename
24
+
25
+ def RemoveFile(filename):
26
+ if (os.path.isfile(filename)):
27
+ os.remove(filename)
28
+
29
+ def RemoveAllFiles():
30
+ for file in __FILES:
31
+ if (os.path.isfile(file)):
32
+ os.remove(file)
33
+
34
+ def Transcribe(NumberOfSpeakers, SpeakerNames="", audio="temp_audio.wav"):
35
+ SPEAKER_DICT = {}
36
+ SPEAKERS = []
37
+
38
+ def GetSpeaker(sp):
39
+ speaker = sp
40
+ if sp not in list(SPEAKER_DICT.keys()):
41
+ if len(SPEAKERS):
42
+ t = SPEAKERS.pop(0)
43
+ SPEAKER_DICT[sp] = t
44
+ speaker = SPEAKER_DICT[sp]
45
+ else:
46
+ speaker = SPEAKER_DICT[sp]
47
+ return speaker
48
+
49
+ def GenerateSpeakerDict(sp):
50
+ global SPEAKERS
51
+ SPEAKERS = [speaker.strip() for speaker in sp.split(',')]
52
+
53
+ def millisec(timeStr):
54
+ spl = timeStr.split(":")
55
+ s = (int)((int(spl[0]) * 60 * 60 + int(spl[1]) * 60 + float(spl[2]) )* 1000)
56
+ return s
57
+
58
+ def preprocess(audio):
59
+ t1 = 0 * 1000
60
+ t2 = 20 * 60 * 1000
61
+ newAudio = AudioSegment.from_wav(audio)
62
+ a = newAudio[t1:t2]
63
+ spacermilli = 2000
64
+ spacer = AudioSegment.silent(duration=spacermilli)
65
+ newAudio = spacer.append(a, crossfade=0)
66
+ newAudio.export(audio, format="wav")
67
+ return spacermilli, spacer
68
+
69
+ def diarization(audio):
70
+ as_audio = AudioSegment.from_wav(audio)
71
+ DEMO_FILE = {'uri': 'blabal', 'audio': audio}
72
+ if NumberOfSpeakers:
73
+ dz = pipeline(DEMO_FILE, num_speakers=NumberOfSpeakers)
74
+ else:
75
+ dz = pipeline(DEMO_FILE)
76
+ with open(CreateFile(f"diarization_{audio}.txt"), "w") as text_file:
77
+ text_file.write(str(dz))
78
+ dz = open(CreateFile(f"diarization_{audio}.txt")).read().splitlines()
79
+ dzList = []
80
+ for l in dz:
81
+ start, end = tuple(re.findall('[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=l))
82
+ start = millisec(start)
83
+ end = millisec(end)
84
+ lex = GetSpeaker(re.findall('(SPEAKER_[0-9][0-9])', string=l)[0])
85
+ dzList.append([start, end, lex])
86
+ sounds = spacer
87
+ segments = []
88
+ dz = open(CreateFile(f"diarization_{audio}.txt")).read().splitlines()
89
+ for l in dz:
90
+ start, end = tuple(re.findall('[0-9]+:[0-9]+:[0-9]+\.[0-9]+', string=l))
91
+ start = millisec(start)
92
+ end = millisec(end)
93
+ segments.append(len(sounds))
94
+ sounds = sounds.append(as_audio[start:end], crossfade=0)
95
+ sounds = sounds.append(spacer, crossfade=0)
96
+ sounds.export(CreateFile(f"dz_{audio}.wav"), format="wav")
97
+ return f"dz_{audio}.wav", dzList, segments
98
+
99
+ def transcribe(dz_audio):
100
+ model = whisper.load_model("base")
101
+ result = model.transcribe(dz_audio)
102
+ # for _ in result['segments']:
103
+ # print(_['start'], _['end'], _['text'])
104
+ captions = [[((caption["start"]*1000)), ((caption["end"]*1000)), caption["text"]] for caption in result['segments']]
105
+ conversation = []
106
+ for i in range(len(segments)):
107
+ idx = 0
108
+ for idx in range(len(captions)):
109
+ if captions[idx][0] >= (segments[i] - spacermilli):
110
+ break;
111
+
112
+ while (idx < (len(captions))) and ((i == len(segments) - 1) or (captions[idx][1] < segments[i+1])):
113
+ c = captions[idx]
114
+ start = dzList[i][0] + (c[0] -segments[i])
115
+ if start < 0:
116
+ start = 0
117
+ idx += 1
118
+ if not len(conversation):
119
+ conversation.append([dzList[i][2], c[2]])
120
+ elif conversation[-1][0] == dzList[i][2]:
121
+ conversation[-1][1] += c[2]
122
+ else:
123
+ conversation.append([dzList[i][2], c[2]])
124
+ #print(f"[{dzList[i][2]}] {c[2]}")
125
+ return conversation, ("".join([f"{speaker} --> {text}\n" for speaker, text in conversation]))
126
+
127
+ GenerateSpeakerDict(SpeakerNames)
128
+ spacermilli, spacer = preprocess(audio)
129
+ dz_audio, dzList, segments = diarization(audio)
130
+ conversation, t_text = transcribe(dz_audio)
131
+ RemoveAllFiles()
132
+ return (t_text, ({ "data": [{"speaker": speaker, "text": text} for speaker, text in conversation]}))
133
+
134
+ def AudioTranscribe(NumberOfSpeakers=None, SpeakerNames="", audio="", retries=5):
135
+ if retries:
136
+ # subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
137
+ try:
138
+ subprocess.call(['ffmpeg', '-i', audio,'temp_audio.wav'])
139
+ except Exception as ex:
140
+ traceback.print_exc()
141
+ return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
142
+ if not (os.path.isfile("temp_audio.wav")):
143
+ return AudioTranscribe(NumberOfSpeakers, SpeakerNames, audio, retries-1)
144
+ return Transcribe(NumberOfSpeakers, SpeakerNames)
145
+ else:
146
+ raise gr.Error("There is some issue ith Audio Transcriber. Please try again later!")
147
+
148
+ def VideoTranscribe(NumberOfSpeakers=None, SpeakerNames="", video="", retries=5):
149
+ if retries:
150
+ try:
151
+ clip = mp.VideoFileClip(video)
152
+ clip.audio.write_audiofile("temp_audio.wav")
153
+ # command = f"ffmpeg -i {video} -ab 160k -ac 2 -ar 44100 -vn temp_audio.wav"
154
+ # subprocess.call(command, shell=True)
155
+ except Exception as ex:
156
+ traceback.print_exc()
157
+ return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
158
+ if not (os.path.isfile("temp_audio.wav")):
159
+ return VideoTranscribe(NumberOfSpeakers, SpeakerNames, video, retries-1)
160
+ return Transcribe(NumberOfSpeakers, SpeakerNames)
161
+ else:
162
+ raise gr.Error("There is some issue ith Video Transcriber. Please try again later!")
163
+ return Transcribe(NumberOfSpeakers, SpeakerNames)
164
+
165
+ def YoutubeTranscribe(NumberOfSpeakers=None, SpeakerNames="", URL="", retries = 5):
166
+ if retries:
167
+ if "youtu" not in URL.lower():
168
+ raise gr.Error(f"{URL} is not a valid youtube URL.")
169
+ else:
170
+ RemoveFile("temp_audio.wav")
171
+ ydl_opts = {
172
+ 'format': 'bestaudio/best',
173
+ 'outtmpl': 'temp_audio.%(ext)s',
174
+ 'postprocessors': [{
175
+ 'key': 'FFmpegExtractAudio',
176
+ 'preferredcodec': 'wav',
177
+ }],
178
+ }
179
+ try:
180
+ with youtube_dl.YoutubeDL(ydl_opts) as ydl:
181
+ ydl.download([URL])
182
+ except:
183
+ return YoutubeTranscribe(NumberOfSpeakers, SpeakerNames, URL, retries-1)
184
+ stream = ffmpeg.input('temp_audio.m4a')
185
+ stream = ffmpeg.output(stream, 'temp_audio.wav')
186
+ RemoveFile("temp_audio.m4a")
187
+ return Transcribe(NumberOfSpeakers, SpeakerNames)
188
+ else:
189
+ raise gr.Error(f"Unable to get video from {URL}")
190
+
191
# Gradio UI wiring: one Interface per input source, combined into tabs.
# NOTE(review): each Interface builds its own component instances — gradio
# components must not be shared between interfaces, so the duplicated
# Number/Textbox definitions below are intentional.

# YouTube-URL tab: speaker count, ordered speaker names, and a video URL.
ut = gr.Interface(
    fn=YoutubeTranscribe,
    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), gr.Textbox(label="Youtube Link", placeholder="https://www.youtube.com/watch?v=GECcjrYHH8w"),],
    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
)
# Video-upload tab: same speaker inputs plus a video file.
vt = gr.Interface(
    fn=VideoTranscribe,
    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'video'],
    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
)
# Audio-upload tab: same speaker inputs plus an audio file.
at = gr.Interface(
    fn=AudioTranscribe,
    inputs=[gr.Number(label="Number of Speakers", placeholder="2"), gr.Textbox(label="Name of the Speakers (ordered by the time they speak and separated by comma)", placeholder="If Speaker 1 is first to speak followed by Speaker 2 then -> Speaker 1, Speaker 2"), 'audio'],
    outputs=[gr.Textbox(label="Transcribed Text", lines=15), gr.JSON(label="Transcribed JSON")]
)

# Expose the three interfaces as tabs and start the app (blocks until exit).
demo = gr.TabbedInterface([ut, vt, at], ["Youtube URL", "Video", "Audio"])
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pydub
2
+ pyannote.audio
3
+ git+https://github.com/openai/whisper.git
4
+ youtube-dl
5
+ ffmpeg-python
6
+ gradio
7
+ moviepy