salmanmapkar committed on
Commit
8106a70
1 Parent(s): c37724a

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -0
app.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # !pip install youtube-dl
2
+ from __future__ import unicode_literals
3
+ import youtube_dl
4
+ from pydub import AudioSegment
5
+ from pyannote.audio import Pipeline
6
+ import re
7
+ import webvtt
8
+ import whisper
9
+ import os
10
+ from pydub.utils import which
11
+ import ffmpeg
12
+ import webvtt
13
+ import pprint
14
+ from urllib.error import HTTPError
15
+ import subprocess
16
+ import gradio as gr
17
# Speaker-diarization pipeline shared by every request.
# The Hugging Face access token is read from the HF_TOKEN environment variable
# (e.g. a Space secret) instead of being committed to the repository; a token
# that has been published in source control must be treated as compromised and
# revoked. When HF_TOKEN is unset, None is passed and authentication is left to
# whatever credentials the library can find on its own.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=os.environ.get("HF_TOKEN"),
)
18
+
19
+
20
def Transcribe(audio="temp_audio.wav"):
    """Diarize and transcribe a WAV file into a speaker-labelled transcript.

    The file at ``audio`` is treated as a scratch file: it is overwritten in
    place with a trimmed, silence-prefixed copy and deleted before returning,
    together with the intermediate diarized audio file.

    Args:
        audio: Path of the WAV file to process.

    Returns:
        A string with one ``"<speaker> --> <text>\\n"`` line per run of
        consecutive captions attributed to the same speaker.
    """
    spacer_ms = 2000                  # silence inserted around every speaker turn
    max_audio_ms = 20 * 60 * 1000     # only the first 20 minutes are processed
    timestamp_re = re.compile(r"[0-9]+:[0-9]+:[0-9]+\.[0-9]+")
    speaker_re = re.compile(r"SPEAKER_[0-9][0-9]")

    # Derive the scratch file name from the stem so that "temp_audio.wav"
    # yields "dz_temp_audio.wav" (not "dz_temp_audio.wav.wav") and the cleanup
    # below removes the file that was actually written.
    folder, filename = os.path.split(audio)
    dz_audio = os.path.join(folder, f"dz_{os.path.splitext(filename)[0]}.wav")

    def millisec(time_str):
        """Convert an ``H:M:S.fff`` timestamp to integer milliseconds."""
        hours, minutes, seconds = time_str.split(":")
        # round() rather than int(): avoids losing a millisecond to float error.
        return round((int(hours) * 3600 + int(minutes) * 60 + float(seconds)) * 1000)

    def preprocess(path):
        """Trim ``path`` to the maximum length, prepend silence, rewrite it in place."""
        clip = AudioSegment.from_wav(path)[0:max_audio_ms]
        spacer = AudioSegment.silent(duration=spacer_ms)
        spacer.append(clip, crossfade=0).export(path, format="wav")
        return spacer

    def diarization(path, spacer):
        """Run the diarization pipeline and write the turns, spacer-separated, to ``dz_audio``.

        Returns ``(turns, offsets)`` where each turn is ``[start_ms, end_ms,
        speaker]`` and ``offsets[k]`` is the position (ms) of turn ``k`` inside
        the exported file.
        """
        source = AudioSegment.from_wav(path)
        annotation = pipeline({'uri': 'blabal', 'audio': path})

        # Parse the textual form of the result directly; no temporary text
        # file is needed. Lines without two timestamps and a speaker label are
        # skipped so turns and offsets always stay aligned.
        turns = []
        for line in str(annotation).splitlines():
            stamps = timestamp_re.findall(line)
            speaker = speaker_re.search(line)
            if len(stamps) != 2 or speaker is None:
                continue
            turns.append([millisec(stamps[0]), millisec(stamps[1]), speaker.group(0)])

        sounds = spacer
        offsets = []
        for start, end, _speaker in turns:
            offsets.append(len(sounds))
            sounds = sounds.append(source[start:end], crossfade=0)
            sounds = sounds.append(spacer, crossfade=0)
        sounds.export(dz_audio, format="wav")
        return turns, offsets

    def transcribe(path, turns, offsets):
        """Transcribe ``path`` with Whisper and attribute each caption to a speaker turn."""
        model = whisper.load_model("base")
        result = model.transcribe(path)
        captions = [
            (caption["start"] * 1000, caption["end"] * 1000, caption["text"])
            for caption in result['segments']
        ]
        conversation = []
        for i, (offset, turn) in enumerate(zip(offsets, turns)):
            speaker = turn[2]
            is_last = i == len(offsets) - 1
            # First caption starting inside this turn's window. When there is
            # none, start past the end so that no caption is attributed
            # (previously the search fell back to the last caption).
            idx = next(
                (k for k, cap in enumerate(captions) if cap[0] >= offset - spacer_ms),
                len(captions),
            )
            while idx < len(captions) and (is_last or captions[idx][1] < offsets[i + 1]):
                text = captions[idx][2]
                idx += 1
                # Merge consecutive captions from the same speaker into one entry.
                if conversation and conversation[-1][0] == speaker:
                    conversation[-1][1] += text
                else:
                    conversation.append([speaker, text])
        return "".join(f"{speaker} --> {text}\n" for speaker, text in conversation)

    try:
        spacer = preprocess(audio)
        turns, offsets = diarization(audio, spacer)
        return transcribe(dz_audio, turns, offsets)
    finally:
        # Best-effort cleanup, also on failure, so a stale scratch file never
        # interferes with the next request.
        for scratch in (audio, dz_audio):
            try:
                os.remove(scratch)
            except OSError:
                pass
106
+
107
def VideoTranscribe(video):
    """Extract the audio track of a video file and return its transcript.

    Args:
        video: Path of the video file to read.

    Returns:
        The speaker-labelled transcript produced by ``Transcribe``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters from being interpreted; "-y" overwrites a stale
    # temp_audio.wav instead of blocking on ffmpeg's interactive prompt.
    subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", video,
            "-ab", "160k",
            "-ac", "2",
            "-ar", "44100",
            "-vn",
            "temp_audio.wav",
        ],
        check=True,
    )
    return Transcribe()
111
+
112
def YoutubeTranscribe(url):
    """Download the audio of a video URL as WAV and return its transcript.

    Args:
        url: Address of the video to download with youtube-dl.

    Returns:
        The speaker-labelled transcript produced by ``Transcribe``.

    Raises:
        youtube_dl.utils.DownloadError, urllib.error.HTTPError: If the
            download still fails after the last retry.
    """
    max_attempts = 3

    # Remove any leftover from a previous run so the new download is not
    # confused with it.
    try:
        os.remove("temp_audio.wav")
    except OSError:
        pass

    # The FFmpegExtractAudio post-processor already converts the download to
    # temp_audio.wav, so no separate conversion step is needed afterwards.
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'temp_audio.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }],
    }

    # Bounded retry: the previous bare-except + recursive call retried forever
    # on a permanently failing URL and also swallowed KeyboardInterrupt.
    for attempt in range(1, max_attempts + 1):
        try:
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            break
        except (youtube_dl.utils.DownloadError, HTTPError):
            if attempt == max_attempts:
                raise

    # Best-effort removal of an intermediate download, if one was left behind.
    try:
        os.remove("temp_audio.m4a")
    except OSError:
        pass
    return Transcribe()
137
+
138
def _transcribe_input(video, url):
    """Dispatch a "Run" click to the video or the URL transcriber.

    Exactly one of the two inputs must be provided; the check runs per request
    (at UI-construction time the component objects are always truthy, so a
    check there can never fire).
    """
    url = (url or "").strip()
    if bool(video) == bool(url):
        raise ValueError("Either input url or video (not both)")
    if video:
        return VideoTranscribe(video)
    return YoutubeTranscribe(url)


with gr.Blocks() as i:
    video = gr.Video()
    # "Input"/"Output" are labels; passed positionally they would become the
    # initial text of the boxes and be submitted as the URL.
    text = gr.Textbox(label="Input")
    output = gr.Textbox(label="Output")
    btn = gr.Button("Run")
    btn.click(fn=_transcribe_input, inputs=[video, text], outputs=output)
i.launch()
147
+ # YoutubeTranscribe('https://www.youtube.com/watch?v=GECcjrYHH8w')