peb-peb committed on
Commit 004f352
1 Parent(s): 5edee8f

add transcribe

Files changed (1)
app.py +105 -3
app.py CHANGED
@@ -1,7 +1,109 @@
+import whisper
 import gradio as gr
+import datetime
 
-def greet(name):
-    return "Hello " + name + "!!"
+import subprocess
+import wave
+import contextlib
+
+import torch
+import pyannote.audio
+from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+from pyannote.audio import Audio
+from pyannote.core import Segment
+from sklearn.cluster import AgglomerativeClustering
+import numpy as np
 
-iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+model = whisper.load_model("large-v2")
+embedding_model = PretrainedSpeakerEmbedding(
+    "speechbrain/spkrec-ecapa-voxceleb",
+    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+)
+
+def transcribe(audio, num_speakers):
+    path, error = convert_to_wav(audio)
+    if error is not None:
+        return error
+
+    duration = get_duration(path)
+    if duration > 4 * 60 * 60:
+        return "Audio duration too long"
+
+    result = model.transcribe(path)
+    segments = result["segments"]
+
+    # Clamp the requested speaker count to [1, number of segments].
+    num_speakers = min(max(round(num_speakers), 1), len(segments))
+    if len(segments) == 1:
+        segments[0]['speaker'] = 'SPEAKER 1'
+    else:
+        embeddings = make_embeddings(path, segments, duration)
+        add_speaker_labels(segments, embeddings, num_speakers)
+    output = get_output(segments)
+    return output
+
+def convert_to_wav(path):
+    if path[-3:] != 'wav':
+        new_path = '.'.join(path.split('.')[:-1]) + '.wav'
+        try:
+            subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
+        except Exception:
+            return path, 'Error: Could not convert file to .wav'
+        path = new_path
+    return path, None
+
+def get_duration(path):
+    with contextlib.closing(wave.open(path, 'r')) as f:
+        frames = f.getnframes()
+        rate = f.getframerate()
+        return frames / float(rate)
+
+def make_embeddings(path, segments, duration):
+    embeddings = np.zeros(shape=(len(segments), 192))
+    for i, segment in enumerate(segments):
+        embeddings[i] = segment_embedding(path, segment, duration)
+    return np.nan_to_num(embeddings)
+
+audio = Audio()
+
+def segment_embedding(path, segment, duration):
+    start = segment["start"]
+    # Whisper overshoots the end timestamp in the last segment
+    end = min(duration, segment["end"])
+    clip = Segment(start, end)
+    waveform, sample_rate = audio.crop(path, clip)
+    return embedding_model(waveform[None])
+
+def add_speaker_labels(segments, embeddings, num_speakers):
+    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+    labels = clustering.labels_
+    for i in range(len(segments)):
+        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+def time(secs):
+    return datetime.timedelta(seconds=round(secs))
+
+def get_output(segments):
+    output = ''
+    for (i, segment) in enumerate(segments):
+        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+            if i != 0:
+                output += '\n\n'
+            output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
+        # Whisper segment text begins with a leading space; drop it.
+        output += segment["text"][1:] + ' '
+    return output
+
+
+iface = gr.Interface(
+    title='Whisper with Speaker Recognition',
+    fn=transcribe,
+    inputs=[
+        gr.inputs.Audio(source="upload", type="filepath"),
+        gr.inputs.Number(default=2, label="Number of Speakers")
+    ],
+    outputs=[
+        gr.outputs.Textbox(label='Transcript')
+    ]
+)
+
 iface.launch()
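
The speaker-labelling step added here is plain agglomerative clustering over the 192-dimensional ECAPA speaker embeddings, one per Whisper segment. A self-contained sketch of that step on made-up data (the embedding values and cluster count below are invented purely for illustration, not taken from the commit):

import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Fake 192-dim "embeddings": three segments near one point in embedding
# space, two near another, standing in for two distinct speakers.
rng = np.random.default_rng(0)
fake_embeddings = np.vstack([
    rng.normal(0.0, 0.1, size=(3, 192)),  # pretend: segments from speaker 1
    rng.normal(1.0, 0.1, size=(2, 192)),  # pretend: segments from speaker 2
])
labels = AgglomerativeClustering(n_clusters=2).fit(fake_embeddings).labels_
print(labels)  # segments grouped by speaker, e.g. [0 0 0 1 1]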
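
For a quick local check of the new pipeline without the web UI, one could run transcribe() directly in place of iface.launch(). This is an editor's sketch, not part of the commit: "sample.wav" is a made-up filename, and it assumes ffmpeg is on the PATH and that the Whisper and pyannote/speechbrain weights can be downloaded on first run.

# Hypothetical smoke test (NOT in the commit); run from the same
# directory as app.py, with iface.launch() commented out.
if __name__ == "__main__":
    # "sample.wav": a short two-speaker recording you supply yourself.
    print(transcribe("sample.wav", num_speakers=2))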