SoulAbi committed
Commit e9be38f
Parent: caaf1ec

update app.py

Files changed (1)
  1. app.py +78 -103
app.py CHANGED
@@ -1,22 +1,5 @@
- from google.colab import files
- uploaded = files.upload()
- path = next(iter(uploaded))
-
- num_speakers = 2 #@param {type:"integer"}
-
- language = 'English' #@param ['any', 'English']
-
- model_size = 'large' #@param ['tiny', 'base', 'small', 'medium', 'large']
-
-
- model_name = model_size
- if language == 'English' and model_size != 'tiny':
-     model_name += '.en'
-
- !pip install -q git+https://github.com/openai/whisper.git > /dev/null
- !pip install -q git+https://github.com/pyannote/pyannote-audio > /dev/null
-
  import whisper
+ import gradio as gr
  import datetime

  import subprocess
@@ -24,9 +7,6 @@ import subprocess
  import torch
  import pyannote.audio
  from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
- embedding_model = PretrainedSpeakerEmbedding(
-     "speechbrain/spkrec-ecapa-voxceleb",
-     device=torch.device("cuda"))

  from pyannote.audio import Audio
  from pyannote.core import Segment
@@ -37,98 +17,93 @@ import contextlib
  from sklearn.cluster import AgglomerativeClustering
  import numpy as np

-
- if path[-3:] != 'wav':
-     subprocess.call(['ffmpeg', '-i', path, 'audio.wav', '-y'])
-     path = 'audio.wav'
-
- model = whisper.load_model(model_size)
-
- result = model.transcribe(path)
- segments = result["segments"]
-
- with contextlib.closing(wave.open(path,'r')) as f:
-     frames = f.getnframes()
-     rate = f.getframerate()
-     duration = frames / float(rate)
+ model = whisper.load_model("large-v2")
+ embedding_model = PretrainedSpeakerEmbedding(
+     "speechbrain/spkrec-ecapa-voxceleb",
+     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ )
+
+ def transcribe(audio, num_speakers):
+     path, error = convert_to_wav(audio)
+     if error is not None:
+         return error
+
+     duration = get_duration(path)
+     if duration > 4 * 60 * 60:
+         return "Audio duration too long"
+
+     result = model.transcribe(path)
+     segments = result["segments"]
+
+     num_speakers = min(max(round(num_speakers), 1), len(segments))
+     if len(segments) == 1:
+         segments[0]['speaker'] = 'SPEAKER 1'
+     else:
+         embeddings = make_embeddings(path, segments, duration)
+         add_speaker_labels(segments, embeddings, num_speakers)
+     output = get_output(segments)
+     return output
+
+ def convert_to_wav(path):
+     if path[-3:] != 'wav':
+         new_path = '.'.join(path.split('.')[:-1]) + '.wav'
+         try:
+             subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
+         except:
+             return path, 'Error: Could not convert file to .wav'
+         path = new_path
+     return path, None
+
+ def get_duration(path):
+     with contextlib.closing(wave.open(path,'r')) as f:
+         frames = f.getnframes()
+         rate = f.getframerate()
+     return frames / float(rate)
+
+ def make_embeddings(path, segments, duration):
+     embeddings = np.zeros(shape=(len(segments), 192))
+     for i, segment in enumerate(segments):
+         embeddings[i] = segment_embedding(path, segment, duration)
+     return np.nan_to_num(embeddings)

  audio = Audio()

- def segment_embedding(segment):
+ def segment_embedding(path, segment, duration):
      start = segment["start"]
-
+     # Whisper overshoots the end timestamp in the last segment
      end = min(duration, segment["end"])
      clip = Segment(start, end)
      waveform, sample_rate = audio.crop(path, clip)
      return embedding_model(waveform[None])

- embeddings = np.zeros(shape=(len(segments), 192))
- for i, segment in enumerate(segments):
-     embeddings[i] = segment_embedding(segment)
-
- embeddings = np.nan_to_num(embeddings)
-
- clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
- labels = clustering.labels_
- for i in range(len(segments)):
-     segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
- # speaker = 'Held'
- # speaker = 'Heldisha'
- # if segments[i]["speaker"]== 'SPEAKER 1':
- #     segments[i]["speaker"] = 'Held'
- # elif segments[i]["speaker"]== 'SPEAKER 2':
- #     segments[i]["speaker"] = 'Heldisha'
- # if segments[i]["speaker"]== 'SPEAKER 1':
- #     segments[i]["speaker"] = segments.index('n')
- # k = list(segments)
- # print(k[5])
+ def add_speaker_labels(segments, embeddings, num_speakers):
+     clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+     labels = clustering.labels_
+     for i in range(len(segments)):
+         segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)

  def time(secs):
      return datetime.timedelta(seconds=round(secs))

- f = open("transcript.txt", "w")
-
- for (i, segment) in enumerate(segments):
-     if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
-         f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
-     f.write(segment["text"][1:] + ' ')
- f.close()
-
- # with open('transcript.txt', 'r') as file:
- #     text = file.read()
- #     words = text.split()
- #     i = words.index('name')
- #     if (words[i-1] == 'My') or (words[i-1] == 'my') and (words[i+1] == 'is'):
- #         name1 = words[i+2]
- #         print(name1)
-
- # with open('transcript.txt', 'r') as file:
- #     text = file.read()
- # new_text = text.replace('SPEAKER 1', name1)
- # with open('transcript.txt', 'w') as file:
- #     file.write(new_text)
-
-
-
- # with open('transcript.txt', 'r') as file:
-
- #     text = file.read()
- #     words = text.split()
- #     i = words.index('name')
- #     if (words[i+3] == 'What') or (1<2) and (words[i+1] == 'is') or 1<2:
- #         name2 = words[i+22]
- #         print(name2)
- # with open('transcript.txt', 'r') as file:
- #     text = file.read()
- # new_text = text.replace('SPEAKER 2', name2)
- # with open('transcript.txt', 'w') as file:
- #     file.write(new_text)
-
-
-
-
-
-
-
-
-
+ def get_output(segments):
+     output = ''
+     for (i, segment) in enumerate(segments):
+         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+             if i != 0:
+                 output += '\n\n'
+             output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
+         output += segment["text"][1:] + ' '
+     return output
+
+ gr.Interface(
+     title = 'Whisper with Speaker Recognition',
+     fn=transcribe,
+     inputs=[
+         gr.inputs.Audio(source="upload", type="filepath"),
+         gr.inputs.Number(default=2, label="Number of Speakers")
+     ],
+     outputs=[
+         gr.outputs.Textbox(label='Transcript')
+     ]
+ ).launch()
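
For local testing, the refactored pipeline can be exercised without the web UI by calling `transcribe` directly. The sketch below is a minimal, hypothetical example and is not part of the committed code: it assumes the trailing `gr.Interface(...).launch()` call is commented out or wrapped in an `if __name__ == '__main__':` guard so that importing `app.py` does not immediately start the Gradio server, and `meeting.mp3` stands in for any local recording that ffmpeg can decode.

# smoke_test.py -- hedged sketch, assumes app.py's launch() call is guarded
# so the import below does not block on the Gradio server. Importing app
# loads the Whisper "large-v2" model and the speaker-embedding model, so the
# same dependencies (and ideally a GPU) are required.
from app import transcribe

# "meeting.mp3" is a hypothetical local file; transcribe() converts non-wav
# input to .wav via convert_to_wav() before running Whisper and clustering.
print(transcribe("meeting.mp3", num_speakers=2))

Note that `gr.inputs` and `gr.outputs` are the legacy Gradio namespaces; on Gradio 3.x and later the equivalent components are the top-level `gr.Audio(type="filepath")`, `gr.Number(value=2, ...)`, and `gr.Textbox(...)`, so the interface definition would need a small update if the Space is pinned to a newer Gradio release.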