peb-peb committed
Commit 218b27d
1 Parent(s): a65e425

add UI interface

Files changed (1)
  1. app.py +138 -90
app.py CHANGED
@@ -1,4 +1,4 @@
-import whisper
+# import whisper
 import gradio as gr
 import datetime
 
@@ -6,96 +6,144 @@ import subprocess
 import wave
 import contextlib
 
-import torch
-import pyannote.audio
-from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
-from pyannote.audio import Audio
-from pyannote.core import Segment
-from sklearn.cluster import AgglomerativeClustering
-import numpy as np
-
-model = whisper.load_model("large-v2")
-embedding_model = PretrainedSpeakerEmbedding(
-    "speechbrain/spkrec-ecapa-voxceleb",
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-)
-
-def transcribe(audio, num_speakers):
-    path, error = convert_to_wav(audio)
-    if error is not None:
-        return error
-
-    duration = get_duration(path)
-    if duration > 4 * 60 * 60:
-        return "Audio duration too long"
-
-    result = model.transcribe(path)
-    segments = result["segments"]
-
-    num_speakers = min(max(round(num_speakers), 1), len(segments))
-    if len(segments) == 1:
-        segments[0]['speaker'] = 'SPEAKER 1'
-    else:
-        embeddings = make_embeddings(path, segments, duration)
-        add_speaker_labels(segments, embeddings, num_speakers)
-    output = get_output(segments)
-    return output
-
-def convert_to_wav(path):
-    if path[-3:] != 'wav':
-        new_path = '.'.join(path.split('.')[:-1]) + '.wav'
-        try:
-            subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
-        except:
-            return path, 'Error: Could not convert file to .wav'
-        path = new_path
-    return path, None
-
-def get_duration(path):
-    with contextlib.closing(wave.open(path,'r')) as f:
-        frames = f.getnframes()
-        rate = f.getframerate()
-        return frames / float(rate)
-
-def make_embeddings(path, segments, duration):
-    embeddings = np.zeros(shape=(len(segments), 192))
-    for i, segment in enumerate(segments):
-        embeddings[i] = segment_embedding(path, segment, duration)
-    return np.nan_to_num(embeddings)
-
-audio = Audio()
-
-def segment_embedding(path, segment, duration):
-    start = segment["start"]
-    # Whisper overshoots the end timestamp in the last segment
-    end = min(duration, segment["end"])
-    clip = Segment(start, end)
-    waveform, sample_rate = audio.crop(path, clip)
-    return embedding_model(waveform[None])
-
-def add_speaker_labels(segments, embeddings, num_speakers):
-    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
-    labels = clustering.labels_
-    for i in range(len(segments)):
-        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
-
-def time(secs):
-    return datetime.timedelta(seconds=round(secs))
-
-def get_output(segments):
-    output = ''
-    for (i, segment) in enumerate(segments):
-        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
-            if i != 0:
-                output += '\n\n'
-            output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
-        output += segment["text"][1:] + ' '
-    return output
-
+# import torch
+# import pyannote.audio
+# from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+# from pyannote.audio import Audio
+# from pyannote.core import Segment
+# from sklearn.cluster import AgglomerativeClustering
+# import numpy as np
+
+# model = whisper.load_model("large-v2")
+# embedding_model = PretrainedSpeakerEmbedding(
+#     "speechbrain/spkrec-ecapa-voxceleb",
+#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# )
+
+# def transcribe(audio, num_speakers):
+#     path, error = convert_to_wav(audio)
+#     if error is not None:
+#         return error
+
+#     duration = get_duration(path)
+#     if duration > 4 * 60 * 60:
+#         return "Audio duration too long"
+
+#     result = model.transcribe(path)
+#     segments = result["segments"]
+
+#     num_speakers = min(max(round(num_speakers), 1), len(segments))
+#     if len(segments) == 1:
+#         segments[0]['speaker'] = 'SPEAKER 1'
+#     else:
+#         embeddings = make_embeddings(path, segments, duration)
+#         add_speaker_labels(segments, embeddings, num_speakers)
+#     output = get_output(segments)
+#     return output
+
+# def convert_to_wav(path):
+#     if path[-3:] != 'wav':
+#         new_path = '.'.join(path.split('.')[:-1]) + '.wav'
+#         try:
+#             subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
+#         except:
+#             return path, 'Error: Could not convert file to .wav'
+#         path = new_path
+#     return path, None
+
+# def get_duration(path):
+#     with contextlib.closing(wave.open(path,'r')) as f:
+#         frames = f.getnframes()
+#         rate = f.getframerate()
+#         return frames / float(rate)
+
+# def make_embeddings(path, segments, duration):
+#     embeddings = np.zeros(shape=(len(segments), 192))
+#     for i, segment in enumerate(segments):
+#         embeddings[i] = segment_embedding(path, segment, duration)
+#     return np.nan_to_num(embeddings)
+
+# audio = Audio()
+
+# def segment_embedding(path, segment, duration):
+#     start = segment["start"]
+#     # Whisper overshoots the end timestamp in the last segment
+#     end = min(duration, segment["end"])
+#     clip = Segment(start, end)
+#     waveform, sample_rate = audio.crop(path, clip)
+#     return embedding_model(waveform[None])
+
+# def add_speaker_labels(segments, embeddings, num_speakers):
+#     clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+#     labels = clustering.labels_
+#     for i in range(len(segments)):
+#         segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+# def time(secs):
+#     return datetime.timedelta(seconds=round(secs))
+
+# def get_output(segments):
+#     output = ''
+#     for (i, segment) in enumerate(segments):
+#         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+#             if i != 0:
+#                 output += '\n\n'
+#             output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
+#         output += segment["text"][1:] + ' '
+#     return output
+
+s = ""
+
+def greet1(name):
+    global s
+    s = "modified"
+    return "Hello " + name + "!"
+
+
+def greet2(name):
+    return "Hi " + name + "!" + " " + s
+
+
+def greet3(name):
+    return "Hola " + name + "!"
 
 with gr.Blocks() as demo:
-    with gr.Box():
-        gr.Textbox(label="First")
-        gr.Textbox(label="Last")
+    with gr.Row():
+        with gr.Column():
+            audio_file = gr.UploadButton(label="Upload a Audio file (.wav)")
+            # name = gr.Textbox(label="Name", placeholder="Name") # TODO: remove
+            number_of_speakers = gr.Number(label="Number of Speakers", value=2)
+            with gr.Row():
+                btn_clear = gr.Button(value="Clear")
+                btn_submit = gr.Button(value="Submit")
+        with gr.Column():
+            title = gr.Textbox(label="Title", placeholder="Title for Conversation")
+            short_summary = gr.Textbox(label="Short Summary", placeholder="Short Summary for Conversation")
+            sentiment_analysis = gr.Textbox(label="Sentiment Analysis", placeholder="Sentiment Analysis for Conversation")
+            quality = gr.Textbox(label="Quality of Conversation", placeholder="Quality of Conversation")
+            detailed_summary = gr.Textbox(label="Detailed Summary", placeholder="Detailed Summary for Conversation")
+    gr.Markdown("## Examples")
+    gr.Examples(
+        examples=[
+            [
+                "Harsh",
+                2,
+            ],
+            [
+                "Rahul",
+                2,
+            ],
+        ],
+        inputs=[title],
+        outputs=[short_summary],
+        fn=greet1,
+        cache_examples=True,
+    )
+    gr.Markdown(
+        """
+        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
+        for more details.
+        """
+    )
 
 demo.launch()
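
Note on wiring: this revision creates btn_clear and btn_submit but does not attach click handlers to them, and gr.Examples is bound to the placeholder greet1. Below is a minimal sketch, not part of the commit, of how the buttons might later be connected with Gradio's click events once a real handler (for example, the commented-out transcribe pipeline) is restored; the process stand-in, the trimmed layout, and the file-path handling are assumptions for illustration.

import gradio as gr

# Hypothetical stand-in for the commented-out transcribe(audio, num_speakers) pipeline.
def process(audio_file, num_speakers):
    # UploadButton may hand back a file object or a plain path depending on the
    # Gradio version, so accept either.
    path = getattr(audio_file, "name", audio_file) if audio_file is not None else "no file"
    return f"Would transcribe {path} with {int(num_speakers)} speaker(s)"

def clear():
    # Reset the output field.
    return ""

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio_file = gr.UploadButton(label="Upload a Audio file (.wav)")
            number_of_speakers = gr.Number(label="Number of Speakers", value=2)
            with gr.Row():
                btn_clear = gr.Button(value="Clear")
                btn_submit = gr.Button(value="Submit")
        with gr.Column():
            short_summary = gr.Textbox(label="Short Summary")

    # Event wiring that the commit itself does not include yet.
    btn_submit.click(fn=process, inputs=[audio_file, number_of_speakers], outputs=[short_summary])
    btn_clear.click(fn=clear, inputs=None, outputs=[short_summary])

demo.launch()

The same pattern would allow transcribe(audio, num_speakers) to be passed as fn once it is uncommented, with the remaining output Textboxes (title, sentiment_analysis, quality, detailed_summary) added to outputs as the pipeline grows.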