vumichien committed on
Commit
84c7470
1 Parent(s): eea75ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -9
app.py CHANGED
@@ -17,6 +17,8 @@ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbed
17
  from pyannote.audio import Audio
18
  from pyannote.core import Segment
19
 
 
 
20
  import wave
21
  import contextlib
22
 
@@ -137,7 +139,7 @@ print("DEVICE IS: ")
137
  print(device)
138
 
139
 
140
- def time(secs):
141
  return datetime.timedelta(seconds=round(secs))
142
 
143
  def get_youtube(video_url):
@@ -161,6 +163,7 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
161
  """
162
 
163
  model = whisper.load_model(whisper_model)
 
164
  if(video_file_path == None):
165
  raise ValueError("Error no video input")
166
  print(video_file_path)
@@ -222,17 +225,29 @@ def speech_to_text(video_file_path, selected_source_lang, whisper_model, num_spe
222
  text = ''
223
  for (i, segment) in enumerate(segments):
224
  if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
225
- objects['Start'].append(str(time(segment["start"])))
226
  objects['Speaker'].append(segment["speaker"])
227
  if i != 0:
228
- objects['End'].append(str(time(segments[i - 1]["end"])))
229
  objects['Text'].append(text)
230
  text = ''
231
  text += segment["text"] + ' '
232
- objects['End'].append(str(time(segments[i - 1]["end"])))
233
  objects['Text'].append(text)
234
 
235
- return pd.DataFrame(objects)
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
  except Exception as e:
238
  raise RuntimeError("Error Running inference with local model", e)
@@ -266,13 +281,13 @@ with demo:
266
  memory = psutil.virtual_memory()
267
 
268
  with gr.Row():
269
- gr.Markdown(f'''
270
  ### This space allows you to:
271
  ##### 1. Download youtube video with a given URL
272
  ##### 2. Watch it in the first video component
273
  ##### 3. Run automatic speech recognition and diarization (speaker identification)
274
- *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*
275
  ''')
 
276
 
277
  with gr.Row():
278
  gr.Markdown('''
@@ -307,7 +322,7 @@ with demo:
307
  selected_whisper_model.render()
308
  number_speakers.render()
309
  transcribe_btn = gr.Button("Transcribe audio and diarization")
310
- transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model, number_speakers], transcription_df)
311
 
312
 
313
  with gr.Row():
@@ -319,4 +334,4 @@ with demo:
319
  with gr.Column():
320
  transcription_df.render()
321
 
322
- demo.launch(debug=True)
 
17
  from pyannote.audio import Audio
18
  from pyannote.core import Segment
19
 
20
+ from gpuinfo import GPUInfo
21
+
22
  import wave
23
  import contextlib
24
 
 
139
  print(device)
140
 
141
 
142
+ def convert_time(secs):
143
  return datetime.timedelta(seconds=round(secs))
144
 
145
  def get_youtube(video_url):
 
163
  """
164
 
165
  model = whisper.load_model(whisper_model)
166
+ time_start = time.time()
167
  if(video_file_path == None):
168
  raise ValueError("Error no video input")
169
  print(video_file_path)
 
225
  text = ''
226
  for (i, segment) in enumerate(segments):
227
  if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
228
+ objects['Start'].append(str(convert_time(segment["start"])))
229
  objects['Speaker'].append(segment["speaker"])
230
  if i != 0:
231
+ objects['End'].append(str(convert_time(segments[i - 1]["end"])))
232
  objects['Text'].append(text)
233
  text = ''
234
  text += segment["text"] + ' '
235
+ objects['End'].append(str(convert_time(segments[i - 1]["end"])))
236
  objects['Text'].append(text)
237
 
238
+ time_end = time.time()
239
+ time_diff = time_end - time_start
240
+ memory = psutil.virtual_memory()
241
+ gpu_utilization, gpu_memory = GPUInfo.gpu_usage()
242
+ gpu_utilization = gpu_utilization[0] if len(gpu_utilization) > 0 else 0
243
+ gpu_memory = gpu_memory[0] if len(gpu_memory) > 0 else 0
244
+ system_info = f"""
245
+ *Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB.*
246
+ *Processing time: {time_diff:.5} seconds.*
247
+ *GPU Utilization: {gpu_utilization}%, GPU Memory: {gpu_memory}MiB.*
248
+ """
249
+
250
+ return pd.DataFrame(objects), system_info
251
 
252
  except Exception as e:
253
  raise RuntimeError("Error Running inference with local model", e)
 
281
  memory = psutil.virtual_memory()
282
 
283
  with gr.Row():
284
+ gr.Markdown('''
285
  ### This space allows you to:
286
  ##### 1. Download youtube video with a given URL
287
  ##### 2. Watch it in the first video component
288
  ##### 3. Run automatic speech recognition and diarization (speaker identification)
 
289
  ''')
290
+ system_info = gr.Markdown(f"*Memory: {memory.total / (1024 * 1024 * 1024):.2f}GB, used: {memory.percent}%, available: {memory.available / (1024 * 1024 * 1024):.2f}GB*")
291
 
292
  with gr.Row():
293
  gr.Markdown('''
 
322
  selected_whisper_model.render()
323
  number_speakers.render()
324
  transcribe_btn = gr.Button("Transcribe audio and diarization")
325
+ transcribe_btn.click(speech_to_text, [video_in, selected_source_lang, selected_whisper_model, number_speakers], [transcription_df, system_info])
326
 
327
 
328
  with gr.Row():
 
334
  with gr.Column():
335
  transcription_df.render()
336
 
337
+ demo.launch(debug=True, share=True)