Cahlil committed on
Commit
9e20b12
1 Parent(s): b18fd6d

update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -8
app.py CHANGED
@@ -1,17 +1,38 @@
1
  import gradio as gr
2
  from pyannote.audio import Pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  def diarization(audio):
5
  pipeline = Pipeline.from_pretrained("pyannote/speaker-segmentation")
6
- #parameters = {"onset":0.7,"offset":0.3,"min_duration_on":0.0,"min_duration_off":0.0}
7
- #pipeline.instantiate(parameters)
8
  output = pipeline(audio)
9
- answer = ""
10
  for turn, _, speaker in output.itertracks(yield_label=True):
11
- answer += "{} said something starting from {:.2f} and ends on {:.2f}\n".format(speaker,turn.start,turn.end)
12
- return answer
 
 
 
 
 
13
 
14
  app = gr.Interface(fn=diarization,
15
- inputs=gr.inputs.Audio(source="upload", type="filepath", label="audio"),
16
- outputs="text")
17
- app.launch()
 
 
 
 
 
 
1
  import gradio as gr
2
  from pyannote.audio import Pipeline
3
+ from datasets import load_dataset
4
+ from transformers import pipeline
5
# Dummy LibriSpeech validation split, loaded once at import time.
# NOTE(review): the only consumer is speech_to_text, which indexes
# librispeech_en[0][audio] with a file path — that lookup looks wrong;
# verify against the Gradio "filepath" input it receives.
librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

# Speech-recognition (English->German speech translation) pipeline, built
# once at module level so every request reuses the same loaded model.
asr = pipeline(
    "automatic-speech-recognition",
    model="facebook/s2t-wav2vec2-large-en-de",
    feature_extractor="facebook/s2t-wav2vec2-large-en-de",
)
+
def speech_to_text(audio):
    """Run the pre-loaded ASR pipeline on an audio file.

    Args:
        audio: path to an audio file (Gradio's ``type="filepath"`` input).

    Returns:
        The recognised text (``str``) when the pipeline returns the usual
        ``{"text": ...}`` payload, otherwise the raw pipeline output.
    """
    # Bug fix: the original called asr(librispeech_en[0][audio]), indexing a
    # dataset row with the uploaded file's *path* — a KeyError for any real
    # upload. The HF ASR pipeline accepts a file path directly.
    output = asr(audio)
    # Single-input ASR pipelines return a dict with a "text" field; unwrap it
    # so callers interpolate text rather than a dict repr.
    if isinstance(output, dict) and "text" in output:
        return output["text"]
    return output
16
 
def diarization(audio):
    """Diarize an audio file and pair each speaker turn with a transcription.

    Args:
        audio: path to an audio file (Gradio's ``type="filepath"`` input).

    Returns:
        str: one line per speaker turn ("<speaker> said '<text>' from a to b"),
        or "No output" when the diarizer finds no speech turns.
    """
    pipeline = Pipeline.from_pretrained("pyannote/speaker-segmentation")
    output = pipeline(audio)

    result = ""
    # The transcription covers the *whole* file and does not depend on the
    # current turn, so compute it once (lazily, to preserve the original
    # behavior of not running ASR when there are no turns) instead of
    # re-running the full ASR model on every iteration.
    # NOTE(review): transcribing each turn's segment would be more accurate,
    # but that would change behavior; whole-file transcription kept as-is.
    text_result = None
    for turn, _, speaker in output.itertracks(yield_label=True):
        if text_result is None:
            text_result = speech_to_text(audio)
        result += "{} said '{}' from {:.3f} to {:.3f}\n".format(
            speaker, text_result, turn.start, turn.end)
    return "No output" if result == "" else result
# --- UI copy shown on the Gradio page -------------------------------------
title = "Speech Recognition with Speaker Diarization"
# Fixed the original description, which ended with the dangling citation
# "...from Pyannote[1] for the Speaker Diarization and [2]."
description = (
    "Speaker Diarization is the act of attributing parts of the audio "
    "recording to different speakers. This space aims to distinguish the "
    "speakers and apply speech-to-text from a given input audio file. It uses "
    "a pre-trained model from Pyannote [1] for the speaker diarization and a "
    "Hugging Face Transformers pipeline [2] for the speech recognition."
)
# Reference [2] added to match the completed description above.
article = (
    "<p style='text-align: center'>"
    "<a href='https://github.com/pyannote/pyannote-audio' target='_blank'>"
    "[1] Pyannote - Speaker Diarization model</a><br>"
    "<a href='https://huggingface.co/facebook/s2t-wav2vec2-large-en-de' target='_blank'>"
    "[2] Facebook S2T wav2vec2 - Speech recognition model</a>"
    "</p>"
)

# Build and launch the Gradio app. gr.inputs/gr.outputs is the legacy (pre-3.x)
# Gradio namespace — kept as-is to match the environment this space runs in.
app = gr.Interface(
    fn=diarization,
    inputs=gr.inputs.Audio(source="upload", type="filepath", label="Upload your audio file here:"),
    outputs=gr.outputs.Textbox(type="auto", label="OUTPUT"),
    examples=[["test_audio1.wav"]],
    title=title,
    description=description,
    article=article,
    allow_flagging=False,
)
app.launch(enable_queue=True)