ktangri committed on
Commit
c150302
1 Parent(s): eb6ba59

Adding speaker segmentation

Files changed (2)
  1. app.py +24 -5
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,11 +1,13 @@
  import gradio as gr
  from transformers import pipeline, Wav2Vec2ProcessorWithLM
+ from pyannote.audio import Pipeline
  from librosa import load, resample
  from rpunct import RestorePuncts

  asr_model = 'patrickvonplaten/wav2vec2-base-100h-with-lm'
  processor = Wav2Vec2ProcessorWithLM.from_pretrained(asr_model)
  asr = pipeline('automatic-speech-recognition', model=asr_model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, decoder=processor.decoder)
+ speaker_segmentation = Pipeline.from_pretrained("pyannote/speaker-segmentation")

  rpunct = RestorePuncts()

@@ -13,19 +15,36 @@ def transcribe(filepath):
      speech, sampling_rate = load(filepath)
      if sampling_rate != 16000:
          speech = resample(speech, sampling_rate, 16000)
-     text = asr(speech)['text']
-     text = rpunct.punctuate(text.lower())
-     return text
+     speaker_output = speaker_segmentation(speech)
+     text = asr(speech, return_timestamps="word")
+
+     full_text = text['text'].lower()
+     chunks = text['chunks']
+
+     diarized_output = ""
+     i = 0
+     for turn, _, speaker in speaker_output.itertracks(yield_label=True):
+         diarized = ""
+         while i < len(chunks) and chunks[i]['timestamp'][1] <= turn.end:
+             diarized += chunks[i]['text'].lower() + ' '
+             i += 1
+
+         if diarized != "":
+             diarized = rpunct.punctuate(diarized)
+             diarized_output += "{}: ''{}'' from {:.3f}-{:.3f}\n".format(speaker, diarized, turn.start, turn.end)
+
+     return diarized_output, full_text

  mic = gr.inputs.Audio(source='microphone', type='filepath', label='Speech input', optional=False)

- transcript = gr.outputs.Textbox(type='auto', label='Transcription')
+ diarized_transcript = gr.outputs.Textbox(type='auto', label='Diarized Output')
+ full_transcript = gr.outputs.Textbox(type='auto', label='Full Transcript')

  iface = gr.Interface(
      theme='huggingface',
      description='Testing transcription',
      fn=transcribe,
      inputs=[mic],
-     outputs=[transcript]
+     outputs=[diarized_transcript, full_transcript]
  )
  iface.launch()
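
For context, a minimal, self-contained sketch of the word-to-turn alignment that the new transcribe loop performs. The word chunks and speaker turns below are made-up values standing in for the wav2vec2 word timestamps and the pyannote segmentation output, and the rpunct punctuation step is skipped:

chunks = [  # hypothetical ASR word chunks, shaped like the return_timestamps="word" output
    {'text': 'HELLO', 'timestamp': (0.0, 0.4)},
    {'text': 'THERE', 'timestamp': (0.5, 0.9)},
    {'text': 'HI', 'timestamp': (1.2, 1.4)},
    {'text': 'BACK', 'timestamp': (1.5, 1.8)},
]
turns = [('SPEAKER_00', 0.0, 1.0), ('SPEAKER_01', 1.0, 2.0)]  # hypothetical speaker turns

diarized_output = ""
i = 0
for speaker, start, end in turns:
    # collect every remaining word whose end time falls inside this speaker turn
    diarized = ""
    while i < len(chunks) and chunks[i]['timestamp'][1] <= end:
        diarized += chunks[i]['text'].lower() + ' '
        i += 1
    if diarized != "":
        diarized_output += "{}: ''{}'' from {:.3f}-{:.3f}\n".format(speaker, diarized, start, end)

print(diarized_output)
# SPEAKER_00: ''hello there '' from 0.000-1.000
# SPEAKER_01: ''hi back '' from 1.000-2.000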
requirements.txt CHANGED
@@ -4,3 +4,4 @@ librosa
  pyctcdecode
  pypi-kenlm
  git+https://github.com/anuragshas/rpunct.git
+ https://github.com/pyannote/pyannote-audio/archive/develop.zip
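
Note: the last entry pulls pyannote-audio straight from the develop branch archive rather than from a PyPI release. Assuming a standard pip workflow, the full dependency set installs with pip install -r requirements.txt.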