Spaces:
Runtime error
Runtime error
ktangri
committed on
Commit
•
c150302
1
Parent(s):
eb6ba59
Adding speaker segmentation
Browse files- app.py +24 -5
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline, Wav2Vec2ProcessorWithLM
|
|
|
3 |
from librosa import load, resample
|
4 |
from rpunct import RestorePuncts
|
5 |
|
6 |
asr_model = 'patrickvonplaten/wav2vec2-base-100h-with-lm'
|
7 |
processor = Wav2Vec2ProcessorWithLM.from_pretrained(asr_model)
|
8 |
asr = pipeline('automatic-speech-recognition', model=asr_model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, decoder=processor.decoder)
|
|
|
9 |
|
10 |
rpunct = RestorePuncts()
|
11 |
|
@@ -13,19 +15,36 @@ def transcribe(filepath):
|
|
13 |
speech, sampling_rate = load(filepath)
|
14 |
if sampling_rate != 16000:
|
15 |
speech = resample(speech, sampling_rate, 16000)
|
16 |
-
|
17 |
-
text =
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
mic = gr.inputs.Audio(source='microphone', type='filepath', label='Speech input', optional=False)
|
21 |
|
22 |
-
|
|
|
23 |
|
24 |
iface = gr.Interface(
|
25 |
theme='huggingface',
|
26 |
description='Testing transcription',
|
27 |
fn=transcribe,
|
28 |
inputs=[mic],
|
29 |
-
outputs=[
|
30 |
)
|
31 |
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
from transformers import pipeline, Wav2Vec2ProcessorWithLM
|
3 |
+
from pyannote.audio import Pipeline
|
4 |
from librosa import load, resample
|
5 |
from rpunct import RestorePuncts
|
6 |
|
7 |
# --- Model initialisation -------------------------------------------------
# Speech-to-text: Wav2Vec2 CTC model decoded with an n-gram language model.
asr_model = 'patrickvonplaten/wav2vec2-base-100h-with-lm'
processor = Wav2Vec2ProcessorWithLM.from_pretrained(asr_model)
asr = pipeline(
    'automatic-speech-recognition',
    model=asr_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,
)

# Pretrained speaker-segmentation pipeline, used to attribute words to speakers.
speaker_segmentation = Pipeline.from_pretrained("pyannote/speaker-segmentation")

# Punctuation/casing restoration applied to each diarized segment.
rpunct = RestorePuncts()
|
13 |
|
|
|
15 |
def transcribe(filepath):
    """Transcribe an audio file and attribute the transcript to speakers.

    Parameters
    ----------
    filepath : str
        Path to the recorded audio file (Gradio microphone input).

    Returns
    -------
    tuple[str, str]
        (diarized transcript — one punctuated line per speaker turn with
        start/end timestamps — and the full lower-cased transcript).
    """
    speech, sampling_rate = load(filepath)
    if sampling_rate != 16000:
        # The Wav2Vec2 model expects 16 kHz audio.
        speech = resample(speech, sampling_rate, 16000)

    # NOTE(review): pyannote pipelines typically expect a file path or a
    # {"waveform", "sample_rate"} dict, not a bare numpy array — confirm this
    # call against the installed pyannote-audio (develop) API.
    speaker_output = speaker_segmentation(speech)
    text = asr(speech, return_timestamps="word")

    full_text = text['text'].lower()
    chunks = text['chunks']

    # BUG FIX: the accumulator was initialised as `diarizaed_output` but
    # appended to as `diarized_output` and returned as `diarizaed_output`,
    # raising NameError at runtime. Use one consistent name.
    diarized_output = ""
    i = 0
    for turn, _, speaker in speaker_output.itertracks(yield_label=True):
        # Gather every word whose end timestamp falls within this turn.
        diarized = ""
        while i < len(chunks) and chunks[i]['timestamp'][1] <= turn.end:
            diarized += chunks[i]['text'].lower() + ' '
            i += 1

        if diarized != "":
            # Restore punctuation/casing on the raw lower-cased segment;
            # turns with no words are skipped entirely.
            diarized = rpunct.punctuate(diarized)
            diarized_output += "{}: ''{}'' from {:.3f}-{:.3f}\n".format(speaker, diarized, turn.start, turn.end)

    return diarized_output, full_text
|
37 |
|
38 |
# --- Gradio UI ------------------------------------------------------------
# The microphone recording is written to disk and passed to `transcribe`
# as a file path.
mic = gr.inputs.Audio(source='microphone', type='filepath', label='Speech input', optional=False)

# Two text outputs: speaker-attributed transcript and the plain transcript.
diarized_transcript = gr.outputs.Textbox(type='auto', label='Diarized Output')
full_transcript = gr.outputs.Textbox(type='auto', label='Full Transcript')

iface = gr.Interface(
    fn=transcribe,
    inputs=[mic],
    outputs=[diarized_transcript, full_transcript],
    theme='huggingface',
    description='Testing transcription',
)
iface.launch()
|
requirements.txt
CHANGED
@@ -4,3 +4,4 @@ librosa
|
|
4 |
pyctcdecode
|
5 |
pypi-kenlm
|
6 |
git+https://github.com/anuragshas/rpunct.git
|
|
|
|
4 |
pyctcdecode
|
5 |
pypi-kenlm
|
6 |
git+https://github.com/anuragshas/rpunct.git
|
7 |
+
https://github.com/pyannote/pyannote-audio/archive/develop.zip
|