import gradio as gr
from pyannote.audio import Pipeline
from transformers import pipeline
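
# Speech recognition with speaker segmentation: Wav2Vec2 transcribes the
# uploaded audio and pyannote's segmentation pipeline attributes the
# transcribed words to speaker turns.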

asr = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-large-960h-lv60-self",
    feature_extractor="facebook/wav2vec2-large-960h-lv60-self",
)
speaker_segmentation = Pipeline.from_pretrained("pyannote/speaker-segmentation")

def segmentation(audio):
    # Run speaker segmentation and word-level ASR on the same audio file
    speaker_output = speaker_segmentation(audio)
    text_output = asr(audio, return_timestamps="word")

    full_text = text_output['text'].lower()
    chunks = text_output['chunks']

    diarized_output = ""
    i = 0
    # Walk the speaker turns in order and attach every ASR word chunk whose
    # end timestamp falls before the end of the current turn
    for turn, _, speaker in speaker_output.itertracks(yield_label=True):
        diarized = ""
        while i < len(chunks) and chunks[i]['timestamp'][1] <= turn.end:
            diarized += chunks[i]['text'].lower() + ' '
            i += 1

        if diarized != "":
            diarized_output += "{}: ''{}'' from {:.3f}-{:.3f}\n".format(speaker, diarized, turn.start, turn.end)

    return diarized_output, full_text

title = "Speech Recognition with Speaker Segmentation"
description = "Speaker diarization is the task of attributing each part of an audio recording to the individual speaker who produced it. This Space separates the speakers in a given input audio file with speaker segmentation and transcribes their speech with speech-to-text. The pre-trained models used are Pyannote[1] for the speaker segmentation and Wav2Vec2[2] for the automatic speech recognition."
article = "<p style='text-align: center'><a href='https://github.com/pyannote/pyannote-audio' target='_blank'>[1] Pyannote - Speaker Segmentation model (GitHub repo)</a></p>"
article += "<p style='text-align: center'><a href='https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#wav2vec-20' target='_blank'>[2] Facebook Wav2Vec2 (GitHub repo)</a></p>"
article += "<p style='text-align: center'>Audio File Sources: <a href='https://www.youtube.com/watch?v=DYu_bGbZiiQ&t=132s' target='_blank'>1</a> <a href='https://www.youtube.com/watch?v=DDjWTWHHkpk&t=29s' target='_blank'>2</a> <a href='https://www.youtube.com/watch?v=G2xWg2ckKHI&t=24s' target='_blank'>3</a> <a href='https://www.youtube.com/watch?v=sCcv9uqSBU0&t=32s' target='_blank'>4</a> <a href='https://www.youtube.com/watch?v=K1hlp0DCE_8&t=71s' target='_blank'>5</a></p>"

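# Gradio interface: a single uploaded audio file in, the diarized transcript
# and the plain ASR transcript out for comparison.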
inputs = gr.inputs.Audio(source="upload", type="filepath", label="Upload your audio file here:")
outputs = [gr.outputs.Textbox(type="auto", label="Diarized Output"),
           gr.outputs.Textbox(type="auto", label="Full ASR Text for comparison")]
examples = [["meeting_audio.wav"],
            ["noisy_london_interview.wav"],
            ["clean_london_interview.wav"],
            ["podcast_audio.wav"],
            ["air_traffic_control_audio.wav"]]

app = gr.Interface(fn=segmentation,
                   inputs=inputs,
                   outputs=outputs,
                   examples=examples,
                   title=title,
                   description=description,
                   article=article,
                   allow_flagging=False)
app.launch()