import base64
import io
import os
from typing import Text

import gradio as gr
import numpy as np
import scipy.io.wavfile
import torch
import yt_dlp as youtube_dl
from gradio_client import Client
from pyannote.audio import Audio, Pipeline
from pyannote.core import Segment
from transformers.pipelines.audio_utils import ffmpeg_read

HF_TOKEN = os.environ.get("HF_TOKEN")
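# Note: pyannote/speaker-diarization-3.0 is a gated model, so the token must
# belong to an account that has accepted its terms of use (see the demo
# description below).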

# Set up the diarization pipeline and move it to the GPU when one is available
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.0", use_auth_token=HF_TOKEN
)
if torch.cuda.is_available():
    diarization_pipeline.to(torch.device("cuda"))


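# The pipeline returns a pyannote.core.Annotation. Its speaker turns can be
# read with itertracks, e.g. (illustrative only, not executed here; the file
# name is hypothetical):
#
#     diarization = diarization_pipeline("meeting.wav")
#     for turn, _, speaker in diarization.itertracks(yield_label=True):
#         print(f"{speaker}: {turn.start:.1f}s -> {turn.end:.1f}s")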


def transcribe(audio_path):
    # For now, only the local diarization pipeline is run; the Whisper JAX
    # transcription is not wired in yet (see the sketch below).
    diarization = diarization_pipeline(audio_path)
    # The Annotation's string form lists the diarized speaker turns, which the
    # text output component displays.
    return str(diarization)


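# A minimal sketch (not wired into the Interface below) of how the Whisper JAX
# endpoint could be called asynchronously with gradio_client, so that local
# diarization runs in parallel with the remote transcription. The Space name,
# api_name and argument order are assumptions, not taken from this file.
def transcribe_with_endpoint(audio_path, whisper_space="sanchit-gandhi/whisper-jax"):
    client = Client(whisper_space)
    job = client.submit(audio_path, "transcribe", api_name="/predict")  # returns immediately
    diarization = diarization_pipeline(audio_path)  # diarize while Whisper runs remotely
    transcription = job.result()  # block until the remote transcription finishes
    return transcription, diarization
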

title = "SAML Speaker Diarization ⚡️ "

description = """Combine the speed of Whisper JAX with pyannote speaker diarization to transcribe meetings in super fast time. The demo uses Whisper JAX as an [endpoint](https://twitter.com/sanchitgandhi99/status/1656665496463495168) and runs pyannote speaker diarization locally. The Whisper JAX endpoint is called asynchronously, so speaker diarization runs in parallel with the speech transcription. The diarized timestamps are then aligned with the Whisper output to give the final speaker-segmented transcription.
To duplicate the demo, first accept the pyannote terms of use for the [speaker diarization](https://huggingface.co/pyannote/speaker-diarization) and [segmentation](https://huggingface.co/pyannote/segmentation) models. Then, click [here](https://huggingface.co/spaces/sanchit-gandhi/whisper-jax-diarization?duplicate=true) to duplicate the demo, and enter your Hugging Face access token as a Space secret when prompted.
"""

article = """Whisper large-v2 model by OpenAI. Speaker diarization model by pyannote. Whisper JAX backend running JAX on a TPU v4-8 through the generous support of the [TRC](https://sites.research.google/trc/about/) programme. Whisper JAX [code](https://github.com/sanchit-gandhi/whisper-jax) and Gradio demo by 🤗 Hugging Face."""


def greet(name):
    return "Hello " + name + "!!"

iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(label="Audio file", type="filepath"),
    outputs=gr.Textbox(label="Transcription"),
    title=title,
    description=description,
    article=article,
)
iface.launch(show_api=True)







# audio_file = gr.Interface(
#     fn=transcribe,
#     inputs=[
#         gr.inputs.Audio(source="upload", optional=True, label="Audio file", type="filepath"),
#         # gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
#         # gr.inputs.Checkbox(default=True, label="Group by speaker"),
#     ],
#     outputs=[
#         gr.outputs.Textbox(label="Transcription").style(show_copy_button=True)
#     ],
#     allow_flagging="auto",
#     title=title,
#     description=description,
#     article=article,
# )

# demo = gr.Blocks()
# with demo:
#     gr.TabbedInterface([audio_file], ["Audio File"])

# demo.launch()





# def transcribe(audio_path, task="transcribe", group_by_speaker=True, progress=gr.Progress()):
    
#     # run diarization while we wait for Whisper JAX
#     progress(0, desc="Diarizing...")
#     diarization = diarization_pipeline(audio_path)
#     print(diarization)
#     #segments = diarization.for_json()["content"]
#     #segments = str(diarization) 
#     transcription = "SAML Output"
#     return transcription

# title = "SAML Speaker Diarization ⚡️"

# description = """Combine the speed of Whisper JAX with pyannote speaker diarization to transcribe meetings in super fast time. Demo uses Whisper JAX as an [endpoint](https://twitter.com/sanchitgandhi99/status/1656665496463495168) and pyannote speaker diarization running locally. The Whisper JAX endpoint is run asynchronously, meaning speaker diarization is run in parallel to the speech transcription. The diarized timestamps are aligned with the Whisper output to give the final speaker-segmented transcription.
# To duplicate the demo, first accept the pyannote terms of use for the [speaker diarization](https://huggingface.co/pyannote/speaker-diarization) and [segmentation](https://huggingface.co/pyannote/segmentation) models. Then, click [here](https://huggingface.co/spaces/sanchit-gandhi/whisper-jax-diarization?duplicate=true) to duplicate the demo, and enter your Hugging Face access token as a Space secret when prompted.
# """

# article = "Whisper large-v2 model by OpenAI. Speaker diarization model by pyannote. Whisper JAX backend running JAX on a TPU v4-8 through the generous support of the [TRC](https://sites.research.google/trc/about/) programme. Whisper JAX [code](https://github.com/sanchit-gandhi/whisper-jax) and Gradio demo by 🤗 Hugging Face."


# audio_file = gr.Interface(
#     fn=transcribe,
#     inputs=[
#         gr.inputs.Audio(source="upload", optional=True, label="Audio file", type="filepath"),
#         gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
#         gr.inputs.Checkbox(default=True, label="Group by speaker"),
#     ],
#     outputs=[
#         gr.outputs.Textbox(label="Transcription").style(show_copy_button=True)
#     ],
#     allow_flagging="never",
#     title=title,
#     description=description,
#     article=article,
# )

# demo = gr.Blocks()
# with demo:
#     gr.TabbedInterface([ audio_file], [ "Audio File"])

# demo.queue(max_size=10)
# demo.launch(show_api=True)


# # def greet(name):
# #     return "Hello " + name + "!!"

# # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
# # iface.launch()