Spaces:
Runtime error
Runtime error
File size: 5,749 Bytes
763f6d6 c153aa4 8e06021 c153aa4 8e06021 2d076c6 7104546 d7511d4 2d076c6 08aba45 f1bf51a 08aba45 9324d2a 3680dfd c153aa4 6a2bf53 9324d2a 08aba45 9324d2a 6a2bf53 b81d6e7 6a2bf53 d23a170 c153aa4 763f6d6 08aba45 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
import gradio as gr
import torch
import io
import base64
import numpy as np
import scipy.io.wavfile
from typing import Text
from pyannote.audio import Pipeline
from pyannote.audio import Audio
from pyannote.core import Segment
import gradio as gr
import yt_dlp as youtube_dl
from gradio_client import Client
from transformers.pipelines.audio_utils import ffmpeg_read
import os

# SECURITY: the access token was previously hard-coded here (and therefore
# leaked in source control — it should be revoked). Read it from the
# environment instead; on a Hugging Face Space this is set as a Space secret.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Set up the speaker-diarization pipeline once at import time so every
# request reuses the loaded model.
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.0", use_auth_token=HF_TOKEN
)

# Move the pipeline to GPU when one is available — diarization is much
# faster on CUDA.
if torch.cuda.is_available():
    diarization_pipeline.to(torch.device("cuda"))
def transcribe(audio_path):
    """Run pyannote speaker diarization on an audio file.

    Parameters
    ----------
    audio_path : str
        Filesystem path to the audio file (Gradio passes ``type="filepath"``).

    Returns
    -------
    pyannote.core.Annotation
        The diarization result (speaker turns with timestamps); Gradio's
        ``"text"`` output renders it via ``str()``.
    """
    # Diarization runs locally; in the full demo this would run in parallel
    # with a Whisper JAX transcription request.
    # NOTE: the original also assigned transcription = "SAML Output", which
    # was never used — removed as dead code.
    return diarization_pipeline(audio_path)
# UI copy for the Gradio interface. `description` and `article` support
# Markdown and are rendered above/below the demo.
title = "SAML Speaker Diarization ⚡️"
description = """Combine the speed of Whisper JAX with pyannote speaker diarization to transcribe meetings in super fast time. Demo uses Whisper JAX as an [endpoint](https://twitter.com/sanchitgandhi99/status/1656665496463495168) and pyannote speaker diarization running locally. The Whisper JAX endpoint is run asynchronously, meaning speaker diarization is run in parallel to the speech transcription. The diarized timestamps are aligned with the Whisper output to give the final speaker-segmented transcription.
To duplicate the demo, first accept the pyannote terms of use for the [speaker diarization](https://huggingface.co/pyannote/speaker-diarization) and [segmentation](https://huggingface.co/pyannote/segmentation) models. Then, click [here](https://huggingface.co/spaces/sanchit-gandhi/whisper-jax-diarization?duplicate=true) to duplicate the demo, and enter your Hugging Face access token as a Space secret when prompted.
"""
article = """Whisper large-v2 model by OpenAI. Speaker diarization model by pyannote. Whisper JAX backend running JAX on a TPU v4-8 through the generous support of the [TRC](https://sites.research.google/trc/about/) programme. Whisper JAX [code](https://github.com/sanchit-gandhi/whisper-jax) and Gradio demo by 🤗 Hugging Face."""
# NOTE(review): duplicate re-import kept verbatim — gr is already imported above.
import gradio as gr
def greet(name):
    """Return a cheerful greeting for *name*."""
    return f"Hello {name}!!"
# Gradio 4.x removed the gr.inputs/gr.outputs namespaces and the `source=` /
# `optional=` keyword arguments — the old gr.inputs.Audio(...) call raises at
# startup (the Space's "Runtime error"). Use gr.Audio with `sources=` instead,
# and wire in the title/description/article copy defined above, which was
# previously unused.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["upload"], type="filepath", label="Audio file"),
    outputs="text",
    title=title,
    description=description,
    article=article,
)
iface.launch()
# audio_file = gr.Interface(
# fn=transcribe,
# inputs=[
# gr.inputs.Audio(source="upload", optional=True, label="Audio file", type="filepath"),
# # gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
# # gr.inputs.Checkbox(default=True, label="Group by speaker"),
# ],
# outputs=[
# gr.outputs.Textbox(label="Transcription").style(show_copy_button=True)
# ],
# allow_flagging="auto",
# title=title,
# description=description,
# article=article,
# )
# demo = gr.Blocks()
# with demo:
# gr.TabbedInterface([audio_file], ["Audio File"])
# demo.launch()
# def transcribe(audio_path, task="transcribe", group_by_speaker=True, progress=gr.Progress()):
# # run diarization while we wait for Whisper JAX
# progress(0, desc="Diarizing...")
# diarization = diarization_pipeline(audio_path)
# print(diarization)
# #segments = diarization.for_json()["content"]
# #segments = str(diarization)
# transcription = "SAML Output"
# return transcription
# title = "SAML Speaker Diarization ⚡️"
# description = """Combine the speed of Whisper JAX with pyannote speaker diarization to transcribe meetings in super fast time. Demo uses Whisper JAX as an [endpoint](https://twitter.com/sanchitgandhi99/status/1656665496463495168) and pyannote speaker diarization running locally. The Whisper JAX endpoint is run asynchronously, meaning speaker diarization is run in parallel to the speech transcription. The diarized timestamps are aligned with the Whisper output to give the final speaker-segmented transcription.
# To duplicate the demo, first accept the pyannote terms of use for the [speaker diarization](https://huggingface.co/pyannote/speaker-diarization) and [segmentation](https://huggingface.co/pyannote/segmentation) models. Then, click [here](https://huggingface.co/spaces/sanchit-gandhi/whisper-jax-diarization?duplicate=true) to duplicate the demo, and enter your Hugging Face access token as a Space secret when prompted.
# """
# article = "Whisper large-v2 model by OpenAI. Speaker diarization model by pyannote. Whisper JAX backend running JAX on a TPU v4-8 through the generous support of the [TRC](https://sites.research.google/trc/about/) programme. Whisper JAX [code](https://github.com/sanchit-gandhi/whisper-jax) and Gradio demo by 🤗 Hugging Face."
# audio_file = gr.Interface(
# fn=transcribe,
# inputs=[
# gr.inputs.Audio(source="upload", optional=True, label="Audio file", type="filepath"),
# gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
# gr.inputs.Checkbox(default=True, label="Group by speaker"),
# ],
# outputs=[
# gr.outputs.Textbox(label="Transcription").style(show_copy_button=True)
# ],
# allow_flagging="never",
# title=title,
# description=description,
# article=article,
# )
# demo = gr.Blocks()
# with demo:
# gr.TabbedInterface([ audio_file], [ "Audio File"])
# demo.queue(max_size=10)
# demo.launch(show_api=True)
# # def greet(name):
# # return "Hello " + name + "!!"
# # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
# # iface.launch() |