"""Gradio demo that runs pyannote speaker diarization on an uploaded audio file."""

import base64
import io
import os
from typing import Text

import gradio as gr
import torch
import numpy as np
import scipy.io.wavfile
from pyannote.audio import Pipeline
from pyannote.audio import Audio
from pyannote.core import Segment
import yt_dlp as youtube_dl
from gradio_client import Client
from transformers.pipelines.audio_utils import ffmpeg_read
# NOTE(review): no longer used by transcribe(); kept because other code may
# rely on it. Confirm this module exists in the installed pyannote.core.
import pyannote.core.json

# Access tokens must never be committed to source control. The token is read
# from the environment (e.g. a Space/CI secret named HF_TOKEN). Any token that
# was previously hard-coded in this file should be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN")

# set up the diarization pipeline
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.0",
    use_auth_token=HF_TOKEN,
)


def transcribe(audio_path, task="transcribe", group_by_speaker=True, progress=gr.Progress()):
    """Diarize an audio file and return its speaker segments.

    Args:
        audio_path: Path of the audio file to process, or ``None`` when the
            user submitted the form without uploading anything.
        task: Currently unused; reserved for the disabled Whisper step below.
        group_by_speaker: Currently unused; reserved for the disabled
            alignment step below.
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            how Gradio injects the tracker, so it is intentionally kept as a
            default argument.

    Returns:
        A list with one dict per diarization track, shaped as
        ``{"segment": {"start": float, "end": float}, "track": ..., "label": ...}``.

    Raises:
        gr.Error: If no audio file was provided.
    """
    # The audio input is optional in the UI, so guard against a missing file
    # instead of letting the pipeline fail on ``None``.
    if audio_path is None:
        raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")

    # TODO: re-enable once the Whisper JAX endpoint client is configured.
    # # run Whisper JAX asynchronously using Gradio client (endpoint)
    # job = client.submit(
    #     audio_path,
    #     task,
    #     True,
    #     api_name="/predict_1",
    # )

    # run diarization while we wait for Whisper JAX
    progress(0, desc="Diarizing...")
    diarization = diarization_pipeline(audio_path)

    # Convert the diarization result to plain dicts. The list is built from
    # itertracks() directly so it does not rely on a JSON serialization helper.
    segments = [
        {
            "segment": {"start": turn.start, "end": turn.end},
            "track": track,
            "label": speaker,
        }
        for turn, track, speaker in diarization.itertracks(yield_label=True)
    ]
    transcription = segments

    # TODO: re-enable together with the Whisper JAX job above.
    # # only fetch the transcription result after performing diarization
    # progress(0.33, desc="Transcribing...")
    # transcription, _ = job.result()

    # # align the ASR transcriptions and diarization timestamps
    # progress(0.66, desc="Aligning...")
    # transcription = align(transcription, segments, group_by_speaker=group_by_speaker)

    return transcription


audio_file = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="upload", optional=True, label="Audio file", type="filepath"),
        gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
        gr.inputs.Checkbox(default=True, label="Group by speaker"),
    ],
    outputs=[
        gr.outputs.Textbox(label="Transcription").style(show_copy_button=True),
    ],
    # allow_flagging="never",
    # title=title,
    # description=description,
    # article=article,
)

audio_file.launch()

# demo = gr.Blocks()
# demo.queue(max_size=10)
# demo.launch()

# def greet(name):
#     return "Hello " + name + "!!"

# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
# iface.launch()