import os

# Set up Hugging Face API token for authenticated requests
huggingface_token = os.getenv('Hugging_Face_Token')

from huggingface_hub.hf_api import HfFolder
HfFolder.save_token(huggingface_token)

from huggingface_hub import HfApi

# List the pyannote.audio pipelines available on the Hugging Face Hub
available_pipelines = [p.modelId for p in HfApi().list_models(filter="pyannote-audio-pipeline")]
list(filter(lambda p: p.startswith("pyannote/"), available_pipelines))

from pyannote.audio import Pipeline

# Load the pretrained speaker diarization pipeline (requires an authenticated token)
pipeline_diar = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=True)

import torch
import torchaudio
import base64
from io import BytesIO

from pyannote.core import Annotation
from pydub import AudioSegment
from transformers import pipeline

# Whisper pipeline for speech-to-text transcription
transcriber = pipeline(model="openai/whisper-large-v2", return_timestamps=False)

import gradio as gr


def transcribe_recording(base64_encoded_audio):
    """
    Transcribes base64-encoded audio into text, one segment per speaker turn.

    Parameters:
    - base64_encoded_audio (str): Base64-encoded byte string of audio data.

    Returns:
    - list[dict]: One entry per speaker turn with its start time, end time,
      speaker label, and transcription.
    """
    # Decode the base64-encoded string
    base64_decoded_audio = base64.b64decode(base64_encoded_audio)

    with BytesIO(base64_decoded_audio) as audio_buffer:
        audio_buffer.seek(0)

        # Load audio with pydub for easy slicing
        audio = AudioSegment.from_file(audio_buffer)
        audio.export("out.mp3", format="mp3")

        # Run the diarization pipeline on the full recording
        dia = pipeline_diar("out.mp3")
        assert isinstance(dia, Annotation)

        # Collect one record per speaker turn
        data = []
        for speech_turn, track, speaker in dia.itertracks(yield_label=True):
            # Extract start and end times of this speaker turn
            start_time, end_time = speech_turn.start, speech_turn.end

            # Slice out the segment (pydub indexes in milliseconds) and transcribe it
            segment_audio = audio[int(start_time * 1000):int(end_time * 1000)]
            segment_audio.export("segment.mp3", format="mp3")
            text = transcriber("segment.mp3")['text']

            # Append the data to the list
            data.append({
                "Start Time": start_time,
                "End Time": end_time,
                "Speaker": speaker,
                "Transcription": text
            })

    return data


demo = gr.Interface(
    fn=transcribe_recording,
    inputs="text",
    outputs="text",
    title="Whisper transcriber",
).launch()
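
# Usage sketch (not part of the app above): a minimal way to exercise
# transcribe_recording() outside the Gradio UI, assuming a local audio file
# named "sample.wav" exists (hypothetical path) and the pipelines above loaded
# successfully. Kept commented out because demo.launch() blocks the script.
#
# with open("sample.wav", "rb") as f:
#     encoded = base64.b64encode(f.read()).decode("utf-8")
#
# for row in transcribe_recording(encoded):
#     print(f"{row['Speaker']} "
#           f"[{row['Start Time']:.1f}s-{row['End Time']:.1f}s]: "
#           f"{row['Transcription']}")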