Spaces:

Lenylvt
/

BetterWhisper

Sleeping

File size: 2,309 Bytes

import base64
import os
import re
import tempfile

import streamlit as st
from gradio_client import Client

# Endpoint of the hosted Space that the Gradio client connects to
API_URL = "https://sanchit-gandhi-whisper-jax-spaces.hf.space"

# Page heading rendered at the top of the Streamlit app
st.title("Application de transcription Whisper-JAX 🎙️")

# Build the Gradio client for that endpoint, then request its API
# description as a dict (the returned value is not kept)
client = Client(API_URL)
client.view_api(return_format="dict")

# Function to transcribe an audio file using the specified API endpoint
def transcrire_audio(file_data, task="transcribe", return_timestamps=True):
    """Transcribe raw audio bytes through the remote Gradio endpoint.

    The bytes are written to a temporary file and that file is handed to
    ``client.predict``. The Gradio Python client expects a local path/URL
    (or a ``handle_file`` wrapper on newer releases) for file inputs, so a
    base64 text payload is not a usable value for the audio argument.

    Args:
        file_data: Raw bytes of the audio file to transcribe.
        task: Task name forwarded unchanged to the endpoint.
        return_timestamps: Flag forwarded unchanged to the endpoint.

    Returns:
        A 2-tuple made of the first two elements of the endpoint response;
        the caller unpacks them as ``(transcription, runtime)``.

    Raises:
        Exception: Whatever ``client.predict`` or the temporary-file
            handling raises is propagated to the caller.
    """
    # Newer gradio_client releases expose ``handle_file`` for file inputs;
    # older ones accept a plain path string. NOTE(review): confirm against
    # the gradio_client version pinned for this app.
    try:
        from gradio_client import handle_file
    except ImportError:
        handle_file = None

    # delete=False so the file can be closed (flushed) before the client
    # reads it; it is removed explicitly in the ``finally`` below.
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(file_data)
        audio_path = tmp.name

    try:
        audio_input = handle_file(audio_path) if handle_file is not None else audio_path
        response = client.predict(
            audio_input,
            task,
            return_timestamps,
            api_name="/predict_1",  # Make sure this is the correct endpoint
        )
    finally:
        # Always clean up the temporary copy, even when the request fails
        os.remove(audio_path)

    # Adjust according to the response structure returned by the API
    return response[0], response[1]

# Timestamp prefix of the form "[HH:MM:SS.mmm -> HH:MM:SS.mmm]" followed by
# optional whitespace. The hours component is optional so that shorter
# "[MM:SS.mmm -> MM:SS.mmm]" stamps are stripped as well.
# NOTE(review): the upstream API appears to omit hours for short audio —
# confirm against real responses.
_TIMESTAMP_PATTERN = re.compile(
    r"\[(?:\d{2}:)?\d{2}:\d{2}\.\d{3} -> (?:\d{2}:)?\d{2}:\d{2}\.\d{3}\]\s*"
)


# Function to remove timestamps from text
# Defined before the UI code below: the script runs top to bottom, so the
# button handler can only call names that already exist at that point.
def remove_timestamps(text):
    """Return *text* with every bracketed timestamp range removed.

    Args:
        text: Transcription string that may contain timestamp prefixes such
            as ``[00:00:01.000 -> 00:00:04.500]`` or ``[00:01.000 -> 00:04.500]``.

    Returns:
        The same text with each timestamp range (and the whitespace that
        directly follows it) replaced by an empty string.
    """
    return _TIMESTAMP_PATTERN.sub("", text)


# Streamlit widget to upload an audio file
fichier_telecharge = st.file_uploader("Choisissez un fichier audio", type=['mp3', 'wav', 'ogg'])

# Button to process the audio file
if st.button("Transcrire l'audio"):
    if fichier_telecharge is not None:
        # Read the uploaded file into memory as bytes
        file_data = fichier_telecharge.getvalue()

        # Call the transcription function; any failure is reported in the UI
        try:
            # The second element of the result is not displayed
            transcription, _runtime = transcrire_audio(file_data)
            st.write("Transcription avec horodatage :", transcription)

            # Display transcription without timestamps
            transcription_sans_horodatages = remove_timestamps(transcription)
            st.write("Transcription sans horodatage :", transcription_sans_horodatages)
        except Exception as e:
            st.error(f"Une erreur est survenue lors de la transcription : {str(e)}")
    else:
        st.error("Veuillez télécharger un fichier audio pour continuer.")