# app.py — Gladia.io audio-transcription Gradio demo (revision bcf8a6e)
import os
from time import time
import gradio as gr
import requests
from languages import LANGUAGES
# API key for gladia.io; must be provided via the environment
# (e.g. a Hugging Face Space secret). None if unset — requests will then fail.
GLADIA_API_KEY = os.environ.get("GLADIA_API_KEY")

# Common headers sent with every Gladia API request.
headers = {
    "accept": "application/json",
    "x-gladia-key": GLADIA_API_KEY,
}

# Language-detection modes accepted by the Gladia transcription endpoint.
ACCEPTED_LANGUAGE_BEHAVIOUR = [
    "manual",
    "automatic single language",
    "automatic multiple languages",
]
def transcribe(
    audio: str = None,
) -> dict:
    """
    Transcribe audio to text using the Gladia API.

    Sends the given audio file to the Gladia transcription endpoint
    (with diarization enabled, up to 2 speakers) and renders the result
    as speaker-labelled HTML.

    Get your api key at gladia.io !

    Parameters:
        audio (str): Path to the audio file to transcribe.

    Returns:
        tuple: (html_transcript, prediction_raw) on success, where
            prediction_raw carries extra processing/debug details.
        str: An error message string if the API request fails.
    """
    DEFAULT_MANUAL_LANGUAGE = "english"
    language_behaviour = ACCEPTED_LANGUAGE_BEHAVIOUR[2]

    # Multipart form fields for the Gladia endpoint; (None, value) sends
    # a plain form field rather than a file part.
    files = {
        "language_behaviour": (None, language_behaviour),
        "noise_reduction": (None, "false"),
        'output_format': (None, 'json'),
        'toggle_diarization': (None, 'true'),
        'diarization_max_speakers': (None, '2'),
    }

    # "language" is only honoured when language_behaviour == "manual";
    # the API ignores it in the automatic modes.
    if language_behaviour == "manual":
        files["language"] = (None, DEFAULT_MANUAL_LANGUAGE)

    audio_handle = None
    try:
        if audio:
            # Keep a reference so the handle can be closed after the
            # request (the previous version leaked it).
            audio_handle = open(audio, "rb")
            files["audio"] = (audio, audio_handle, "audio/wav")

        response = requests.post(
            "https://api.gladia.io/audio/text/audio-transcription/",
            headers=headers,
            files=files,
        )
    finally:
        if audio_handle is not None:
            audio_handle.close()

    if response.status_code != 200:
        print(response.content, response.status_code)
        return "Sorry, an error occurred with your request :/"

    # The API returns both "prediction" and "prediction_raw";
    # prediction_raw has more details about the processing and other
    # debugging elements you might be interested in.
    payload = response.json()  # parse once instead of twice
    segments = payload["prediction"]

    # Build the HTML transcript, inserting a speaker header whenever the
    # (known) speaker changes; join once to avoid quadratic += growth.
    parts = []
    current_speaker = ""
    for segment in segments:
        speaker = segment["speaker"]
        if speaker != current_speaker and speaker != "unknown":
            current_speaker = speaker
            parts.append(
                "<br/><br/><b> Speaker:" + str(speaker) + ":</b> "
                + segment["transcription"]
            )
        else:
            parts.append(" " + segment["transcription"])
    return "".join(parts), payload["prediction_raw"]
# --- Gradio UI wiring -------------------------------------------------------

APP_TITLE = "Gladia.io fast audio transcription"

APP_DESCRIPTION = """Gladia.io Whisper large-v2 fast audio transcription API
is able to perform fast audio transcriptions for any audio / video (less than a minute per hour) .<br/>For more details and a benchmark ran on multiple Speech-To-Text providers, please visit
[our post](https://medium.com/@gladia.io/gladia-alpha-launch-redefining-what-s-possible-with-speech-to-text-ai-686dd4312a86) on Medium.
<br/><br/>
You are more than welcome to join us on [Slack](https://gladia-io.slack.com)
and don't forget to get your own API key on [Gladia.io](https://gladia.io/) during the free alpha !
"""

# Sample clips shipped with the Space, shown as one-click examples.
EXAMPLE_CLIPS = [
    ["examples/good.will.hunting.wav"],
    ["examples/wolf.of.wall.street.wav"],
]

# The interface feeds an uploaded audio file path into transcribe() and
# renders its two outputs: the HTML transcript and the raw JSON prediction.
iface = gr.Interface(
    fn=transcribe,
    title=APP_TITLE,
    description=APP_DESCRIPTION,
    inputs=[gr.Audio(label="Audio file", source="upload", type="filepath")],
    outputs=["html", "json"],
    examples=EXAMPLE_CLIPS,
)
iface.queue()
iface.launch()