# --- Web file-viewer residue captured with the source (not code) ---
# parlons-nous / app.py
# aurelben's picture
# change whisper version
# 9c7b527
# raw
# history blame
# 2.78 kB
import os
import gradio as gr
import numpy as np
import torch
from groq import Groq
from transformers import pipeline
from TTS.api import TTS
# Checkpoint loaded by the speech-recognition pipeline below.
MODEL_NAME = "openai/whisper-large-v3"
# Batch size forwarded to the pipeline on every transcription call.
BATCH_SIZE = 8
# The two limits below are not referenced anywhere else in the visible file.
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# Run on CUDA device 0 when a GPU is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Shared speech-to-text pipeline, created once at import time.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
def use_pipe(inputs):
    """Transcribe *inputs* with the module-level pipeline and return the text.

    Args:
        inputs: Audio in any form accepted by the pipeline.

    Returns:
        The ``"text"`` field of the pipeline result.

    Raises:
        gr.Error: If *inputs* is ``None``.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return result["text"]
# Groq API client; the key is read from the GROQ_API_KEY environment variable
# (None is passed through when the variable is unset).
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
def transcribe(stream, new_chunk):
    """Append a new audio chunk to the running stream and transcribe it.

    Args:
        stream: Audio accumulated by previous calls (1-D float32 array), or
            ``None`` on the first call.
        new_chunk: ``(sample_rate, samples)`` tuple for the newest chunk;
            ``samples`` is either 1-D or ``(n_samples, n_channels)``.

    Returns:
        A ``(stream, text)`` tuple: the updated accumulated audio and the
        transcription of the whole stream so far.
    """
    sr, y = new_chunk

    # Down-mix any multi-channel input to mono by averaging the channels.
    if y.ndim == 2:
        y = y.mean(axis=1)
    y = y.astype(np.float32)

    # Peak-normalise the chunk. Empty chunks have no peak and silent chunks
    # have a peak of 0; dividing in either case would raise or fill the
    # stream with NaN, so those chunks are left untouched.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y = y / peak

    stream = y if stream is None else np.concatenate([stream, y])

    # Hand the sample rate to the pipeline together with the samples so it
    # can resample to the rate the model expects; a bare array would be
    # interpreted as already being at the model's rate.
    # Assumes every chunk of one stream shares the same sample rate.
    return stream, use_pipe({"sampling_rate": sr, "raw": stream})
def autocomplete(text):
    """Ask the ``llama3-8b-8192`` model (through the Groq client) to reply to *text*.

    The system prompt instructs the assistant to answer politely, in French
    only, using the formal "vous".

    Args:
        text: The user's message, typically a transcription.

    Returns:
        The assistant's reply, or ``""`` when *text* is ``None``, empty or
        only whitespace (no API request is made in that case).
    """
    # Nothing to answer: skip the API call and return an explicit empty reply
    # instead of falling through to an implicit None.
    if not text or not text.strip():
        return ""

    response = groq_client.chat.completions.create(
        model='llama3-8b-8192',
        messages=[
            {"role": "system", "content": "Tu es une assistante tres polis, tu ne repond que en francais et uniquement en utilisant le vous et jamais le tu"},
            {"role": "user", "content": text},
        ],
    )
    return response.choices[0].message.content
# Text-to-speech engine, created on first use and then shared by every call.
_tts_api = None


def _get_tts():
    """Return the shared TTS engine, loading the model only on first use."""
    global _tts_api
    if _tts_api is None:
        # Use the GPU only when one is actually present.
        _tts_api = TTS(
            "tts_models/multilingual/multi-dataset/xtts",
            gpu=torch.cuda.is_available(),
        )
    return _tts_api


def process_audio(input_audio, new_chunk):
    """Transcribe the audio stream, generate a reply and synthesise it.

    Args:
        input_audio: Audio accumulated so far (the interface "state"), or
            ``None`` on the first call.
        new_chunk: ``(sample_rate, samples)`` tuple for the newest chunk.

    Returns:
        A ``(stream, text, audio)`` tuple: the updated audio state, the
        generated reply, and the path of the synthesised WAV file (``None``
        when there is no reply to speak).
    """
    stream, transcription = transcribe(input_audio, new_chunk)
    text = autocomplete(transcription)
    print(transcription, text)

    # No reply to speak: skip synthesis instead of passing an empty/None
    # text to the TTS engine.
    if not text:
        return stream, text, None

    # NOTE(review): every call writes the same file, so concurrent sessions
    # would overwrite each other's audio — confirm single-user usage.
    _get_tts().tts_to_file(text, file_path="output.wav", language="fr")
    audio = "./output.wav"
    return stream, text, audio
# Live interface: microphone audio is streamed to process_audio, which keeps
# the accumulated audio in "state" and returns the reply as text and speech.
demo = gr.Interface(
    fn=process_audio,
    inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
    outputs=["state", gr.Markdown(), gr.Audio(interactive=False, autoplay=True)],
    title="Parlons nous ☎️",
    # The description names the models this file actually uses
    # (MODEL_NAME and the model passed to the Groq client).
    description="Powered by [whisper-large-v3](https://huggingface.co/openai/whisper-large-v3), and [llama3-8b](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) (via [Groq](https://groq.com/))",
    live=True,
    allow_flagging="never",
)
demo.launch()