# parlons-nous / app.py
import os
import gradio as gr
import numpy as np
import torch
from groq import Groq
from transformers import pipeline
from TTS.api import TTS

# Device for the Coqui TTS model ("cuda" or "cpu")
device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_NAME = "openai/whisper-large-v3"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000  # unused in this app
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files (unused in this app)

# Device for the transformers pipeline (GPU index or "cpu"); kept under a separate
# name so it does not overwrite the TTS device string above
pipe_device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=pipe_device,
)


def use_pipe(inputs):
    """Run Whisper on the accumulated audio and return the transcribed text."""
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
    return text
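

# Groq client for the chat completions below; expects GROQ_API_KEY in the environment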
groq_client = Groq(api_key=os.getenv('GROQ_API_KEY'))


def transcribe(stream, new_chunk):
    """
    Transcribe the streamed audio with Whisper, accumulating chunks in `stream`.
    """
    sr, y = new_chunk
    # Convert stereo to mono if necessary
    if y.ndim == 2 and y.shape[1] == 2:
        y = y.mean(axis=1)  # average the two channels
    y = y.astype(np.float32)
    # Normalize to [-1, 1], guarding against an all-zero (silent) chunk
    max_abs = np.max(np.abs(y))
    if max_abs > 0:
        y /= max_abs
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
    return stream, use_pipe(stream)


def autocomplete(text):
    """
    Generate a short reply to the transcribed text with Llama 3 70B (via Groq).
    """
    if text != "":
        response = groq_client.chat.completions.create(
            model='llama3-70b-8192',
            # System prompt (in French): you are a polite assistant, you answer only in French,
            # always use the formal "vous", and keep sentences as short as possible
            messages=[{"role": "system", "content": "Tu es une assistante polie, tu ne réponds qu'en français et uniquement en utilisant le vouvoiement et avec des phrases les plus courtes possible"},
                      {"role": "user", "content": text}]
        )
        return response.choices[0].message.content


def process_audio(input_audio, new_chunk):
    """
    Process the audio input by transcribing it and generating a reply.
    Accumulate results to return to the Gradio interface.
    """
    stream, transcription = transcribe(input_audio, new_chunk)
    text = autocomplete(transcription)
    print(transcription, text)
    # Note: the XTTS v2 model is reloaded on every call; caching it at module level would be faster
    api = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    api.tts_to_file(text, file_path="output.wav", speaker="Ana Florence", language="fr", split_sentences=True)
    audio = "./output.wav"
    return stream, text, audio
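

# Streaming interface: microphone chunks come in along with the accumulated audio state,
# and the assistant's text plus the synthesized speech go back out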
demo = gr.Interface(
    fn=process_audio,
    inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
    outputs=["state", gr.Markdown(), gr.Audio(interactive=False, autoplay=True)],
    title="Parlons nous ☎️",
    description="Powered by [whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) and [llama3-70b-8192](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) (via [Groq](https://groq.com/))",
    live=True,
    allow_flagging="never"
)

demo.launch()