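"""
Parlons nous ☎️ — a live voice-assistant demo.

Microphone audio is streamed into Whisper (large-v3) for transcription,
the transcript is answered by Llama 3 8B served by Groq, and the reply is
spoken aloud with Coqui XTTS v2.

Requires a GROQ_API_KEY environment variable.
"""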
import os

import gradio as gr
import numpy as np
import torch
from groq import Groq
from transformers import pipeline
from TTS.api import TTS

MODEL_NAME = "openai/whisper-large-v3"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000  # currently unused
YT_LENGTH_LIMIT_S = 3600  # limit YouTube files to 1 hour (currently unused)

# transformers' pipeline takes a CUDA device index (int) or the string "cpu"
device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Load the XTTS v2 voice model once at startup rather than per chunk;
# re-creating it on every streamed call is prohibitively slow. The gpu flag
# follows CUDA availability so CPU-only machines still work.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=torch.cuda.is_available())


def use_pipe(inputs):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
    return text


# The Groq client reads its API key from the GROQ_API_KEY environment variable
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

def transcribe(stream, new_chunk):
    """
    Append the incoming chunk to the running stream and transcribe the
    accumulated audio with Whisper.
    """
    sr, y = new_chunk  # Gradio streams audio as (sample_rate, numpy array)

    # Convert stereo to mono if necessary by averaging the two channels
    if y.ndim == 2 and y.shape[1] == 2:
        y = y.mean(axis=1)

    y = y.astype(np.float32)

    # Normalize to [-1, 1], guarding against division by zero on silent chunks
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Accumulate the full stream and re-transcribe it from the start each
    # time; simple, but the cost grows with the length of the session
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
    return stream, use_pipe(stream)

def autocomplete(text):
    """
    Generate a reply to the transcribed text with Llama 3 8B (served by Groq).
    """
    if text != "":
        response = groq_client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[
                # System prompt (French): "You are a very polite assistant; you
                # answer only in French and always use the formal 'vous', never 'tu'."
                {"role": "system", "content": "Tu es une assistante très polie, tu ne réponds qu'en français et uniquement en utilisant le vous, jamais le tu."},
                {"role": "user", "content": text},
            ],
        )
        return response.choices[0].message.content
    return ""  # never return None, which would break the TTS step downstream

def process_audio(input_audio, new_chunk):
    """
    Transcribe the incoming chunk, generate a reply, and synthesize it to a
    WAV file, accumulating the audio stream for the Gradio state.
    """
    stream, transcription = transcribe(input_audio, new_chunk)
    text = autocomplete(transcription)
    print(transcription, text)
    tts.tts_to_file(
        text,
        file_path="output.wav",
        speaker="Ana Florence",
        language="fr",
        split_sentences=True,
    )
    return stream, text, "./output.wav"


demo = gr.Interface(
    fn=process_audio,
    inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
    outputs=["state", gr.Markdown(), gr.Audio(interactive=False, autoplay=True)],
    title="Parlons nous ☎️",
    description="Powered by [whisper-large-v3](https://huggingface.co/openai/whisper-large-v3), [llama3-8b-8192](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) (via [Groq](https://groq.com/)), and [XTTS v2](https://huggingface.co/coqui/XTTS-v2)",
    live=True,
    allow_flagging="never",
)

demo.launch()
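
# A rough sketch of how to run this locally (package names assumed, not
# pinned): the script needs gradio, torch, transformers, groq, and the
# Coqui `TTS` package, plus a Groq API key in the environment.
#
#   pip install gradio torch transformers groq TTS
#   export GROQ_API_KEY=...   # your key here
#   python app.py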