# asistente_voz / app.py
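# Voice assistant demo: record Spanish speech, transcribe it with Whisper,
# send the transcription to ChatGPT, and speak the reply back with TTS.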
import os

import gradio as gr
import openai
import torch
from dotenv import load_dotenv
from transformers import pipeline

from tts import synthesize

# fairseq is only needed by the commented-out facebook/tts_transformer
# experiment below, so its imports stay disabled:
# from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
# from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
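
# Whisper "base" checkpoint for automatic speech recognition.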
model_id = "openai/whisper-base"
pipe = pipeline("automatic-speech-recognition", model=model_id)


def transcribe_speech(filepath):
    """Transcribe the audio file at `filepath` to Spanish text."""
    output = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "task": "transcribe",
            "language": "spanish",
        },  # update with the language you've fine-tuned on
        chunk_length_s=30,  # split long recordings into 30 s windows
        batch_size=8,
    )
    return output["text"]

# Load environment variables from the local .env file.
load_dotenv()
openai.api_key = os.environ["OPENAI_API_KEY"]
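# The .env file is expected to define that key, e.g.:
#   OPENAI_API_KEY=sk-...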


def clear_chat():
    """Clear the module-level history (unused; the UI calls reset_state instead)."""
    global chat_history
    chat_history = []

def query_chatgpt(message, chat_history):
    """Append the user message, query gpt-3.5-turbo, and append the reply."""
    chat_history.append({"role": "user", "content": message})
    print("Asking:", message)
    print("History:", chat_history)
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=chat_history,
        temperature=0.5,
        max_tokens=256,
    )
    response = completion.choices[0].message.content
    chat_history.append({"role": "assistant", "content": response})
    return response, chat_history
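
# chat_history is a plain list of OpenAI chat messages, e.g.:
#   [{"role": "user", "content": "Hola"},
#    {"role": "assistant", "content": "¡Hola! ¿En qué puedo ayudarte?"}]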
# models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
# "facebook/tts_transformer-es-css10",
# arg_overrides={"vocoder": "hifigan", "fp16": False}
# )
# model = models[0]
# TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
# generator = task.build_generator([model], cfg)
# text = "Había una vez."
# sample = TTSHubInterface.get_model_input(task, text)
# wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
# ipd.Audio(wav, rate=rate)
# def syn_facebookmms(text):
#     sample = TTSHubInterface.get_model_input(task, text)
#     wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
#     return wav, rate
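
# The local tts module (imported at the top) is assumed to expose
#   synthesize(text, speed, lang) -> (sampling_rate, waveform)
# where "spa" is the ISO 639-3 code for Spanish, matching the language codes
# used by the facebook MMS TTS checkpoints.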
def answer_question(filepath, chat_history):
    """Full pipeline: recorded speech -> transcription -> ChatGPT -> synthesized audio."""
    transcription = transcribe_speech(filepath)
    response, chat_history = query_chatgpt(transcription, chat_history)
    print("History:", chat_history)
    # audio = synthesise(response)
    # audio, rate = syn_facebookmms(response)
    rate, audio = synthesize(response, 1, "spa")
    print(audio)
    return rate, audio

def reset_state(chat_history):
    """Return an empty list to clear the gr.State chat history."""
    return []

with gr.Blocks() as demo:
    chat_history = gr.State([])  # per-session conversation history
    entrada = gr.Audio(source="microphone", type="filepath")
    boton = gr.Button("Responder")
    button = gr.Button("Reset State")
    salida = gr.Audio()
    # gr.Audio accepts the (rate, audio) tuple returned by answer_question.
    boton.click(answer_question, [entrada, chat_history], salida)
    # The click handler mutates chat_history in place; reset_state replaces it
    # with a fresh empty list.
    button.click(reset_state, chat_history, chat_history)

demo.launch(debug=True)
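# launch(debug=True) blocks the main thread and prints tracebacks to the
# console, which is handy while iterating on the Space.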