Spaces:
Sleeping
Sleeping
import os | |
import io | |
import wave | |
import numpy as np | |
import gradio as gr | |
from openai import OpenAI | |
import google.generativeai as genai | |
from dotenv import load_dotenv, find_dotenv | |
load_dotenv(find_dotenv()) | |
from match_info_crawler import get_matches_info | |
# When True, speech recognition uses a locally downloaded model instead of the
# serverless Hugging Face endpoint.
USE_LOCAL_ASR_PIPELINE = True

# used for chat, if provided
# NOTE: the environment lookup below is commented out, so this key starts empty.
GOOGLE_API_KEY = "" #if 'GOOGLE_API_KEY' not in os.environ else os.environ['GOOGLE_API_KEY']

# used for chat (2nd option) and for text-to-speech
OPENAI_API_KEY = "" if 'OPENAI_API_KEY' not in os.environ else os.environ['OPENAI_API_KEY']

# Hugging Face key: this file reads it only in the serverless speech-recognition
# branch (i.e. when USE_LOCAL_ASR_PIPELINE is False).
# NOTE(review): the check below is unconditional, so start-up fails without the
# key even when the local pipeline is selected - confirm this is intended.
assert 'HUGGINGFACE_API_KEY' in os.environ, "Hugging Face API key not found in environment variables"

# OpenAI handles the chat whenever no Google key is configured.
USE_OPENAI_FOR_CHAT = (GOOGLE_API_KEY == "")

# The OpenAI client stays None until a non-empty key is available.
OPENAI_CLIENT = None
if OPENAI_API_KEY != "":
    OPENAI_CLIENT = OpenAI(api_key=OPENAI_API_KEY)

if GOOGLE_API_KEY != "":
    genai.configure(api_key=GOOGLE_API_KEY)

# Generation settings for the Google model; `respond` overwrites the temperature per call.
# NOTE(review): the original nesting of this statement is not recoverable from the
# source; it is kept at module level so the name exists whether or not a Google key
# was set at start-up - confirm.
GOOGLE_GEN_CONFIG = genai.types.GenerationConfig(
    candidate_count=1,
    temperature=0.5)
AUDIO_OUT_FILE_PREFIX = "output" # prefix of the .wav audio file name

# System-prompt template (Portuguese, sent to the chat model as-is). The
# {GENRE}, {NAME} and {PERSONALITY} placeholders are filled by `on_voice_change`.
TEMPLATE_SYSTEM_MESSAGE = """Você é assistente virtual com a função é entreter uma criança de idade entre 6 e 8 anos que adora futebol. Diretrizes para a conversa:
- Você é {GENRE}, seu nome é {NAME}.
- {PERSONALITY}
- Pergunte o nome da criança.
- Fale sobre futebol, times, jogadores, seleções e grandes jogos.
- Tente focar em Brasil, Inglaterra e Espanha.
- Você também pode informar os resultados de jogos de ontem, e jogos que ocorrerão hoje ou amanhã.
- Fale, no máximo, três frases por mensagem.
"""

# Maps each persona (also used as the text-to-speech voice name) to a tuple of
# (personality text for the template, sampling temperature, sex marker "F"/"M").
PERSONALITIES = {
    "nova": ("Sua personalidade é bastante amigável e alegre, e um tanto infantil. Tente iniciar novos assuntos, quando a conversa estiver repetitiva. Conte piadas de futebol, de vez em quando.", 0.8, "F"),
    "echo": ("Sua personalidade é amigável, mas objetivo. Tente manter-se no mesmo assunto. Conte alguma curiosidade sobre um grande craque, de vez em quando.", 0.2, "M")
}

# Persona selected when the interface is first built.
INITIAL_PERSON = "nova"
# Converts the chat history into the message format expected by the OpenAI API
def to_openai_chat_history(system_prompt, chat_history, curr_message):
    """Build the OpenAI ``messages`` list for a single chat request.

    Args:
        system_prompt: Text of the leading ``system`` message.
        chat_history: Sequence of ``(user_message, bot_message)`` pairs.
        curr_message: The new user message, appended last.

    Returns:
        A list of ``{'role': ..., 'content': ...}`` dicts: the system message,
        one user/assistant pair per retained turn, then the current message.
    """
    # Histories longer than ten turns are cut down to the first three turns
    # plus the five most recent ones.
    turns = chat_history
    if len(turns) > 10:
        turns = turns[:3] + turns[-5:]

    messages = [{'role': 'system', 'content': system_prompt}]
    for user_text, bot_text in turns:
        messages += [
            {'role': 'user', 'content': user_text},
            {'role': 'assistant', 'content': bot_text},
        ]
    messages.append({'role': 'user', 'content': curr_message})
    return messages
# Converts the chat history into the format expected by the Google AI API
def to_google_history(chat_history, curr_user_message=None):
    """Build the Google AI history list from Gradio-style chat turns.

    Args:
        chat_history: Sequence of ``(user_message, bot_message)`` pairs.
        curr_user_message: Optional new user message; appended as a final
            ``user`` entry when it is not ``None``.

    Returns:
        A list of ``{'role': 'user' | 'model', 'parts': [text]}`` dicts.
    """
    history = []
    for user_text, model_text in chat_history:
        history.extend((
            {'role': 'user', 'parts': [user_text]},
            {'role': 'model', 'parts': [model_text]},
        ))

    if curr_user_message is None:
        return history

    history.append({'role': 'user', 'parts': [curr_user_message]})
    return history
import json | |
# Tool list passed as `tools=` to the OpenAI chat completion call in `respond`.
# It declares a single function, `get_matches_info`, which takes one required
# string argument, `date_str`.
TOOLS_SPECIFICATION_OPENAI = [
    {
        "type": "function",
        "function": {
            "name": "get_matches_info",
            "description": "Use this function to retrieve information about football (soccer) matches from the most important leagues. Time of the matches is given in Brazilian timezone.",
                        # Disabled extension of the description above:
                        #+ "Returns a string with one match per line; or empty string if the service is not available now.",
            "parameters": {
                "type": "object",
                "properties": {
                    "date_str": {
                        "type": "string",
                        "description": "Must be one of these: 'yesterday', 'today' or 'tomorrow'. No other option is valid."
                    }
                },
                "required": ["date_str"],
            },
        }
    }
]
def process_wave(audio_bytes):
    """Decode in-memory WAV data into a ``(frame_rate, samples)`` pair.

    Args:
        audio_bytes: Complete contents of a PCM WAV file, as ``bytes``.

    Returns:
        A tuple ``(frame_rate, audio_array)``. ``audio_array`` is a NumPy array
        whose dtype follows the file's sample width. It is one-dimensional for
        mono audio and has shape ``(frames, channels)`` for multi-channel audio.

    Raises:
        ValueError: If the sample width is not 1, 2 or 4 bytes.
        wave.Error: If ``audio_bytes`` is not a readable WAV file.
    """
    # WAV PCM stores 8-bit samples unsigned and wider samples as signed
    # little-endian integers.
    sample_dtypes = {1: np.dtype('u1'), 2: np.dtype('<i2'), 4: np.dtype('<i4')}

    # The context manager closes the reader even when decoding fails.
    with wave.open(io.BytesIO(audio_bytes), 'rb') as wave_file:
        frame_rate = wave_file.getframerate()
        num_channels = wave_file.getnchannels()
        sample_width = wave_file.getsampwidth()
        raw_frames = wave_file.readframes(wave_file.getnframes())

    if sample_width not in sample_dtypes:
        raise ValueError(f"Unsupported WAV sample width: {sample_width} bytes")

    audio_array = np.frombuffer(raw_frames, dtype=sample_dtypes[sample_width])
    if num_channels > 1:
        # Samples are interleaved per frame; expose one column per channel.
        audio_array = audio_array.reshape(-1, num_channels)
    return (frame_rate, audio_array)
def respond(system_prompt, user_message, chat_history, temperature, persona="echo"):
    """Generate the assistant's reply to ``user_message`` and synthesize it as audio.

    The reply comes from OpenAI when ``USE_OPENAI_FOR_CHAT`` is true, otherwise
    from the Google model. In both cases the speech is produced with the OpenAI
    text-to-speech endpoint.

    Args:
        system_prompt: System message that sets the assistant's persona.
        user_message: The user's new message.
        chat_history: List of ``(user_message, bot_message)`` pairs. It is
            mutated in place: the new turn is appended.
        temperature: Sampling temperature for the chat model.
        persona: Name of the OpenAI text-to-speech voice.

    Returns:
        ``("", chat_history, (frame_rate, audio_array))``: an empty string for
        the message textbox, the updated history, and the decoded reply audio.

    Raises:
        gr.Error: If no OpenAI client has been configured.
        ValueError: If the model requests a tool other than ``get_matches_info``.
    """
    # Text-to-speech always uses OpenAI, so a client is needed on both chat
    # paths; fail early with a readable message instead of an AttributeError.
    if OPENAI_CLIENT is None:
        raise gr.Error("Chave da API da OpenAI não configurada. Informe-a em 'Configurações'.")

    if USE_OPENAI_FOR_CHAT:
        chat_model = "gpt-3.5-turbo-0125"
        openai_history = to_openai_chat_history(system_prompt, chat_history, user_message)
        completion = OPENAI_CLIENT.chat.completions.create(messages=openai_history,
                                                           temperature=temperature,
                                                           tools=TOOLS_SPECIFICATION_OPENAI,
                                                           model=chat_model)
        bot_response = completion.choices[0].message

        if bot_response.tool_calls:
            print("Processing tool call...")
            # The assistant message carrying the tool calls must precede the
            # tool results, and every tool call needs its own "tool" message.
            openai_history.append(bot_response)
            for tool_call in bot_response.tool_calls:
                if tool_call.function.name != "get_matches_info":
                    raise ValueError("Invalid tool call in response.")
                date_str = json.loads(tool_call.function.arguments)["date_str"]
                results = get_matches_info(date_str)
                openai_history.append({"role": "tool",
                                       "tool_call_id": tool_call.id,
                                       "content": results})

            # `tools` is not passed on this call, to save tokens
            completion = OPENAI_CLIENT.chat.completions.create(messages=openai_history,
                                                               temperature=temperature,
                                                               model=chat_model)
            bot_response = completion.choices[0].message

        assistant_msg = bot_response.content
    else:
        GOOGLE_GEN_CONFIG.temperature = temperature
        model = genai.GenerativeModel('gemini-1.5-pro-latest',
                                      system_instruction=system_prompt,
                                      tools=[get_matches_info],
                                      generation_config=GOOGLE_GEN_CONFIG)
        google_history = to_google_history(chat_history)
        chat = model.start_chat(history=google_history,
                                enable_automatic_function_calling=True)
        bot_response = chat.send_message(user_message)
        assistant_msg = bot_response.text

    # synthesize the reply as audio
    response = OPENAI_CLIENT.audio.speech.create(
        model="tts-1",
        voice=persona,
        input=assistant_msg,
        response_format='wav'  # 'mp3' could probably be used if saving to a file
    )

    # append the turn to the chat, in the data type expected by Gradio
    chat_history.append((user_message, assistant_msg))
    return "", chat_history, process_wave(response.content)
def reset_and_apply(voice):
    """Return the initial chat history and the audio file name for ``voice``.

    Args:
        voice: Persona/voice name embedded in the audio file name.

    Returns:
        A tuple ``(chat_history, audio_path)``: a single greeting turn with an
        empty user message, and the ``.wav`` file name for that voice.
    """
    greeting_turn = ("", "Olá, vamos falar de futebol?")
    audio_path = f"{AUDIO_OUT_FILE_PREFIX}-001-{voice}.wav"
    return [greeting_turn], audio_path
def reset_openai_client(openai_key):
    """Store a new OpenAI key and rebuild the client when the key is non-empty.

    Args:
        openai_key: The key typed by the user. An empty string is stored but
            leaves the current client object untouched.
    """
    global USE_OPENAI_FOR_CHAT, OPENAI_CLIENT, OPENAI_API_KEY

    OPENAI_API_KEY = openai_key
    # OpenAI handles the chat whenever no Google key is configured.
    USE_OPENAI_FOR_CHAT = (GOOGLE_API_KEY == "")

    if openai_key == "":
        return
    OPENAI_CLIENT = OpenAI(api_key=openai_key)
def reset_google_client(google_key):
    """Store a new Google key and configure the Google SDK when it is non-empty.

    Args:
        google_key: The key typed by the user. An empty string switches the
            chat back to OpenAI.
    """
    global GOOGLE_API_KEY, USE_OPENAI_FOR_CHAT

    GOOGLE_API_KEY = google_key
    USE_OPENAI_FOR_CHAT = (google_key == "")

    if google_key == "":
        return
    genai.configure(api_key=google_key)
def on_voice_change(voice):
    """Build the system message and temperature for the selected persona.

    Args:
        voice: Key into ``PERSONALITIES``.

    Returns:
        A tuple ``(system_message, temperature)``: the filled-in template and
        the persona's temperature.

    Raises:
        KeyError: If ``voice`` is not a key of ``PERSONALITIES``.
    """
    description, temperature, sex = PERSONALITIES[voice]

    if sex == "F":
        genre = "menina"
    else:
        genre = "menino"

    system_message = TEMPLATE_SYSTEM_MESSAGE.format(
        GENRE=genre,
        NAME=voice.upper(),
        PERSONALITY=description,
    )
    return system_message, temperature
# Speech-recognition backend, selected once at import time.
# With pipeline (downloaded model)
if USE_LOCAL_ASR_PIPELINE:
    from transformers import pipeline
    import numpy as np
    # `global` has no effect at module scope; the name is module-level either way.
    global ASR_PIPELINE
    ASR_PIPELINE = pipeline(task="automatic-speech-recognition",
                            #model="openai/whisper-large-v3")
                            model="openai/whisper-small")
else:
    # Serverless variant: `transcribe` posts the audio bytes to the endpoint below.
    import requests
    # `global` has no effect at module scope; the names are module-level either way.
    global ASR_API_URL, ASR_API_HEADERS
    HF_KEY = os.environ['HUGGINGFACE_API_KEY']
    # Serverless API endpoint for OpenAI's Whisper model
    #ASR_API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3"
    ASR_API_URL = "https://api-inference.huggingface.co/models/openai/whisper-small"
    ASR_API_HEADERS = {"Authorization": f"Bearer {HF_KEY}"}
def transcribe(audio_file):
    """Transcribe an audio file to text with the configured backend.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        The ``"text"`` field of the backend's response.
    """
    if USE_LOCAL_ASR_PIPELINE:
        return ASR_PIPELINE(audio_file)["text"]

    # using serverless API: send the raw file bytes as the request body
    with open(audio_file, "rb") as audio_handle:
        payload = audio_handle.read()
    reply = requests.post(ASR_API_URL, headers=ASR_API_HEADERS, data=payload)
    return reply.json()["text"]
def transcribe_and_respond(audio_in, system_txtbox, user_msg_txb, *args):
    """Transcribe a recorded message and answer it through ``respond``.

    Args:
        audio_in: Path of the recorded audio file.
        system_txtbox: System message forwarded to ``respond``.
        user_msg_txb: Current textbox content; accepted but not used.
        *args: Remaining positional arguments forwarded to ``respond``.

    Returns:
        Whatever ``respond`` returns.
    """
    return respond(system_txtbox, transcribe(audio_in), *args)
# Gradio interface: chat area, audio playback, text/microphone input and settings.
with gr.Blocks() as demo:
    # initial chat history (greeting turn) and initial audio file name for the default persona
    initial_chat_history, initial_audio = reset_and_apply(INITIAL_PERSON)

    chatbot_area = gr.Chatbot(value=initial_chat_history)
    audio_out = gr.Audio(label="Escute a última mensagem", value=initial_audio, autoplay=True, interactive=False)
    user_msg_txb = gr.Textbox(label="Mensagem")
    audio_in = gr.Audio(label="Mensagem de Áudio", sources=['microphone'], interactive=True, type='filepath')
    submit_btn = gr.Button("Enviar")
    #clear_btn = gr.ClearButton(components=[user_msg, chatbot], value="Clear console")
    reset_btn = gr.Button("Reiniciar")

    with gr.Accordion(label="Configurações",open=False):
        # Typing a key rebuilds the OpenAI client through reset_openai_client.
        openai_key = gr.Textbox(label="OpenAI API Key (GPT e vozes)", value="", placeholder="Insira a chave aqui")
        openai_key.change(reset_openai_client, inputs=[openai_key])
        # Google key input is disabled.
        #openai_key = gr.Textbox(label="Google API Key (Gemini 1.5)", value="", placeholder="Insira a chave aqui")
        #openai_key.change(reset_google_client, inputs=[openai_key])

        # voice and personality options
        voice_ddown = gr.Dropdown(label="Personalidade (muda os dois abaixo)", choices=["nova", "echo"], value=INITIAL_PERSON)
        initial_system_message, initial_temperature = on_voice_change(INITIAL_PERSON)
        temperature_sldr = gr.Slider(label="Diversidade de respostas", minimum=0.0, maximum=1.0, value=initial_temperature, step=0.1)
        with gr.Accordion(label="Avançado",open=False):
            # the initial value is the system message built from the name and personality chosen by the controls above
            system_txtbox = gr.Textbox(label="System message", lines=3, value=initial_system_message)
        # Changing the persona refreshes both the system message and the temperature slider.
        voice_ddown.change(on_voice_change, inputs=[voice_ddown], outputs=[system_txtbox, temperature_sldr])
        #gr.Markdown("*Clique em 'Reiniciar' para aplicar as (a maior parte das) configurações.*")

    # "Reiniciar" restores the greeting turn and the audio file name for the selected voice.
    reset_btn.click(reset_and_apply, inputs=[voice_ddown], outputs=[chatbot_area, audio_out])
    # When a microphone recording stops, it is transcribed and answered.
    audio_in.stop_recording( transcribe_and_respond, inputs=[audio_in, system_txtbox, user_msg_txb, chatbot_area, temperature_sldr, voice_ddown], outputs=[user_msg_txb, chatbot_area, audio_out] )
    submit_btn.click(respond, inputs=[system_txtbox, user_msg_txb, chatbot_area, temperature_sldr, voice_ddown], outputs=[user_msg_txb, chatbot_area, audio_out]) # Click on the button
    user_msg_txb.submit(respond, inputs=[system_txtbox, user_msg_txb, chatbot_area, temperature_sldr, voice_ddown], outputs=[user_msg_txb, chatbot_area, audio_out]) # Press enter to submit - same effect

demo.queue().launch(share=False)