audio-to-text

Runtime error

File size: 8,067 Bytes

167bfa7
 
fc90c72
167bfa7
 
 
 
 
 
24accb9
167bfa7
 
fc90c72
da29724
 
 
408b918
da29724
 
 
87d0504
 
 
37d1a28
87d0504
 
da29724
18f7da1
167bfa7
18f7da1
 
167bfa7
cd58f6d
167bfa7
 
fc90c72
167bfa7
5142028
167bfa7
 
da29724
 
 
 
 
167bfa7
 
 
24accb9
167bfa7
 
 
 
24accb9
167bfa7
 
24accb9
167bfa7
 
 
24accb9
167bfa7
24accb9
167bfa7
 
 
 
24accb9
167bfa7
 
 
24accb9
 
167bfa7
 
 
 
24accb9
 
167bfa7
 
24accb9
167bfa7
 
24accb9
167bfa7
 
 
24accb9
167bfa7
 
 
 
 
24accb9
167bfa7
24accb9
167bfa7
 
24accb9
167bfa7
 
 
24accb9
167bfa7
24accb9
 
167bfa7
 
 
24accb9
167bfa7
 
 
 
 
 
 
24accb9
167bfa7
 
24accb9
167bfa7
 
d3a9b11
24accb9
167bfa7
 
 
 
 
24accb9
d3a9b11
7cdad84
24accb9
d3a9b11
24accb9
 
d3a9b11
 
 
 
 
 
 
 
 
 
 
 
24accb9
d3a9b11
24accb9
 
 
 
167bfa7
 
7cdad84
24accb9
 
18f7da1
24accb9
 
 
 
 
 
 
9cc6fc1
24accb9
d3a9b11
 
24accb9
 
d3a9b11
7cdad84
24accb9
d3a9b11
 
 
 
 
 
 
 
 
 
 
 
 
167bfa7
d3a9b11
 
167bfa7
 
24accb9
d3a9b11
167bfa7
da29724
d3a9b11
167bfa7
d3a9b11
da29724
d3a9b11
 
167bfa7
da29724
 
d3a9b11
167bfa7
da29724
87d0504
167bfa7
1f81c82
d3a9b11

import os
import re
import gradio as gr
import tempfile
from pydub import AudioSegment
from pydub.utils import which
import edge_tts
import asyncio
import nest_asyncio
import requests
nest_asyncio.apply()
from openai import OpenAI


# Runtime configuration, read from environment variables. os.getenv returns
# None for any variable that is not set, so every value below may be None.
secret=os.getenv("SECRET")  # only aliased to OPENAI_API_KEY below
url=os.getenv("SRVC")  # base URL handed to the OpenAI client
url_audio=os.getenv("TRANSCRIPTION")  # speech-to-text endpoint URL
key=os.getenv("KEY")  # API key handed to the OpenAI client
model=os.getenv("MODEL")  # model name used for chat completions

# HTML snippet passed as the Gradio interface description.
description = """
<center>
<img src="https://huggingface.co/spaces/rodrigomasini/audio-to-text/resolve/main/chagas.png" width=200px>
<strong>Primeiro assistente de IA de voz do Brasil</strong>
</center>
"""

# NOTE(review): OPENAI_API_KEY is not referenced anywhere else in the visible
# code; the client below authenticates with `key`, not `secret` - confirm
# whether this alias is still needed.
OPENAI_API_KEY = secret
# Chat-completions client pointed at the configured base URL.
sync_client = OpenAI(
    base_url=url,
    api_key=key
)

# Tell pydub which ffmpeg executable to use, as resolved by pydub's `which`
# helper (presumably None when ffmpeg is not on PATH - TODO confirm).
AudioSegment.converter = which("ffmpeg")

# Speech-to-text endpoint that `transcript` POSTs the converted mp3 file to.
TELA_TRANSCRIPT_AUDIO_URL = url_audio

# System prompt sent as the first message of every chat-completion request.
system_instruction = """
A partir de agora, o seu nome é Chagas, um assistente virtual que fala português.
Durante a interação com o usuário, você deve responder e manter a conversa de forma amigável, concisa, clara e aberta.
Evite qualquer introdução desnecessária.
Responda em um tom amigável de conversação e sempre empático e suportivo.
Nunca retorne a sua resposta em formato Markdown. E sempre, sempre retorne na forma de frases, mesmo se a sua resposta for uma lista.
"""

def convert_to_mp3(audio_file_path):
    """Convert an audio file to a temporary MP3 file.

    Args:
        audio_file_path: Path of the source audio file, in any format that
            ``AudioSegment.from_file`` can read.

    Returns:
        The path of the newly created ``.mp3`` temporary file, or ``None``
        when the conversion fails. On success the caller owns the file and
        is responsible for deleting it.
    """
    print("[DEBUG] Starting audio conversion to mp3.")
    # Only a unique path is needed. Closing the handle immediately avoids
    # leaking a file descriptor for every conversion.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_mp3:
        mp3_path = temp_mp3.name

    try:
        audio = AudioSegment.from_file(audio_file_path)
        audio.export(mp3_path, format="mp3")
    except Exception as e:
        print(f"[ERROR] Error converting audio: {e}")
        # The placeholder was created with delete=False, so remove it here;
        # otherwise every failed conversion leaves a stray file behind.
        try:
            os.remove(mp3_path)
        except OSError as cleanup_error:
            print(f"[ERROR] Error deleting temporary file: {cleanup_error}")
        return None

    print(f"[DEBUG] Successfully converted to mp3: {mp3_path}")
    return mp3_path

def transcript(audio_file_path, timeout=120):
    """Transcribe an audio file through the remote speech-to-text endpoint.

    The audio is first converted to a temporary mp3 file, which is uploaded
    as the ``file`` form field and always deleted afterwards.

    Args:
        audio_file_path: Path of the recorded audio, or ``None``/empty when
            nothing was recorded.
        timeout: Seconds to wait for the endpoint before giving up. Without
            a timeout a stalled server would block the request forever.

    Returns:
        The decoded JSON body on HTTP 200. On any failure, a dict of the
        form ``{"data": "failed", "error": <message>}``.
    """
    print("[DEBUG] Starting transcription process.")
    if not audio_file_path:
        print("[ERROR] No audio file provided.")
        return {"data": "failed", "error": "No audio file provided."}

    mp3_file_path = convert_to_mp3(audio_file_path)
    if not mp3_file_path:
        print("[ERROR] Failed to convert audio to mp3.")
        return {"data": "failed", "error": "Failed to convert audio to mp3."}

    try:
        print("[DEBUG] Sending mp3 to transcription endpoint.")
        print(f"[DEBUG] Transcription API URL: {TELA_TRANSCRIPT_AUDIO_URL}")
        with open(mp3_file_path, 'rb') as f:
            files = {'file': f}
            response = requests.post(
                TELA_TRANSCRIPT_AUDIO_URL, files=files, timeout=timeout
            )

        print(f"[DEBUG] Response Status Code: {response.status_code}")
        print(f"[DEBUG] Response Text: {response.text}")

        if response.status_code == 200:
            print("[DEBUG] Successfully received transcription.")
            return response.json()

        print(f"[ERROR] Unexpected status code {response.status_code}: {response.text}")
        return {"data": "failed", "error": f"Error {response.status_code}: {response.text}"}

    except Exception as e:
        # Boundary handler: network errors, timeouts and invalid JSON all
        # become a failure payload rather than propagating to the UI.
        print(f"[ERROR] Exception during transcription: {e}")
        return {"data": "failed", "error": str(e)}
    finally:
        # The converted mp3 is only needed for the upload.
        if mp3_file_path and os.path.exists(mp3_file_path):
            try:
                os.remove(mp3_file_path)
                print("[DEBUG] Temporary mp3 file deleted.")
            except OSError as e:
                print(f"[ERROR] Error deleting temporary file: {e}")

def extract_user_input(transcription_response):
    """Join the transcribed segment texts into a single user utterance.

    Args:
        transcription_response: Payload returned by ``transcript``. The
            segments are read from its ``"result"`` list, each one a mapping
            with a ``"text"`` entry.

    Returns:
        The concatenated segment texts with surrounding whitespace removed,
        or ``""`` when the payload has no usable segments (failure payloads,
        a missing or ``None`` result, malformed segments, or a payload that
        is not a dict).
    """
    print("[DEBUG] Extracting user input from transcription response.")
    if not isinstance(transcription_response, dict):
        # A JSON body can decode to a list, string or None, none of which
        # carries a "result" entry.
        print(
            "[ERROR] Unexpected transcription response type: "
            f"{type(transcription_response).__name__}"
        )
        return ""

    # `or []` also covers an explicit `"result": null` in the payload.
    transcript_segments = transcription_response.get('result') or []
    try:
        user_input = "".join(segment['text'] for segment in transcript_segments).strip()
    except KeyError as e:
        print(f"[ERROR] KeyError in transcription response: {e}")
        return ""
    except TypeError as e:
        # Segments that are not mappings, or whose text is not a string.
        print(f"[ERROR] Malformed transcription segments: {e}")
        return ""

    print(f"[DEBUG] Extracted user input: {user_input}")
    return user_input

def generate_speech(text):
    """Synthesize ``text`` with edge-tts and return the audio file path.

    Args:
        text: The text to speak, using the ``pt-BR-AntonioNeural`` voice.

    Returns:
        The path of a temporary audio file holding the speech, or ``None``
        when ``text`` is empty or synthesis fails. On success the caller
        owns the file.
    """
    print("[DEBUG] Generating speech from text.")
    if not text or not text.strip():
        print("[ERROR] Error generating TTS: no text to synthesize.")
        return None

    # Only a unique path is needed, so the handle is closed right away
    # instead of being leaked. The suffix is .mp3 because edge-tts writes
    # MP3 data (its documented examples save to .mp3 files); a .wav name
    # would mislabel the content.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tts_file:
        tts_path = tts_file.name

    async def generate_tts():
        tts = edge_tts.Communicate(text, voice="pt-BR-AntonioNeural")
        await tts.save(tts_path)

    try:
        asyncio.run(generate_tts())
    except Exception as e:
        print(f"[ERROR] Error generating TTS: {e}")
        # Remove the placeholder so failed syntheses do not pile up on disk.
        try:
            os.remove(tts_path)
        except OSError as cleanup_error:
            print(f"[ERROR] Error deleting temporary file: {cleanup_error}")
        return None

    print(f"[DEBUG] TTS audio saved to: {tts_path}")
    return tts_path

def chatbot_conversation(audio_file_path, history):
    """Run one voice turn: transcribe, stream the chat reply, then speak it.

    This is a generator. Each yielded value is a
    ``(text, audio_path, history)`` tuple: the text so far (or an error
    message), the TTS file path (``None`` until the final yield, and when
    synthesis fails), and the conversation history.

    Args:
        audio_file_path: Path of the recorded user audio, or ``None``.
        history: List of ``[user_message, assistant_message]`` pairs, each
            message a ``{"role": ..., "content": ...}`` dict; ``None`` is
            treated as an empty history. The list is extended in place
            after a completed reply.
    """
    print("[DEBUG] Starting chatbot conversation.")
    # Normalise before the first yield so no path hands None back as state.
    if history is None:
        history = []

    try:
        transcription = transcript(audio_file_path)
        user_input = extract_user_input(transcription)

        if not user_input:
            print("[ERROR] No user input extracted from transcription.")
            yield "I could not generate the text. Please try again.", None, history
            return

        # Rebuild the full message list: system prompt, past turns, new input.
        messages = [{"role": "system", "content": system_instruction}]
        for turn in history:
            user_msg = turn[0].get("content") if turn[0] else ""
            assistant_msg = turn[1].get("content") if turn[1] else ""
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
        messages.append({"role": "user", "content": user_input})

        print("[DEBUG] Sending request to sync_client for chat completion.")
        print(f"[DEBUG] Messages: {messages}")

        response = ""
        # Stream partial responses
        try:
            stream = sync_client.chat.completions.create(
                model=model,
                messages=messages,
                stream=True,
                max_tokens=1024,
                temperature=0,
                response_format={"type": "text"}
            )
            for chunk in stream:
                # Streamed chunks may carry no choices, or a delta whose
                # content is None (e.g. a role-only or finish chunk).
                # Appending None raised TypeError and threw away the whole
                # reply, so such chunks are skipped.
                if not chunk.choices:
                    continue
                token = chunk.choices[0].delta.content
                if not token:
                    continue
                print(token)
                response += token
                # Partial text only: no audio yet, history not updated yet.
                yield (response, None, history)
        except Exception as e:
            print(f"[ERROR] Error during streaming response: {e}")
            yield ("I could not understand you. Please try again.", None, history)
            return

        # The reply is complete; record the turn.
        history.append([
            {"role": "user", "content": user_input},
            {"role": "assistant", "content": response}
        ])

        # Synthesize once, on the full text, rather than per streamed token.
        print("[DEBUG] Generating TTS for full response.")
        tts_file_name = generate_speech(response)
        if tts_file_name:
            print("[DEBUG] Returning final response and TTS file with updated history.")
            yield (response, tts_file_name, history)
        else:
            print("[ERROR] Failed to generate TTS.")
            yield (response, None, history)

    except Exception as e:
        # Boundary handler so an unexpected error shows a message in the UI.
        print(f"[ERROR] Exception in chatbot_conversation: {e}")
        yield ("I could not understand you. Please try again.", None, history)

# Gradio UI wired to chatbot_conversation. The three outputs line up with the
# tuples that function yields: the assistant's reply text, the synthesized
# audio file path, and the updated conversation history.
interface = gr.Interface(
    fn=chatbot_conversation,
    inputs=[
        # The recording reaches the function as a file path (type="filepath").
        gr.Audio(label="Usuário", type="filepath", streaming=False, container=True),
        gr.State([])  # Conversation history, starting out empty
    ],
    outputs=[
        gr.Textbox(label="Resposta do Chagas"),
        # autoplay=True: the reply audio is set to play without a click.
        gr.Audio(type="filepath", autoplay=True, label="Chagas"),
        gr.State([])  # Receives the history yielded by the function
    ],
    title="Chagas - assistente de voz IA",
    description= description,
    theme="sudeepshouche/minimalist",
    # NOTE(review): live=True presumably re-runs the function whenever the
    # audio input changes, with no submit click - confirm against Gradio docs.
    live=True
)

# Enable the request queue, then start the app. This runs at import time.
interface.queue().launch()