# audio-to-text / app.py
# Source: Hugging Face Space "rodrigomasini/audio-to-text" (commit b81e668)
import os
import re
import gradio as gr
import tempfile
from pydub import AudioSegment
from pydub.utils import which
import edge_tts
import asyncio
import nest_asyncio
import requests
nest_asyncio.apply()
from openai import OpenAI
# Deployment configuration pulled from environment variables (Hugging Face
# Space secrets); nothing sensitive is hard-coded in the repository.
secret=os.getenv("SECRET")            # API secret (see OPENAI_API_KEY below)
url=os.getenv("SRVC")                 # base URL of the OpenAI-compatible LLM server
url_audio=os.getenv("TRANSCRIPTION")  # speech-to-text endpoint URL
key=os.getenv("KEY")                  # api_key handed to the OpenAI client
# HTML banner rendered above the Gradio interface (logo + Portuguese tagline).
description = """
<center>
<img src="https://huggingface.co/spaces/rodrigomasini/audio-to-text/resolve/main/chagas.png" width=200px>
<strong>Primeiro assistente de IA de voz do Brasil</strong>
</center>
"""
OPENAI_API_KEY = secret  # NOTE(review): assigned but never read in this file — `key` is what the client uses; confirm before removing
# OpenAI-compatible client pointed at the custom inference server.
sync_client = OpenAI(
    base_url=url,
    api_key=key
)
# Ensuring pydub can locate ffmpeg
AudioSegment.converter = which("ffmpeg")
# TELA endpoint for speech-to-text generation
TELA_TRANSCRIPT_AUDIO_URL = url_audio
# System prompt (Portuguese): defines the "Chagas" health-assistant persona
# and forbids Markdown / numbered-list formatting so replies read naturally
# when converted to speech.
system_instruction = """
A partir de agora, o seu nome é Chagas, um assistente virtual de saúde que fala português.
Durante a interação com o usuário, você deve responder e manter a conversa de forma amigável, concisa, clara e aberta.
Evite qualquer introdução desnecessária.
Responda em um tom amigável de conversação e sempre empático e suportivo.
Nunca retorne a sua resposta em formato Markdown. E sempre, sempre retorne na forma de frases, mesmo se a sua resposta for uma lista.
Novamente, apenas frases, mesmo que você queira realçar várias etapas como uma lista numerada e colocando markdown com asteriscos, não o faça! Apenas frases.
"""
def convert_to_mp3(audio_file_path):
    """Convert an arbitrary audio file to MP3 via pydub/ffmpeg.

    Parameters
    ----------
    audio_file_path : str
        Path to the source audio file (any format ffmpeg understands).

    Returns
    -------
    str | None
        Path to a temporary ``.mp3`` file on success, ``None`` on failure.
        The caller is responsible for deleting the returned file.
    """
    print("[DEBUG] Starting audio conversion to mp3.")
    # delete=False so the path outlives this function; close the handle
    # immediately so ffmpeg can write to the same path (required on Windows)
    # and we don't leak an open file descriptor.
    temp_mp3 = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    temp_mp3.close()
    try:
        audio = AudioSegment.from_file(audio_file_path)
        audio.export(temp_mp3.name, format="mp3")
        print(f"[DEBUG] Successfully converted to mp3: {temp_mp3.name}")
        return temp_mp3.name
    except Exception as e:
        print(f"[ERROR] Error converting audio: {e}")
        # Remove the orphaned temp file on failure instead of leaking it.
        try:
            os.remove(temp_mp3.name)
        except OSError:
            pass
        return None
def transcript(audio_file_path):
    """Transcribe an audio file via the TELA speech-to-text endpoint.

    The input is first converted to MP3; the intermediate MP3 is always
    deleted before returning.

    Parameters
    ----------
    audio_file_path : str | None
        Path to the recorded audio file, or ``None`` if nothing was recorded.

    Returns
    -------
    dict
        The endpoint's JSON payload on success, otherwise
        ``{"data": "failed", "error": "<reason>"}``.
    """
    print("[DEBUG] Starting transcription process.")
    if audio_file_path is None:
        print("[ERROR] No audio file provided.")
        return {"data": "failed", "error": "No audio file provided."}
    mp3_file_path = convert_to_mp3(audio_file_path)
    if not mp3_file_path:
        print("[ERROR] Failed to convert audio to mp3.")
        return {"data": "failed", "error": "Failed to convert audio to mp3."}
    try:
        print("[DEBUG] Sending mp3 to transcription endpoint.")
        print(f"[DEBUG] Transcription API URL: {TELA_TRANSCRIPT_AUDIO_URL}")
        with open(mp3_file_path, 'rb') as f:
            files = {'file': f}
            # Explicit timeout so a hung endpoint cannot block this worker
            # forever (requests has no default timeout).
            response = requests.post(TELA_TRANSCRIPT_AUDIO_URL, files=files, timeout=120)
        print(f"[DEBUG] Response Status Code: {response.status_code}")
        print(f"[DEBUG] Response Text: {response.text}")
        if response.status_code == 200:
            print("[DEBUG] Successfully received transcription.")
            return response.json()
        print(f"[ERROR] Unexpected status code {response.status_code}: {response.text}")
        return {"data": "failed", "error": f"Error {response.status_code}: {response.text}"}
    except Exception as e:
        print(f"[ERROR] Exception during transcription: {e}")
        return {"data": "failed", "error": str(e)}
    finally:
        # Always remove the intermediate MP3, on success and failure alike.
        if mp3_file_path and os.path.exists(mp3_file_path):
            try:
                os.remove(mp3_file_path)
                print("[DEBUG] Temporary mp3 file deleted.")
            except OSError as e:
                print(f"[ERROR] Error deleting temporary file: {e}")
def extract_user_input(transcription_response):
    """Join the text of all transcript segments into one user utterance.

    Parameters
    ----------
    transcription_response : dict | None
        Payload returned by :func:`transcript`; the transcription lives in
        ``response['result']`` as a list of ``{'text': ...}`` segments.
        Failure payloads (no ``'result'`` key) and malformed responses are
        tolerated.

    Returns
    -------
    str
        The concatenated, stripped transcription, or ``""`` when nothing
        usable is present.
    """
    print("[DEBUG] Extracting user input from transcription response.")
    try:
        transcript_segments = transcription_response.get('result', [])
        # Use .get per segment so one malformed segment no longer discards
        # the entire transcription (the old KeyError path returned "").
        user_input = "".join(segment.get('text', '') for segment in transcript_segments)
        print(f"[DEBUG] Extracted user input: {user_input.strip()}")
        return user_input.strip()
    except (AttributeError, TypeError, KeyError) as e:
        # AttributeError/TypeError: response is None or not dict-shaped.
        print(f"[ERROR] Malformed transcription response: {e}")
        return ""
def generate_speech(text):
    """Synthesize *text* to a WAV file using the edge-tts pt-BR voice.

    Parameters
    ----------
    text : str
        The assistant reply to speak aloud.

    Returns
    -------
    str | None
        Path to the generated audio file, or ``None`` on failure.
        The caller is responsible for deleting the returned file.
    """
    print("[DEBUG] Generating speech from text.")
    tts_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    # Close the handle right away: edge-tts writes to the path itself, and
    # keeping the descriptor open would leak it (and fail on Windows).
    tts_file.close()

    async def generate_tts():
        tts = edge_tts.Communicate(text, voice="pt-BR-AntonioNeural")
        await tts.save(tts_file.name)

    try:
        # nest_asyncio.apply() at module level lets asyncio.run() work even
        # when an event loop is already running (as it is under Gradio).
        asyncio.run(generate_tts())
        print(f"[DEBUG] TTS audio saved to: {tts_file.name}")
        return tts_file.name
    except Exception as e:
        print(f"[ERROR] Error generating TTS: {e}")
        # Remove the orphaned temp file instead of leaking it.
        try:
            os.remove(tts_file.name)
        except OSError:
            pass
        return None
def chatbot_conversation(audio_file_path, history):
    """Drive one voice-chat turn: transcribe, stream an LLM reply, then TTS.

    Generator consumed by Gradio. Yields ``(text, audio_path, history)``
    tuples: first a series of partial-text updates while tokens stream
    (audio ``None``, history unchanged), then one final tuple carrying the
    synthesized audio file and the updated history.

    history format: list of ``[user_message, assistant_message]`` pairs,
    each a ``{"role": ..., "content": ...}`` dict.
    """
    print("[DEBUG] Starting chatbot conversation.")
    try:
        transcription = transcript(audio_file_path)
        user_input = extract_user_input(transcription)
        if not user_input:
            # Nothing usable came back from speech-to-text; bail out early.
            print("[ERROR] No user input extracted from transcription.")
            yield "I could not generate the text. Please try again.", None, history
            return
        # Ensure we have a system_message
        system_message = system_instruction
        if history is None:
            history = []
        # Reconstruct messages from history: system prompt first, then the
        # prior turns flattened into alternating user/assistant messages.
        messages = [{"role": "system", "content": system_message}]
        for turn in history:
            user_msg = turn[0].get("content") if turn[0] else ""
            assistant_msg = turn[1].get("content") if turn[1] else ""
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
        # Add the current user input
        messages.append({"role": "user", "content": user_input})
        print("[DEBUG] Sending request to sync_client for chat completion.")
        print(f"[DEBUG] Messages: {messages}")
        response = ""
        # Stream partial responses
        try:
            for message in sync_client.chat.completions.create(
                model="marco-o1",
                messages=messages,
                stream=True,
                max_tokens=1024,
                temperature=0,
                response_format={"type": "text"}
            ):
                token = message.choices[0].delta.content
                if token:
                    # Strip chat-template control tokens the model may leak.
                    token = token.replace("<|im_start|>", "").replace("<|im_end|>", "")
                    print(token, end="")
                    response += token
                    # Yield partial text updates, no audio yet, history unchanged yet
                    yield (response, None, history)
        except Exception as e:
            print(f"[ERROR] Error during streaming response: {e}")
            yield ("I could not understand you. Please try again.", None, history)
            return
        # Now that we have the full response, update history
        history.append([
            {"role": "user", "content": user_input},
            {"role": "assistant", "content": response}
        ])
        # Generate TTS now
        print("[DEBUG] Generating TTS for full response.")
        tts_file_name = generate_speech(response)
        if tts_file_name:
            print("[DEBUG] Returning final response and TTS file with updated history.")
            # Now yield again with final text, audio, and updated history
            yield (response, tts_file_name, history)
        else:
            # TTS failed; still deliver the text reply and updated history.
            print("[ERROR] Failed to generate TTS.")
            yield (response, None, history)
    except Exception as e:
        # Last-resort guard so the Gradio worker never crashes mid-turn.
        print(f"[ERROR] Exception in chatbot_conversation: {e}")
        yield ("I could not understand you. Please try again.", None, history)
# Three outputs here: transcription text, audio, and the updated history
# Build and launch the Gradio UI. chatbot_conversation is a generator, so
# the textbox live-updates while the LLM reply streams in.
interface = gr.Interface(
    fn=chatbot_conversation,
    inputs=[
        gr.Audio(label="Usuário", type="filepath", streaming=False, container=True),
        gr.State([])  # State holds the conversation history
    ],
    outputs=[
        gr.Textbox(label="Resposta do Chagas"),
        gr.Audio(type="filepath", autoplay=True, label="Chagas"),  # autoplay speaks the reply
        gr.State([])  # Return updated history
    ],
    title="Chagas - assistente de saúde",
    description= description,
    theme="sudeepshouche/minimalist",
    live=True
)
# queue() enables generator/streaming outputs; launch() starts the server.
interface.queue().launch()