Spaces:

Mistral-AI-Game-Jam
/

Team15

Running

Team15 / src /hackathon /speech /speech.py

Gabriel Vidal-Ayrinhac

process audio in memory

9d076e3 4 months ago

2.89 kB

	# Code copied from https://elevenlabs.io/docs/cookbooks/text-to-speech/streaming

	import base64
	from io import BytesIO
	from typing import IO

	import yaml
	from elevenlabs import VoiceSettings
	from elevenlabs.client import ElevenLabs

	from hackathon.config import settings

	# Shared ElevenLabs client, authenticated with the API key from the project settings.
	client = ElevenLabs(api_key=settings.ELEVENLABS_API_KEY)

	# Logical speaker names mapped to the ElevenLabs voice IDs used for them.
	voices = {
	    "politician1": "ohZqJahxofk8dkPKmd9F",
	    "politician2": "v7sy7EHXxN3ToffFQfvr",
	}
	# Spare voice_id kept just in case: "ohZqJahxofk8dkPKmd9F"
	# NOTE(review): this spare ID is identical to the one used for "politician1".


	def read_audio_config(yaml_path: str) -> dict:
	    """Load the audio configuration stored in a YAML file.

	    Args:
	        yaml_path: Path of the YAML file to read.

	    Returns:
	        The parsed document exactly as returned by ``yaml.safe_load``.
	        NOTE(review): the result is not validated to be a dict here.

	    Raises:
	        FileNotFoundError: If ``yaml_path`` does not exist.
	        ValueError: If the file content cannot be parsed as YAML.
	    """
	    try:
	        # Read as UTF-8 explicitly so parsing does not depend on the platform locale.
	        with open(yaml_path, "r", encoding="utf-8") as file:
	            return yaml.safe_load(file)
	    except FileNotFoundError as e:
	        # Chain the original error so the underlying OS details stay in the traceback.
	        raise FileNotFoundError(f"The file at path '{yaml_path}' does not exist.") from e
	    except yaml.YAMLError as e:
	        raise ValueError(f"Error parsing YAML file: {e}") from e


	def read_audio_file(audio_path: str):
	    """Return the content of the file at ``audio_path`` as a base64 string.

	    The file is read in binary mode, base64-encoded, and the encoded bytes
	    are decoded as UTF-8 text before being returned.
	    """
	    with open(audio_path, "rb") as handle:
	        raw_bytes = handle.read()
	    encoded = base64.b64encode(raw_bytes)
	    return encoded.decode("utf-8")


	def text_to_speech_file(
	    text: str,
	    voice_id: str,
	    stability=0.5,
	    similarity=1.0,
	    style=0.3,
	    base_path="audio_store",
	) -> str:
	    """Synthesize ``text`` with ElevenLabs and return the MP3 audio as base64 text.

	    The audio is assembled in memory; this function writes nothing to disk.

	    Args:
	        text: Text to convert to speech.
	        voice_id: ElevenLabs voice ID, passed to the API unchanged
	            (it is not looked up in the ``voices`` mapping).
	        stability: Forwarded as ``VoiceSettings.stability``.
	        similarity: Forwarded as ``VoiceSettings.similarity_boost``.
	        style: Forwarded as ``VoiceSettings.style``.
	        base_path: Unused; kept so existing callers that pass it keep working.

	    Returns:
	        The generated audio bytes, base64-encoded and decoded as a UTF-8 string.
	    """
	    response = client.text_to_speech.convert(
	        voice_id=voice_id,
	        output_format="mp3_44100_32",
	        text=text,
	        model_id="eleven_turbo_v2_5",  # use the turbo model for low latency
	        voice_settings=VoiceSettings(
	            # Honor the caller's values instead of hard-coded literals; the
	            # parameter defaults equal the values that used to be hard-coded.
	            stability=stability,
	            similarity_boost=similarity,
	            style=style,
	            use_speaker_boost=True,
	        ),
	    )

	    # The response is consumed chunk by chunk; empty chunks are skipped.
	    audio_bytes = b"".join(chunk for chunk in response if chunk)
	    return base64.b64encode(audio_bytes).decode("utf-8")


	def text_to_speech_stream(
	    text: str, voice: str, stability=0.5, similarity=1.0, style=0.3
	) -> IO[bytes]:
	    """Convert ``text`` to speech and return the MP3 data as an in-memory stream.

	    ``voice`` is a key of the module-level ``voices`` mapping
	    (politician1 or politician2); an unknown key raises ``KeyError``.

	    NOTE(review): ``stability``, ``similarity`` and ``style`` are accepted but
	    not used; the voice settings sent to the API are the fixed values below,
	    which differ from the signature defaults.
	    """
	    selected_voice_id = voices[voice]
	    fixed_settings = VoiceSettings(
	        stability=0.0,
	        similarity_boost=1.0,
	        style=0.0,
	        use_speaker_boost=True,
	    )
	    chunks = client.text_to_speech.convert(
	        voice_id=selected_voice_id,
	        output_format="mp3_22050_32",
	        text=text,
	        model_id="eleven_multilingual_v2",
	        voice_settings=fixed_settings,
	    )

	    # Gather every non-empty chunk into a single in-memory buffer.
	    buffer = BytesIO()
	    buffer.writelines(chunk for chunk in chunks if chunk)

	    # Rewind so the caller starts reading at the first byte.
	    buffer.seek(0)
	    return buffer