Gabriel Vidal-Ayrinhac
process audio in memory
9d076e3
# Code copied from https://elevenlabs.io/docs/cookbooks/text-to-speech/streaming
import base64
from io import BytesIO
from typing import IO
import yaml
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs
from hackathon.config import settings
# Shared ElevenLabs API client, authenticated with the key from project settings.
client = ElevenLabs(api_key=settings.ELEVENLABS_API_KEY)

# Logical speaker name -> ElevenLabs voice ID.
voices = {
    "politician1": "ohZqJahxofk8dkPKmd9F",
    "politician2": "v7sy7EHXxN3ToffFQfvr",
}
# Spare voice ID noted "just in case": "ohZqJahxofk8dkPKmd9F"
# (currently the same ID as "politician1" above).
def read_audio_config(yaml_path: str) -> dict:
    """Load the audio configuration stored in a YAML file.

    Args:
        yaml_path: Path to the YAML configuration file.

    Returns:
        The parsed document exactly as returned by ``yaml.safe_load``.
        NOTE(review): the annotation promises a ``dict``, but nothing here
        checks the document's top-level type (an empty file presumably
        parses to ``None``) -- confirm callers always supply a mapping.

    Raises:
        FileNotFoundError: If no file exists at ``yaml_path``.
        ValueError: If the file content cannot be parsed as YAML.
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the platform locale.
        with open(yaml_path, "r", encoding="utf-8") as file:
            return yaml.safe_load(file)
    except FileNotFoundError as e:
        # Chain the original error so its errno/filename details are kept.
        raise FileNotFoundError(
            f"The file at path '{yaml_path}' does not exist."
        ) from e
    except yaml.YAMLError as e:
        raise ValueError(f"Error parsing YAML file: {e}") from e
def read_audio_file(audio_path: str):
    """Return the contents of the file at ``audio_path`` as base64 text.

    The file is read in binary mode, base64-encoded, and the encoded bytes
    are decoded to a ``str`` using UTF-8.
    """
    with open(audio_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def text_to_speech_file(
    text: str,
    voice_id: str,
    stability=0.5,
    similarity=1.0,
    style=0.3,
    base_path="audio_store",
) -> str:
    """Synthesize ``text`` with ElevenLabs and return the MP3 as base64 text.

    The audio is assembled in memory; this function writes nothing to disk.

    Args:
        text: The text to convert to speech.
        voice_id: ElevenLabs voice ID, passed to the API as-is (a raw ID such
            as the values of the module-level ``voices`` mapping, not one of
            its ``politician1`` / ``politician2`` keys).
        stability: Forwarded as ``VoiceSettings.stability``.
        similarity: Forwarded as ``VoiceSettings.similarity_boost``.
        style: Forwarded as ``VoiceSettings.style``.
        base_path: Unused; kept so callers that still pass it keep working.

    Returns:
        The ``mp3_44100_32`` audio, base64-encoded and decoded to ``str``.
    """
    response = client.text_to_speech.convert(
        voice_id=voice_id,
        output_format="mp3_44100_32",
        text=text,
        model_id="eleven_turbo_v2_5",  # use the turbo model for low latency
        voice_settings=VoiceSettings(
            # Honor the caller's tuning values; the defaults above equal the
            # literals that used to be hard-coded here.
            stability=stability,
            similarity_boost=similarity,
            style=style,
            use_speaker_boost=True,
        ),
    )
    # The response is consumed chunk by chunk; empty chunks are skipped.
    audio_bytes = b"".join(chunk for chunk in response if chunk)
    return base64.b64encode(audio_bytes).decode("utf-8")
def text_to_speech_stream(
    text: str, voice: str, stability=0.0, similarity=1.0, style=0.0
) -> IO[bytes]:
    """Synthesize ``text`` with ElevenLabs and return the MP3 as a byte stream.

    Args:
        text: The text to convert to speech.
        voice: Key of the module-level ``voices`` mapping
            (``politician1`` or ``politician2``).
        stability: Forwarded as ``VoiceSettings.stability``.
        similarity: Forwarded as ``VoiceSettings.similarity_boost``.
        style: Forwarded as ``VoiceSettings.style``.

    The tuning defaults (0.0 / 1.0 / 0.0) are the values this function
    previously sent regardless of its arguments, so calls that omit them
    issue the same request as before; explicitly passed values are now
    honored instead of being ignored.

    Returns:
        An in-memory ``BytesIO`` holding the ``mp3_22050_32`` audio,
        positioned at the start.

    Raises:
        KeyError: If ``voice`` is not a key of ``voices``.
    """
    try:
        voice_id = voices[voice]
    except KeyError:
        # Same exception type as a bare lookup, with an actionable message.
        raise KeyError(
            f"Unknown voice {voice!r}; expected one of {sorted(voices)}"
        ) from None

    # Perform the text-to-speech conversion
    response = client.text_to_speech.convert(
        voice_id=voice_id,
        output_format="mp3_22050_32",
        text=text,
        model_id="eleven_multilingual_v2",
        voice_settings=VoiceSettings(
            stability=stability,
            similarity_boost=similarity,
            style=style,
            use_speaker_boost=True,
        ),
    )

    # Collect the audio in memory, skipping empty chunks.
    audio_stream = BytesIO()
    for chunk in response:
        if chunk:
            audio_stream.write(chunk)

    # Rewind so the caller reads from the first byte.
    audio_stream.seek(0)
    return audio_stream