"""Gradio front end that turns text (or an uploaded voice clip) into MP3 audio."""

import asyncio
import base64
import os
from io import BytesIO

import gradio as gr
import numpy as np
from pydub import AudioSegment
from scipy.io import wavfile

from voice_processing import get_model_names, tts, voice_mapping

# Sample rate written into the WAV header when ``tts`` hands back a raw
# sample array.
# NOTE(review): the first element of the tuple returned by ``tts`` is
# discarded below; if that element is the model's real output sample rate it
# should replace this constant -- confirm against ``voice_processing.tts``.
_ARRAY_SAMPLE_RATE = 40000


def _read_uploaded_voice(voice_upload):
    """Return the raw bytes of the uploaded voice file.

    Accepts either a filesystem path (``str`` / ``os.PathLike``) or an object
    exposing the path through a ``.name`` attribute (e.g. a tempfile wrapper).
    """
    if isinstance(voice_upload, (str, os.PathLike)):
        path = voice_upload
    else:
        path = voice_upload.name
    with open(path, "rb") as f:
        return f.read()


def _to_mp3_bytes(audio_output):
    """Encode ``audio_output`` (a sample array or WAV bytes) as MP3 bytes."""
    if isinstance(audio_output, np.ndarray):
        # Wrap the raw samples in an in-memory WAV container so pydub can
        # decode them.
        wav_buffer = BytesIO()
        wavfile.write(wav_buffer, _ARRAY_SAMPLE_RATE, audio_output)
        wav_buffer.seek(0)
        segment = AudioSegment.from_wav(wav_buffer)
    else:
        # Anything that is not an array is treated as already-encoded WAV
        # bytes.
        segment = AudioSegment.from_file(BytesIO(audio_output), format="wav")

    # Export into memory: calling ``export`` without a target makes pydub
    # open a temporary file that the caller would have to close.
    mp3_buffer = BytesIO()
    segment.export(mp3_buffer, format="mp3")
    return mp3_buffer.getvalue()


async def convert_tts(model_name, tts_text, selected_voice, slang_rate,
                      use_uploaded_voice, voice_upload):
    """Run the TTS pipeline and return the result as MP3 bytes.

    Args:
        model_name: Model name forwarded to ``tts``.
        tts_text: Text forwarded to ``tts``.
        selected_voice: Key into ``voice_mapping`` selecting the voice.
        slang_rate: Rate value forwarded unchanged to ``tts``.
        use_uploaded_voice: When true and ``voice_upload`` is given, the
            uploaded file's bytes are forwarded to ``tts``.
        voice_upload: Uploaded file (path or object with ``.name``), or None.

    Returns:
        The generated audio encoded as MP3 ``bytes``.

    Raises:
        gr.Error: If ``selected_voice`` is not in ``voice_mapping`` or the
            TTS call produced no audio.
    """
    edge_tts_voice = voice_mapping.get(selected_voice)
    if not edge_tts_voice:
        # The interface has a single Audio output, so an error payload cannot
        # be returned as data; raising lets Gradio display the message.
        raise gr.Error(f"Invalid voice '{selected_voice}'.")

    voice_upload_file = None
    if use_uploaded_voice and voice_upload is not None:
        voice_upload_file = _read_uploaded_voice(voice_upload)

    # Process the text input or uploaded voice.
    info, _edge_tts_output_path, tts_output_data, _edge_output_file = await tts(
        model_name,
        tts_text,
        edge_tts_voice,
        slang_rate,
        use_uploaded_voice,
        voice_upload_file,
    )

    if tts_output_data is None:
        raise gr.Error(f"TTS conversion failed: {info}")
    _, audio_output = tts_output_data
    if audio_output is None:
        raise gr.Error(f"TTS conversion failed: {info}")

    # Return audio data as bytes.
    # NOTE(review): whether an Audio output component accepts raw bytes
    # depends on the installed Gradio version -- confirm.
    return _to_mp3_bytes(audio_output)


def get_models():
    """Return the model names reported by ``voice_processing``."""
    return get_model_names()


def get_voices():
    """Return the selectable voice names (the keys of ``voice_mapping``)."""
    return list(voice_mapping.keys())


iface = gr.Interface(
    fn=convert_tts,
    inputs=[
        gr.Dropdown(choices=get_models(), label="Model", interactive=True),
        gr.Textbox(label="Text", placeholder="Enter text here"),
        gr.Dropdown(choices=get_voices(), label="Voice", interactive=True),
        gr.Slider(minimum=0, maximum=1, step=0.01, label="Slang Rate"),
        gr.Checkbox(label="Use Uploaded Voice"),
        gr.File(label="Voice File"),
    ],
    outputs=[
        gr.Audio(label="Result Audio", type="filepath"),
    ],
    title="Text-to-Speech Conversion",
)

# Launched at module level, as in the original script.
iface.launch()