import os import gradio as gr import requests import langid import base64 import json import time API_URL = os.environ.get("API_URL") supported_languages = ['zh', 'en'] output_dir = 'outputs' os.makedirs(output_dir, exist_ok=True) def audio_to_base64(audio_file): with open(audio_file, "rb") as audio_file: audio_data = audio_file.read() base64_data = base64.b64encode(audio_data).decode("utf-8") return base64_data def predict(prompt, style, audio_file_pth, agree): # initialize a empty info text_hint = '' # agree with the terms if agree == False: text_hint += '[ERROR] Please accept the Terms & Condition!\n' gr.Warning("Please accept the Terms & Condition!") return ( text_hint, None, None, ) # first detect the input language language_predicted = langid.classify(prompt)[0].strip() print(f"Detected language:{language_predicted}") if language_predicted not in supported_languages: text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n" gr.Warning( f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}" ) return ( text_hint, None, None, ) if language_predicted == "en": if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']: text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n" gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']") return ( text_hint, None, None, ) style = 'en_' + style prompt_length = len(prompt.split(' ')) else: if style not in ['default']: text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n" gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']") return ( text_hint, None, None, ) style = 'cn_' + style prompt_length = len(prompt) speaker_wav = audio_file_pth if prompt_length < 2: text_hint += f"[ERROR] Please give a longer prompt text \n" gr.Warning("Please give a longer prompt text") return ( text_hint, None, None, ) if prompt_length > 50: text_hint += f"[ERROR] Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n" gr.Warning( "Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo for your usage" ) return ( text_hint, None, None, ) save_path = f'{output_dir}/output.wav' speaker_audio_base64 = audio_to_base64(speaker_wav) data = { "text": prompt, "reference_speaker": speaker_audio_base64, "emotion": style } start = time.time() # Send the data as a POST request response = requests.post(API_URL, json=data, timeout=60) print(f'Get response successfully within {time.time() - start}') try: json_data = json.loads(response) text_hint += f"[ERROR] {json_data['error']} \n" gr.Warning( f"[ERROR] {json_data['error']} \n" ) return ( text_hint, None, None, ) except: with open(save_path, 'wb') as f: f.write(response) # Check the response if response.status_code == 200: try: json_data = json.loads(response.content) text_hint += f"[ERROR] {json_data['error']} \n" gr.Warning( f"[ERROR] {json_data['error']} \n" ) return ( text_hint, None, None, ) except: with open(save_path, 'wb') as f: f.write(response.content) else: text_hint += f"[HTTP ERROR] {response.status_code} - {response.text} \n" gr.Warning( f"[HTTP ERROR] {response.status_code} - {response.text} \n" ) return ( text_hint, None, None, ) text_hint += f'''Get response successfully \n''' return ( text_hint, save_path, speaker_wav, ) title = "MyShell OpenVoice" description = """ We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set. """ markdown_table = """