import os import gradio as gr import requests import langid import base64 import json import time import re import hashlib import hash_code_for_cached_output API_URL = os.environ.get("API_URL") supported_languages = ['zh', 'en', 'ja', 'ko', 'es', 'fr'] supported_styles = { 'zh': "zh_default", 'en': [ "en_default", "en_us", "en_br", "en_au", "en_in" ], "es": "es_default", "fr": "fr_default", "ja": "jp_default", "ko": "kr_default" } output_dir = 'outputs' os.makedirs(output_dir, exist_ok=True) def audio_to_base64(audio_file): with open(audio_file, "rb") as audio_file: audio_data = audio_file.read() base64_data = base64.b64encode(audio_data).decode("utf-8") return base64_data def count_chars_words(sentence): segments = re.findall(r'[\u4e00-\u9fa5]+|\w+', sentence) char_count = 0 word_count = 0 for segment in segments: if re.match(r'[\u4e00-\u9fa5]+', segment): char_count += len(segment) else: word_count += len(segment.split()) return char_count + word_count def predict(prompt, style, audio_file_pth, speed, agree): # initialize a empty info text_hint = '' # agree with the terms if agree == False: text_hint += '[ERROR] Please accept the Terms & Condition!\n' gr.Warning("Please accept the Terms & Condition!") return ( text_hint, None, None, ) # Before we get into inference, we will detect if it is from example table or default value # If so, we use a cached Audio. Noted that, it is just for demo efficiency. # hash code were generated by `hash_code_for_cached_output.py` # this hash get from gradio console cached_outputs = { "af39e1f1ff_60565a5c20_en_us" : "cached_outputs/0.wav", "af39e1f1ff_420ab8211d_en_us" : "cached_outputs/1.wav", "ced034cc22_0f96bf44f5_es_default" : "cached_outputs/2.wav", "d3172b178d_3fef5adc6f_zh_default" : "cached_outputs/3.wav", "cda6998e1a_9897b60a4e_jp_default" : "cached_outputs/4.wav" } unique_code = hash_code_for_cached_output.get_unique_code(audio_file_pth, prompt, style) print("audio_file_pth is", audio_file_pth) print("unique_code is", unique_code) if unique_code in list(cached_outputs.keys()): return ( 'We get the cached output for you, since you are trying to generate an example cloning.', cached_outputs[unique_code], audio_file_pth, ) # first detect the input language language_predicted = langid.classify(prompt)[0].strip() print(f"Detected language:{language_predicted}") if language_predicted not in supported_languages: text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n" gr.Warning( f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}" ) return ( text_hint, None, None, ) # check the style if style not in supported_styles[language_predicted]: text_hint += f"[Warming] The style {style} is not supported for detected language {language_predicted}. For language {language_predicted}, we support styles: {supported_styles[language_predicted]}. Using the wrong style may result in unexpected behavior.\n" gr.Warning(f"[Warming] The style {style} is not supported for detected language {language_predicted}. For language {language_predicted}, we support styles: {supported_styles[language_predicted]}. Using the wrong style may result in unexpected behavior.") prompt_length = count_chars_words(prompt) speaker_wav = audio_file_pth if prompt_length < 2: text_hint += f"[ERROR] Please give a longer prompt text \n" gr.Warning("Please give a longer prompt text") return ( text_hint, None, None, ) if prompt_length > 50000: text_hint += f"[ERROR] Text length limited to 50 words for this demo, please try shorter text. You can clone our open-source repo or try it on our website https://app.myshell.ai/robot-workshop/widget/174760057433406749 \n" gr.Warning( "Text length limited to 50000 words for this demo, please try shorter text. You can clone our open-source repo or try it on our website https://app.myshell.ai/robot-workshop/widget/174760057433406749" ) return ( text_hint, None, None, ) save_path = f'{output_dir}/output.wav' speaker_audio_base64 = audio_to_base64(speaker_wav) if style == 'en_us': # we update us accent style = 'en_newest' data = { "text": prompt, "reference_speaker": speaker_audio_base64, "language": style, "speed": speed } start = time.time() # Send the data as a POST request response = requests.post(API_URL, json=data, timeout=60) print(f'Get response successfully within {time.time() - start}') # Check the response if response.status_code == 200: try: json_data = json.loads(response.content) text_hint += f"[ERROR] {json_data['error']} \n" gr.Warning( f"[ERROR] {json_data['error']} \n" ) return ( text_hint, None, None, ) except: with open(save_path, 'wb') as f: f.write(response.content) else: text_hint += f"[HTTP ERROR] {response.status_code} - {response.text} \n" gr.Warning( f"[HTTP ERROR] {response.status_code} - {response.text} \n" ) return ( text_hint, None, None, ) text_hint += f'''Get response successfully \n''' return ( text_hint, save_path, speaker_wav, ) title = "MyShell OpenVoice V2" description = """ In December 2023, we released [OpenVoice V1](https://huggingface.co/spaces/myshell-ai/OpenVoice), an instant voice cloning approach that replicates a speaker's voice and generates speech in multiple languages using only a short audio clip. OpenVoice V1 enables granular control over voice styles, replicates the tone color of the reference speaker and achieves zero-shot cross-lingual voice cloning. """ description_v2 = """ In April 2024, we released **OpenVoice V2**, which includes all features in V1 and has: - **Better Audio Quality**. OpenVoice V2 adopts a different training strategy that delivers better audio quality. - **Native Multi-lingual Support**. English, Spanish, French, Chinese, Japanese and Korean are natively supported in OpenVoice V2. - **Free Commercial Use**. Starting from April 2024, both V2 and V1 are released under MIT License. Free for commercial use. """ markdown_table = """