"""Gradio playground for Persian text-to-speech models hosted on the Hugging Face Hub.

At import time this script downloads every checkpoint listed in ``MODEL_INFO``,
builds one ``Synthesizer`` per model name, and then launches a Gradio interface
that lets the user pick a model (and optionally a speaker) and synthesize text.
"""

import os
import tempfile
import json

import gradio as gr
from TTS.api import TTS
from TTS.utils.synthesizer import Synthesizer
from huggingface_hub import hf_hub_download

# Define constants
#
# Each entry is: [display name, checkpoint file, config file, hub repo id]
# with an optional fifth element naming the speakers file in the same repo.
#
# NOTE(review): the display name "Single speaker female best VITS Grapheme
# CV-Azure_male-Azure_female" appears twice below (once with "speakers1.pth",
# once with "speakers.pth"). All per-model dictionaries are keyed by display
# name, so the later entry overwrites the earlier one and the model picker
# lists the name twice. Confirm which speakers file is intended.
# NOTE(review): the entry labelled "(26000)" points at "checkpoint_25000.pth";
# confirm which of the two numbers is correct.
MODEL_INFO = [
    # ["vits checkpoint 57000", "checkpoint_57000.pth", "config.json", "mhrahmani/persian-tts-vits-0"],
    # ["VITS Grapheme Multispeaker CV15(reduct)(best at 17864)", "best_model_17864.pth", "config.json",
    #  "saillab/persian-tts-cv15-reduct-grapheme-multispeaker"],
    ["Single speaker (best)VITS Grapheme Azure (61000)", "checkpoint_61000.pth", "config.json",
     "saillab/persian-tts-azure-grapheme-60K"],
    # ["VITS Grapheme ARM24 Fine-Tuned on 1 (66651)", "best_model_66651.pth", "config.json",
    #  "saillab/persian-tts-grapheme-arm24-finetuned-on1"],
    ["Single speaker female best VITS Grapheme CV-Azure_male-Azure_female", "best_model_15397.pth", "config.json",
     "saillab/female_cv_azure_male_azure_female", "speakers1.pth"],
    # ["Multi Speaker Vits Grapheme CV+Azure in one set ", "best_model_358320.pth", "config.json",
    #  "saillab/Multi_Speaker_Cv_plus_Azure_female_in_one_set", "speakers.pth"],
    ["Multispeaker VITS Grapheme CV15(reduct)(22000)", "checkpoint_22000.pth", "config.json",
     "saillab/persian-tts-cv15-reduct-grapheme-multispeaker", "speakers.pth"],
    ["Multispeaker VITS Grapheme CV15(reduct)(26000)", "checkpoint_25000.pth", "config.json",
     "saillab/persian-tts-cv15-reduct-grapheme-multispeaker", "speakers.pth"],
    ["Multispeaker VITS Grapheme CV15(90K)", "best_model_56960.pth", "config.json",
     "saillab/multi_speaker", "speakers.pth"],
    ["Single speaker female best VITS Grapheme CV-Azure_male-Azure_female", "best_model_15397.pth", "config.json",
     "saillab/female_cv_azure_male_azure_female", "speakers.pth"],
    # ["VITS Grapheme Azure (best at 15934)", "best_model_15934.pth", "config.json",
    #  "saillab/persian-tts-azure-grapheme-60K"],
    ["Single speaker VITS Grapheme ARM24 Fine-Tuned on 1 (66651)", "best_model_66651.pth", "config.json",
     "saillab/persian-tts-grapheme-arm24-finetuned-on1"],
    ["Single speaker VITS Grapheme ARM24 Fine-Tuned on 1 (120000)", "checkpoint_120000.pth", "config.json",
     "saillab/persian-tts-grapheme-arm24-finetuned-on1"],
    # ... Add other models similarly
]

# Extract model names from MODEL_INFO
MODEL_NAMES = [info[0] for info in MODEL_INFO]

# Inputs longer than this many characters are truncated before synthesis.
MAX_TXT_LEN = 400
TOKEN = os.getenv('HUGGING_FACE_HUB_TOKEN')

# Local paths of the downloaded files, keyed by model display name.
model_files = {}
config_files = {}
speaker_files = {}

# Create a dictionary to store synthesizer objects for each model
synthesizers = {}


def update_config_speakers_file_recursive(config_dict, speakers_path):
    """Recursively update speakers_file keys in a dictionary.

    Every ``"speakers_file"`` key found in ``config_dict`` or in any nested
    dictionary value is set to ``speakers_path``. The dictionary is modified
    in place; values that are not dictionaries (including lists) are not
    descended into.
    """
    if "speakers_file" in config_dict:
        config_dict["speakers_file"] = speakers_path
    for value in config_dict.values():
        if isinstance(value, dict):
            update_config_speakers_file_recursive(value, speakers_path)


def update_config_speakers_file(config_path, speakers_path):
    """Update the config.json file to point to the correct speakers.pth file.

    The file at ``config_path`` is read, every ``speakers_file`` entry is
    replaced with ``speakers_path``, and the result is written back to the
    same path.
    """
    # Read explicitly as UTF-8 so that non-ASCII content in the config does
    # not depend on the platform's default locale encoding.
    with open(config_path, 'r', encoding="utf-8") as f:
        config = json.load(f)

    # Modify the speakers_file entry
    update_config_speakers_file_recursive(config, speakers_path)

    # Save the modified config
    with open(config_path, 'w', encoding="utf-8") as f:
        json.dump(config, f, indent=4)


# Download models and initialize synthesizers
for info in MODEL_INFO:
    model_name, model_file, config_file, repo_name = info[:4]
    # The fifth element, when present, names the speakers file for the model.
    speaker_file = info[4] if len(info) > 4 else None

    print(f"|> Downloading: {model_name}")

    # Download model and config files
    model_files[model_name] = hf_hub_download(repo_id=repo_name, filename=model_file, use_auth_token=TOKEN)
    config_files[model_name] = hf_hub_download(repo_id=repo_name, filename=config_file, use_auth_token=TOKEN)

    synthesizer_kwargs = {
        "tts_checkpoint": model_files[model_name],
        "tts_config_path": config_files[model_name],
        "use_cuda": False,  # Assuming you don't want to use GPU, adjust if needed
    }

    # Download speakers.pth if it exists
    if speaker_file:
        speaker_files[model_name] = hf_hub_download(repo_id=repo_name, filename=speaker_file, use_auth_token=TOKEN)
        # Point the downloaded config at the local copy of the speakers file.
        update_config_speakers_file(config_files[model_name], speaker_files[model_name])
        print(speaker_files[model_name])
        synthesizer_kwargs["tts_speakers_file"] = speaker_files[model_name]

    # A synthesizer is always built from this iteration's files, so a model
    # can never silently reuse the synthesizer of the previous entry.
    synthesizers[model_name] = Synthesizer(**synthesizer_kwargs)


def synthesize(text: str, model_name: str, speaker_name=None) -> str:
    """Synthesize speech using the selected model.

    Args:
        text: Text to synthesize; truncated to ``MAX_TXT_LEN`` characters.
        model_name: Display name of the model, as listed in ``MODEL_NAMES``.
        speaker_name: Speaker to use for models loaded with a speakers file.
            An empty string is treated the same as ``None``.

    Returns:
        Path of a temporary ``.wav`` file containing the synthesized audio.
        The file is not deleted automatically.

    Raises:
        NameError: If no synthesizer is registered under ``model_name``.
    """
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cut off as it exceeded the {MAX_TXT_LEN} character limit.")

    # Use the synthesizer object for the selected model. ``.get`` is used so
    # that an unknown name reaches the explicit error below instead of
    # failing earlier with a KeyError.
    synthesizer = synthesizers.get(model_name)
    if synthesizer is None:
        raise NameError("Model not found")

    # Truthiness instead of ``is ""``: identity comparison against a string
    # literal is implementation-dependent, and this also covers ``None``.
    if not synthesizer.tts_speakers_file:
        wavs = synthesizer.tts(text)
    else:
        # The UI passes "" when no speaker was selected; normalise it to None.
        # should change, better if gradio conditions are figured out.
        wavs = synthesizer.tts(text, speaker_name=speaker_name or None)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        synthesizer.save_wav(wavs, fp)
    return fp.name


# Callback function to update UI based on the selected model
def update_options(model_name):
    """Return the speaker choices to offer for ``model_name``.

    Only the model at ``MODEL_NAMES[1]`` yields speaker names (read from its
    synthesizer's speaker manager); every other model yields an empty list.
    """
    synthesizer = synthesizers[model_name]
    # if synthesizer.tts.is_multi_speaker:
    # Compare by value: ``is`` on strings tests object identity, not equality.
    if model_name == MODEL_NAMES[1]:
        # return options for the dropdown
        return synthesizer.tts_model.speaker_manager.speaker_names
    # return empty options if not multi-speaker
    return []


DESCRIPTION = """
### Persian text to speech model demo.

#### Pick a speaker for MultiSpeaker models. (for single speaker go for speaker-0)
"""

# Create Gradio interface
iface = gr.Interface(
    fn=synthesize,
    inputs=[
        gr.Textbox(label="Enter Text to Synthesize:", value="زین همرهان سست عناصر، دلم گرفت."),
        gr.Radio(label="Pick a Model", choices=MODEL_NAMES, value=MODEL_NAMES[0], type="value"),
        # ``value`` (not ``default``) is the initial-selection keyword, matching
        # the Textbox and Radio components above.
        gr.Dropdown(label="Select Speaker", choices=update_options(MODEL_NAMES[1]), type="value", value=None),
    ],
    outputs=gr.Audio(label="Output", type='filepath'),
    # Example should include a speaker name for multispeaker models
    examples=[["زین همرهان سست عناصر، دلم گرفت.", MODEL_NAMES[0], ""]],
    title='Persian TTS Playground',
    description=DESCRIPTION,
    article="",
    live=False,
)

iface.launch()