from transformers import VitsModel, AutoTokenizer
import torch
import scipy.io.wavfile
import util

# Load processor and model
models_info = {
    "Meta-MMS": {
        "processor": AutoTokenizer.from_pretrained("facebook/mms-tts-uig-script_arabic"),
        "model": VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic"),
        "arabic_script": True,
    },
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def synthesize(text, model_id):
    # This MMS checkpoint expects Arabic-script Uyghur, so transliterate Latin-script input first
    if models_info[model_id]["arabic_script"]:
        text = util.ug_latn_to_arab(text)

    processor = models_info[model_id]["processor"]
    model = models_info[model_id]["model"].to(device)

    inputs = processor(text, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model(**inputs).waveform.cpu()  # Move output back to CPU for saving

    # Save the waveform as a WAV file at the model's sampling rate
    output_path = "tts_output.wav"
    sample_rate = model.config.sampling_rate
    scipy.io.wavfile.write(output_path, rate=sample_rate, data=output.numpy()[0])
    return output_path
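

# A minimal usage sketch. Assumptions: util.ug_latn_to_arab accepts Uyghur text in
# Latin script and returns the Arabic-script form, and the example sentence below
# is purely illustrative.
if __name__ == "__main__":
    wav_path = synthesize("Yaxshimusiz", "Meta-MMS")
    print(f"Saved synthesized audio to {wav_path}")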