import time

import numpy as np
import torch
from transformers import set_seed
from transformers import VitsTokenizer, VitsModel


def synthesize_facebook(s: str, iso3: str) -> np.ndarray:
    '''
    Synthesize speech for the given text with a Facebook MMS-TTS model.

    The tokenizer and model are loaded from the ``facebook/mms-tts-{iso3}``
    checkpoint on every call, and the elapsed time (loading + inference,
    truncated to whole seconds) is printed to stdout.

    Parameters
    ----------
    s: str
        The written text.
    iso3: str
        The ISO-3 code of the text's language; it selects the checkpoint.

    Returns
    ----------
    synth: numpy.ndarray
        The synthesized waveform of the first (only) item in the batch.
        NOTE(review): the sampling rate is not returned here; presumably
        callers need the model config's value to play or save the audio
        -- confirm.
    '''
    # Ensure replicability: seed before loading and inference so repeated
    # calls with the same arguments go through the same RNG sequence.
    set_seed(555)

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments.
    start_time = time.perf_counter()

    # Load synthesizer
    checkpoint = f"facebook/mms-tts-{iso3}"
    tokenizer = VitsTokenizer.from_pretrained(checkpoint)
    model = VitsModel.from_pretrained(checkpoint)

    inputs = tokenizer(text=s, return_tensors="pt")

    # Inference; no gradients are needed, which also allows the direct
    # tensor -> NumPy conversion below.
    with torch.no_grad():
        outputs = model(**inputs)

    synth = outputs.waveform[0]

    print("Time elapsed: ", int(time.perf_counter() - start_time), " seconds")

    return synth.numpy()