# anyantudre's picture
# Upload 5 files
# e41ca58 verified
import time

import numpy as np
import torch
from transformers import set_seed
from transformers import VitsTokenizer, VitsModel
def synthesize_facebook(s: str, iso3: str) -> np.ndarray:
    """
    Synthesize speech for a text with a Facebook MMS text-to-speech model.

    The tokenizer and model are loaded from the ``facebook/mms-tts-{iso3}``
    checkpoint on every call, so the reported elapsed time includes loading
    as well as inference.

    Parameters
    ----------
    s: str
        The written text.
    iso3: str
        The ISO-3 code of the text's language; it selects which
        ``facebook/mms-tts-*`` checkpoint is loaded.

    Returns
    -------
    synth: np.ndarray
        The synthesized waveform of the first item in the model's output
        batch, converted to a NumPy array. The sampling rate is not
        returned; presumably callers read it from the model config --
        TODO confirm against the caller.
    """
    # Ensure replicability: seed the RNGs before loading and inference.
    set_seed(555)

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    started = time.perf_counter()

    # Load synthesizer (tokenizer and model come from the same checkpoint).
    checkpoint = f"facebook/mms-tts-{iso3}"
    tokenizer = VitsTokenizer.from_pretrained(checkpoint)
    model = VitsModel.from_pretrained(checkpoint)

    inputs = tokenizer(text=s, return_tensors="pt")

    # Inference only: gradients are not needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # A single text was tokenized, so take the first waveform of the batch.
    synth = outputs.waveform[0]

    print("Time elapsed: ", int(time.perf_counter() - started), " seconds")

    return synth.numpy()