import time

import numpy as np
import torch
from transformers import set_seed
from transformers import VitsTokenizer, VitsModel
def synthesize_facebook(s: str, iso3: str) -> np.ndarray:
    """
    Synthesize speech for the given text with a Facebook MMS-TTS checkpoint.

    The tokenizer and model are loaded from ``facebook/mms-tts-{iso3}`` on
    every call, the random seed is reset to a fixed value, and the elapsed
    time (in whole seconds) is printed to stdout.

    Parameters
    ----------
    s : str
        The written text.
    iso3 : str
        The ISO-3 code of the text's language; it selects which
        ``facebook/mms-tts-{iso3}`` checkpoint is loaded.

    Returns
    -------
    synth : numpy.ndarray
        The synthesized waveform: the first item of the model's output
        batch, converted to a NumPy array.
    """
    # Ensure replicability: the seed is fixed before anything else runs.
    set_seed(555)

    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # Load synthesizer. Both pieces come from the same checkpoint, so the
    # identifier is built once.
    # NOTE(review): loading happens on every call; callers synthesizing many
    # strings for one language may want to cache these objects.
    checkpoint = f"facebook/mms-tts-{iso3}"
    tokenizer = VitsTokenizer.from_pretrained(checkpoint)
    model = VitsModel.from_pretrained(checkpoint)

    inputs = tokenizer(text=s, return_tensors="pt")

    # Inference only: gradients are not needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # A single string was tokenized, so take the first waveform of the batch.
    synth = outputs.waveform[0]

    print("Time elapsed: ", int(time.perf_counter() - start_time), " seconds")
    return synth.numpy()