File size: 935 Bytes
a84c313 e5e9b34 a84c313 e5e9b34 a84c313 e5e9b34 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import time

import numpy as np
import torch
from transformers import set_seed
from transformers import VitsTokenizer, VitsModel
def synthesize_facebook(s: str, iso3: str) -> np.ndarray:
    """
    Synthesize speech for the given text with a Facebook MMS-TTS model.

    The tokenizer and model are loaded from the ``facebook/mms-tts-{iso3}``
    checkpoint on every call, and the elapsed time (loading plus inference,
    truncated to whole seconds) is printed to stdout.

    Parameters
    ----------
    s : str
        The written text.
    iso3 : str
        The ISO-3 code of the text's language; selects the checkpoint
        ``facebook/mms-tts-{iso3}``.

    Returns
    -------
    synth : np.ndarray
        The synthesized audio: the first waveform of the model output,
        converted to a NumPy array.
        NOTE(review): presumably a 1-D array of audio samples at the
        model's configured sampling rate -- confirm against the caller.
    """
    # Ensure replicability: fix the RNG seed before loading and inference.
    set_seed(555)

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    # Load synthesizer (tokenizer and model come from the same checkpoint).
    checkpoint = f"facebook/mms-tts-{iso3}"
    tokenizer = VitsTokenizer.from_pretrained(checkpoint)
    model = VitsModel.from_pretrained(checkpoint)

    inputs = tokenizer(text=s, return_tensors="pt")

    # Inference; no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    # A single text was tokenized, so take the first waveform in the output.
    synth = outputs.waveform[0]

    print("Time elapsed: ", int(time.perf_counter() - start_time), " seconds")
    return synth.numpy()