from transformers import VitsModel, AutoTokenizer
import torch
from umsc import UgMultiScriptConverter
import scipy.io.wavfile
# Model ID and setup
model_id = "facebook/mms-tts-uig-script_arabic"
tts_tokenizer = AutoTokenizer.from_pretrained(model_id)
tts_model = VitsModel.from_pretrained(model_id)
# Automatically allocate the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tts_model = tts_model.to(device)
def generate_audio(input_text, script):
"""
Generate audio for the given input text and script
"""
# Convert text to Uyghur Arabic if needed
ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
if script != "Uyghur Arabic":
input_text = ug_latn_to_arab(input_text)
# Tokenize and move inputs to the same device as the model
tts_inputs = tts_tokenizer(input_text, return_tensors="pt").to(device)
# Perform inference
with torch.no_grad():
tts_output = tts_model(**tts_inputs).waveform.cpu() # Move output back to CPU for saving
# Save to a temporary file
output_path = "tts_output.wav"
sample_rate = 16000
scipy.io.wavfile.write(output_path, rate=sample_rate, data=tts_output.numpy()[0])
# Return the audio file path
return output_path |
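

# Usage sketch (not part of the original file): the sample sentence and the
# "Uyghur Latin" script label below are hypothetical illustrations; any value
# other than "Uyghur Arabic" triggers the Latin-to-Arabic conversion above.
if __name__ == "__main__":
    sample_text = "salam dunya"  # hypothetical Uyghur Latin input
    wav_path = generate_audio(sample_text, script="Uyghur Latin")
    print(f"Saved synthesized speech to {wav_path}")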