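# metaambod / tts.py
# Hugging Face Hub page header captured along with the file (not part of the code):
#   unijoh's picture | Update tts.py | 58fd969 verified | raw | history blame
#   No virus | 1.4 kB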
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor
import logging
import numpy as np
import soundfile as sf
import torchaudio
# Set up logging
logging.basicConfig(level=logging.DEBUG)

# Hub identifier of the SpeechT5 text-to-speech checkpoint loaded below.
MODEL_ID = "microsoft/speecht5_tts"

# Bind both names up front so that a failed load leaves them as None instead
# of undefined; code that uses them later can then test for None rather than
# tripping over a NameError.
processor = None
model = None

# Try to load the model and processor
try:
    processor = SpeechT5Processor.from_pretrained(MODEL_ID)
    model = SpeechT5ForTextToSpeech.from_pretrained(MODEL_ID)
    logging.info("Model and processor loaded successfully.")
except Exception as e:
    # Best-effort load: keep the module importable, but record the full
    # traceback so the root cause is visible in the logs.
    logging.exception(f"Error loading model or processor: {e}")
# Hub identifier of the HiFi-GAN vocoder that pairs with the SpeechT5 TTS model.
_VOCODER_ID = "microsoft/speecht5_hifigan"

# Filled on first use so the vocoder is loaded at most once per process.
_vocoder_cache = {}


def _get_default_vocoder():
    """Return the shared SpeechT5 HiFi-GAN vocoder, loading it on first use."""
    if "vocoder" not in _vocoder_cache:
        # Function-scope import: the file's top-level transformers import
        # does not bring SpeechT5HifiGan into scope.
        from transformers import SpeechT5HifiGan

        _vocoder_cache["vocoder"] = SpeechT5HifiGan.from_pretrained(_VOCODER_ID)
    return _vocoder_cache["vocoder"]


def synthesize_speech(text, speaker_embeddings=None, vocoder=None,
                      file_path="output.wav", sample_rate=16000):
    """Synthesize *text* to speech and write it to an audio file.

    Args:
        text: Non-empty string to speak.
        speaker_embeddings: Optional speaker x-vector tensor passed to
            ``model.generate``. When omitted, an all-zero vector of size
            ``model.config.speaker_embedding_dim`` is used as a neutral
            placeholder; pass real x-vectors for a natural voice.
        vocoder: Optional vocoder module converting the mel spectrogram to a
            waveform. When omitted, the ``microsoft/speecht5_hifigan``
            checkpoint is loaded once and reused.
        file_path: Destination of the audio file (default ``"output.wav"``).
        sample_rate: Sample rate written to the file (default 16000).

    Returns:
        ``file_path`` on success, or ``None`` if the input is empty or any
        step fails (the error is logged with its traceback).
    """
    if not isinstance(text, str) or not text.strip():
        logging.error("Error during speech synthesis: text must be a non-empty string.")
        return None

    try:
        if processor is None or model is None:
            raise RuntimeError("model or processor is not loaded")

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        # SpeechT5Processor selects the tokenizer from the `text=` keyword,
        # so the text must not be passed positionally.
        inputs = processor(text=text, return_tensors="pt").to(device)

        if vocoder is None:
            vocoder = _get_default_vocoder()
        vocoder.to(device)

        if speaker_embeddings is None:
            logging.warning(
                "No speaker embeddings supplied; using a zero vector placeholder."
            )
            speaker_embeddings = torch.zeros((1, model.config.speaker_embedding_dim))
        speaker_embeddings = speaker_embeddings.to(device)

        with torch.no_grad():
            # Without a vocoder, generate() returns a mel spectrogram rather
            # than audio samples, so the vocoder is always passed.
            speech = model.generate(
                **inputs,
                speaker_embeddings=speaker_embeddings,
                vocoder=vocoder,
            )
        logging.info("Speech generated successfully.")

        # torchaudio.save expects a (channels, frames) tensor on the CPU.
        waveform = speech.detach().cpu().reshape(1, -1)
        torchaudio.save(file_path, waveform, sample_rate)
        logging.info(f"Audio file saved successfully at {file_path}.")
        return file_path
    except Exception as e:
        # Best-effort contract: callers get None on failure; the traceback
        # is kept in the log for diagnosis.
        logging.exception(f"Error during speech synthesis: {e}")
        return None