metaambod / tts.py
unijoh's picture
Update tts.py
ed95412 verified
raw
history blame
No virus
1.59 kB
import logging

import numpy as np
import soundfile as sf
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, VitsModel, Wav2Vec2ForCTC, Wav2Vec2Processor
# Set up logging
logging.basicConfig(level=logging.DEBUG)

MODEL_ID = "facebook/mms-tts-fao"

# Choose the device once at import time so the model is not moved on every call.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The MMS-TTS checkpoints are VITS text-to-speech models: they are loaded with
# VitsModel plus a text tokenizer, not with the Wav2Vec2 speech-recognition
# classes. `processor` keeps its original module-level name so existing
# importers of this module keep working.
try:
    processor = AutoTokenizer.from_pretrained(MODEL_ID)
    model = VitsModel.from_pretrained(MODEL_ID)
    model.to(DEVICE)
    model.eval()  # inference only
    logging.info("Model and processor loaded successfully.")
except Exception as e:
    logging.error(f"Error loading model or processor: {e}")
    raise


def synthesize_speech(text, output_path="output.wav"):
    """Synthesize *text* to speech and write the audio to a WAV file.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only input
            is rejected.
        output_path: Destination of the WAV file. Defaults to
            ``"output.wav"`` in the current working directory; repeated
            calls with the same path overwrite the previous file.

    Returns:
        The path of the written WAV file, or ``None`` when the input was
        empty or synthesis failed. Failures are logged, never raised.
    """
    try:
        # Ensure text is not empty
        if not text or not text.strip():
            logging.error("Text input is empty.")
            return None

        inputs = processor(text, return_tensors="pt").to(DEVICE)

        with torch.no_grad():
            # VITS returns the audio as the `waveform` field of the forward
            # pass output; there is no `generate()` step for this model.
            speech = model(**inputs).waveform
        logging.info("Speech generated successfully.")

        # Flatten the (batched) model output to a 1-D float array.
        waveform = speech.cpu().float().numpy().flatten()
        # Clip (not rescale) samples to [-1, 1] so the WAV writer never
        # sees out-of-range values.
        waveform = np.clip(waveform, -1.0, 1.0)

        # Use the rate the checkpoint was trained with instead of assuming it.
        sample_rate = getattr(model.config, "sampling_rate", 16000)

        # Write a WAV file whose path can be handed to Gradio.
        sf.write(output_path, waveform, sample_rate)
        return output_path
    except Exception as e:
        # logging.exception keeps the traceback, which the plain error log lost.
        logging.exception("Error during speech synthesis: %s", e)
        return None