Spaces:

Somalitts
/

labadaba_do

Running

File size: 3,268 Bytes

import os
import re
import uuid
import torch
import torchaudio
import soundfile as sf
from fastapi import FastAPI
from fastapi.responses import FileResponse
from pydantic import BaseModel
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.inference.speaker import EncoderClassifier

# FastAPI application object; the /speak route further down is registered on it.
app = FastAPI()
# Run everything on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Directory passed as cache_dir to every from_pretrained() call below.
CACHE_DIR = "/tmp/hf-cache"

# Load models (female only)
# processor: turns request text into model inputs (see its use in speak()).
# vocoder:   HiFi-GAN checkpoint handed to generate_speech() as `vocoder`.
# model_female: SpeechT5 text-to-speech checkpoint "Somalitts/8aad".
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
model_female = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)

# Speaker encoder
# Used by get_embedding() to compute a speaker embedding from a reference
# recording; its files are stored under /tmp/spk_model.
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir="/tmp/spk_model"
)

# Speaker-embedding helper (only the female voice uses it)
def get_embedding(wav_path, pt_path):
    """Return the speaker embedding for ``wav_path``, cached at ``pt_path``.

    When ``pt_path`` already exists the tensor stored there is loaded and
    moved to ``device``. Otherwise the recording at ``wav_path`` is resampled
    to 16 kHz, reduced over dim 0, encoded with ``speaker_model``,
    L2-normalised along dim 2, saved to ``pt_path`` as a CPU tensor and
    returned.
    """
    if not os.path.exists(pt_path):
        waveform, sample_rate = torchaudio.load(wav_path)
        resampled = torchaudio.functional.resample(waveform, sample_rate, 16000)
        # Collapse dim 0 of the loaded audio, then put a leading batch
        # dimension back before handing it to the encoder.
        mono_batch = resampled.mean(dim=0).unsqueeze(0).to(device)
        with torch.no_grad():
            raw_embedding = speaker_model.encode_batch(mono_batch)
            embedding = torch.nn.functional.normalize(raw_embedding, dim=2).squeeze()
        # Persist on the CPU so the cache file is loaded the same way on any device.
        torch.save(embedding.cpu(), pt_path)
        return embedding
    return torch.load(pt_path).to(device)

# Speaker embedding for the single (female) voice: taken from the cached
# /tmp/female_embedding.pt when it exists, otherwise computed from caasho.wav.
embedding_female = get_embedding("caasho.wav", "/tmp/female_embedding.pt")

# Text normalization
# Somali number words as spelled for this service. Only 0-10, the round tens,
# 100 and 1000 are listed; number_to_words() composes every other value.
number_words = {
    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
    100: "boqol", 1000: "kun"
}

def number_to_words(n):
    """Spell out the integer ``n`` using the words in ``number_words``.

    Composition rules:
      * 0-10 are direct table lookups.
      * 11-99 are "<tens word> <unit word>", e.g. 11 -> "toban koow",
        21 -> "labaatan koow"; round tens are the tens word alone.
      * 100-999 are "[<hundreds digit>] boqol [<rest>]"; the digit is omitted
        for 1xx, so 100 -> "boqol" and 250 -> "labo boqol konton".
      * 1000-999999 are "<thousands> kun [<rest>]", where the thousands count
        is itself spelled out (1000 -> "koow kun").

    Values outside 0..999999 (negative numbers, one million and above) are
    returned unchanged as their decimal string.
    """
    if n <= 10:
        # Direct lookup; anything not in the table here (i.e. negatives)
        # falls back to the plain decimal string.
        return number_words.get(n, str(n))
    if n < 100:
        # Covers 11-19 as well: they have no table entry of their own, so
        # they must be built as ten + unit instead of leaking digits through.
        tens, unit = divmod(n, 10)
        return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
    if n < 1000:
        hundreds, rem = divmod(n, 100)
        head = number_words[hundreds] + " boqol" if hundreds > 1 else "boqol"
        return head + (" " + number_to_words(rem) if rem else "")
    if n < 1_000_000:
        th, rem = divmod(n, 1000)
        return (number_to_words(th) + " kun") + (" " + number_to_words(rem) if rem else "")
    return str(n)

def replace_numbers_with_words(text):
    """Return ``text`` with each standalone run of digits spelled out.

    A run counts only when it has word boundaries on both sides, so digits
    glued to letters (e.g. ``a1b``) are left as they are. Each matched run is
    converted with ``int`` and passed to ``number_to_words``.
    """
    def _spell(match):
        return number_to_words(int(match.group()))

    digit_run = re.compile(r'\b\d+\b')
    return digit_run.sub(_spell, text)

def normalize_text(text):
    """Prepare raw request text for the TTS processor.

    Steps, in order: lower-case the text, spell out standalone numbers via
    ``replace_numbers_with_words``, then delete every character that is
    neither a word character nor whitespace (nothing is inserted in its
    place, so the neighbouring characters become adjacent).
    """
    lowered = text.lower()
    spelled = replace_numbers_with_words(lowered)
    return re.sub(r'[^\w\s]', '', spelled)

# Request schema without voice choice
# JSON body accepted by POST /speak. The service loads a single voice, so the
# text to synthesise is the only field.
class TTSRequest(BaseModel):
    # Raw input text; the /speak handler runs it through normalize_text()
    # before tokenising it.
    text: str
@app.post("/speak")
def speak(payload: TTSRequest):
    """Synthesise ``payload.text`` and return it as a WAV download.

    The text is normalised with ``normalize_text``, tokenised by
    ``processor`` and rendered by ``model_female`` using the female speaker
    embedding and the HiFi-GAN vocoder. The waveform is written at 16 kHz to
    a uniquely named file under /tmp, sent back as ``voice.wav`` and removed
    once the response has been delivered.
    """
    # Imported here so this handler does not depend on edits to the
    # file-level import block.
    from fastapi import BackgroundTasks

    clean_text = normalize_text(payload.text)
    inputs = processor(text=clean_text, return_tensors="pt").to(device)

    with torch.no_grad():
        waveform = model_female.generate_speech(
            inputs["input_ids"],
            embedding_female.unsqueeze(0),  # add the batch dimension
            vocoder=vocoder,
        )

    out_path = f"/tmp/{uuid.uuid4().hex}.wav"
    sf.write(out_path, waveform.cpu().numpy(), 16000)

    # Each request creates its own file; delete it after the response has been
    # sent, otherwise /tmp grows by one WAV per call for the life of the process.
    cleanup = BackgroundTasks()
    cleanup.add_task(os.remove, out_path)
    return FileResponse(
        out_path,
        media_type="audio/wav",
        filename="voice.wav",
        background=cleanup,
    )