File size: 3,268 Bytes
b9565e2
 
 
3af67f8
 
b9565e2
 
 
 
 
3af67f8
b9565e2
 
 
 
 
c82ae02
b9565e2
 
 
 
3af67f8
b9565e2
 
 
 
 
 
c82ae02
b9565e2
 
 
 
 
 
 
 
 
 
 
 
 
3af67f8
b9565e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c82ae02
b9565e2
 
 
 
 
 
 
 
 
c82ae02
b9565e2
3af67f8
9c2ab8c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import re
import uuid
import torch
import torchaudio
import soundfile as sf
from fastapi import FastAPI
from fastapi.responses import FileResponse
from pydantic import BaseModel
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.inference.speaker import EncoderClassifier

# FastAPI application object; the /speak route below is registered on it.
app = FastAPI()
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Directory handed to every from_pretrained() call below as cache_dir.
CACHE_DIR = "/tmp/hf-cache"

# Load models (female only)
# The processor and the HiFi-GAN vocoder come from the Microsoft SpeechT5
# checkpoints; the text-to-speech model itself is loaded from "Somalitts/8aad".
# The vocoder and the TTS model are moved to `device`; the processor is not.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
model_female = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)

# Speaker encoder
# SpeechBrain classifier loaded from "speechbrain/spkrec-xvect-voxceleb"; its
# encode_batch() is what get_embedding() calls to turn reference audio into a
# speaker embedding. Model files are stored under /tmp/spk_model.
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir="/tmp/spk_model"
)

# Load female embedding only
def get_embedding(wav_path, pt_path):
    """Return the speaker embedding for a reference recording, using a disk cache.

    Args:
        wav_path: Audio file to derive the embedding from when no cache exists.
        pt_path: Location of the cached embedding tensor (read if present,
            written otherwise).

    Returns:
        The embedding tensor, placed on the module-level ``device``.
    """
    if not os.path.exists(pt_path):
        waveform, sample_rate = torchaudio.load(wav_path)
        # Resample to 16 kHz, average the channels into one, then restore a
        # leading batch dimension before handing the audio to the encoder.
        resampled = torchaudio.functional.resample(waveform, sample_rate, 16000)
        mono_batch = resampled.mean(dim=0).unsqueeze(0).to(device)
        with torch.no_grad():
            raw_embedding = speaker_model.encode_batch(mono_batch)
            emb = torch.nn.functional.normalize(raw_embedding, dim=2).squeeze()
        # Persist a CPU copy so the cache can be reloaded regardless of device.
        torch.save(emb.cpu(), pt_path)
        return emb

    # Cache hit: reuse the stored tensor instead of re-encoding the audio.
    return torch.load(pt_path).to(device)

# Speaker embedding passed to the model by the /speak handler. It is computed
# from caasho.wav unless /tmp/female_embedding.pt already exists, in which case
# the cached tensor is loaded instead.
embedding_female = get_embedding("caasho.wav", "/tmp/female_embedding.pt")

# Text normalization
# Spoken forms for the base numerals; larger values are composed from these
# entries by number_to_words().
number_words = {
    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
    100: "boqol", 1000: "kun"
}


def number_to_words(n):
    """Spell out an integer using the entries of ``number_words``.

    Values from 11 to 99 are rendered as "<tens word> <unit word>", hundreds
    as "[<digit word>] boqol [<rest>]" and thousands as "<count> kun [<rest>]".

    Args:
        n: Integer to spell out.

    Returns:
        The spelled-out number. Values the table cannot express (negative
        numbers and anything from 1,000,000 upwards) are returned as their
        plain digit string.
    """
    if n <= 10:
        # 0-10 are direct table hits; negatives fall back to their digits.
        return number_words.get(n, str(n))
    if n < 100:
        # 11-19 have no table entries of their own, so they are composed as
        # "toban <unit>" exactly like 21-99 instead of leaking raw digits.
        tens, unit = divmod(n, 10)
        tens_word = number_words[tens * 10]
        return f"{tens_word} {number_words[unit]}" if unit else tens_word
    if n < 1000:
        hundreds, rem = divmod(n, 100)
        # Exactly one hundred is spoken as bare "boqol", without a multiplier.
        head = f"{number_words[hundreds]} boqol" if hundreds > 1 else "boqol"
        return f"{head} {number_to_words(rem)}" if rem else head
    if n < 1_000_000:
        thousands, rem = divmod(n, 1000)
        head = f"{number_to_words(thousands)} kun"
        return f"{head} {number_to_words(rem)}" if rem else head
    return str(n)

def replace_numbers_with_words(text):
    """Replace every standalone run of digits in ``text`` with its spoken form.

    Only digit runs delimited by word boundaries are converted; digits that are
    glued to letters (e.g. "abc123def") are left as they are.
    """
    def _spell_out(match):
        # Convert the matched digits to an int and delegate to number_to_words.
        return number_to_words(int(match.group()))

    return re.sub(r'\b\d+\b', _spell_out, text)

def normalize_text(text):
    """Prepare raw input text for the TTS processor.

    Steps, in order: lowercase the text, spell out standalone numbers, then
    delete every character that is neither a word character nor whitespace.
    """
    lowered = text.lower()
    with_spoken_numbers = replace_numbers_with_words(lowered)
    # Punctuation is removed outright (not replaced by a space).
    return re.sub(r'[^\w\s]', '', with_spoken_numbers)

# Request schema without voice choice
class TTSRequest(BaseModel):
    # JSON body accepted by the POST /speak endpoint.
    # Raw text to synthesize; the handler normalizes it before synthesis.
    text: str

@app.post("/speak")
def speak(payload: TTSRequest):
    """Synthesize speech for the submitted text and return it as a WAV file.

    The text is normalized, tokenized by the SpeechT5 processor and rendered
    with the female model, speaker embedding and vocoder loaded at import time.
    The audio is written to a uniquely named file under /tmp, streamed back to
    the client, and deleted once the response has been sent.

    Args:
        payload: Request body carrying the text to speak.

    Returns:
        A ``FileResponse`` with media type ``audio/wav`` and download name
        ``voice.wav``.
    """
    # Imported here so the handler brings its own cleanup dependency into
    # scope; Starlette is the framework FastAPI's responses are built on.
    from starlette.background import BackgroundTask

    clean_text = normalize_text(payload.text)
    inputs = processor(text=clean_text, return_tensors="pt").to(device)

    with torch.no_grad():
        waveform = model_female.generate_speech(
            inputs["input_ids"],
            embedding_female.unsqueeze(0),  # add a leading batch dimension
            vocoder=vocoder,
        )

    out_path = f"/tmp/{uuid.uuid4().hex}.wav"
    sf.write(out_path, waveform.cpu().numpy(), 16000)
    # Without this cleanup every request would leave a WAV behind in /tmp and
    # eventually fill the disk; the task runs after the file has been sent.
    return FileResponse(
        out_path,
        media_type="audio/wav",
        filename="voice.wav",
        background=BackgroundTask(os.remove, out_path),
    )