Spaces:
Sleeping
Sleeping
import numpy as np | |
from fastapi import HTTPException | |
from modules.api.impl.handler.AudioHandler import AudioHandler | |
from modules.api.impl.model.audio_model import AdjustConfig | |
from modules.api.impl.model.chattts_model import InferConfig | |
from modules.api.impl.model.enhancer_model import EnhancerConfig | |
from modules.Enhancer.ResembleEnhance import apply_audio_enhance_full | |
from modules.normalization import text_normalize | |
from modules.ssml_parser.SSMLParser import create_ssml_parser | |
from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments | |
from modules.utils import audio | |
class SSMLHandler(AudioHandler):
    """Audio handler that turns an SSML document into a single audio clip.

    ``enqueue`` parses the SSML into segments, normalizes each segment's
    text, synthesizes and concatenates the segments, then optionally runs
    the enhancer, applies the prosody adjustments and optionally
    normalizes the result.
    """

    def __init__(
        self,
        ssml_content: str,
        infer_config: InferConfig,
        adjust_config: AdjustConfig,
        enhancer_config: EnhancerConfig,
    ) -> None:
        """Type-check and store the request parameters, then validate them.

        Args:
            ssml_content: The SSML document to synthesize.
            infer_config: Inference settings; ``batch_size``, ``eos`` and
                ``spliter_threshold`` are read by ``enqueue``.
            adjust_config: Post-processing settings; ``speed_rate``,
                ``pitch``, ``volume_gain_db``, ``normalize`` and
                ``headroom`` are read by ``enqueue``.
            enhancer_config: Enhancer settings; ``enabled``, ``nfe``,
                ``solver``, ``lambd`` and ``tau`` are read by ``enqueue``.

        Raises:
            AssertionError: If any argument has the wrong type.
        """
        # NOTE: ``assert`` is kept (rather than raising TypeError) so the
        # exception type seen by existing callers does not change. Be aware
        # that these checks are skipped when Python runs with ``-O``.
        assert isinstance(ssml_content, str), "ssml_content must be a string."
        assert isinstance(
            infer_config, InferConfig
        ), "infer_config must be an InferConfig object."
        assert isinstance(
            adjust_config, AdjustConfig
        ), "adjust_config must be an AdjustConfig object."
        assert isinstance(
            enhancer_config, EnhancerConfig
        ), "enhancer_config must be an EnhancerConfig object."

        self.ssml_content = ssml_content
        self.infer_config = infer_config
        # NOTE(review): the attribute name is misspelled ("adjest"). It is
        # kept as-is because code outside this class may read it; rename
        # only after checking all users.
        self.adjest_config = adjust_config
        self.enhancer_config = enhancer_config

        self.validate()

    def validate(self):
        """Validate the stored parameters (currently a no-op)."""
        # TODO params checker
        pass

    def enqueue(self) -> tuple[np.ndarray, int]:
        """Synthesize the SSML content and return the processed audio.

        Returns:
            A ``(audio_data, sample_rate)`` tuple.

        Raises:
            HTTPException: With status 422 when the parser yields no
                segments.
        """
        infer_config = self.infer_config
        adjust_config = self.adjest_config
        enhancer_config = self.enhancer_config

        segments = create_ssml_parser().parse(self.ssml_content)

        # Reject an empty parse result before doing any per-segment work.
        if not segments:
            raise HTTPException(
                status_code=422, detail="The SSML text is empty or parsing failed."
            )

        for seg in segments:
            seg["text"] = text_normalize(seg["text"], is_end=True)

        synthesize = SynthesizeSegments(
            batch_size=infer_config.batch_size,
            eos=infer_config.eos,
            spliter_thr=infer_config.spliter_threshold,
        )
        audio_segments = synthesize.synthesize_segments(segments)
        combined_audio = combine_audio_segments(audio_segments)

        # The helpers below do not share a return order: ``pydub_to_np`` and
        # ``apply_normalize`` are unpacked as (sample_rate, data), while
        # ``apply_audio_enhance_full`` is unpacked as (data, sample_rate).
        sample_rate, audio_data = audio.pydub_to_np(combined_audio)

        if enhancer_config.enabled:
            audio_data, sample_rate = apply_audio_enhance_full(
                audio_data=audio_data,
                sr=sample_rate,
                nfe=enhancer_config.nfe,
                solver=enhancer_config.solver,
                lambd=enhancer_config.lambd,
                tau=enhancer_config.tau,
            )

        # Prosody adjustment is applied whether or not the enhancer ran.
        audio_data = audio.apply_prosody_to_audio_data(
            audio_data=audio_data,
            rate=adjust_config.speed_rate,
            pitch=adjust_config.pitch,
            volume=adjust_config.volume_gain_db,
            sr=sample_rate,
        )

        if adjust_config.normalize:
            sample_rate, audio_data = audio.apply_normalize(
                audio_data=audio_data, headroom=adjust_config.headroom, sr=sample_rate
            )

        return audio_data, sample_rate