# Page-scrape residue from the file viewer (not part of the module):
# zhzluke96 / update / bed01bd / raw / history blame / 3.45 kB
import numpy as np
from fastapi import HTTPException
from modules.api.impl.handler.AudioHandler import AudioHandler
from modules.api.impl.model.audio_model import AdjustConfig
from modules.api.impl.model.chattts_model import InferConfig
from modules.api.impl.model.enhancer_model import EnhancerConfig
from modules.Enhancer.ResembleEnhance import apply_audio_enhance_full
from modules.normalization import text_normalize
from modules.ssml_parser.SSMLParser import create_ssml_parser
from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments
from modules.utils import audio
class SSMLHandler(AudioHandler):
    """Audio handler that turns an SSML document into a single audio clip.

    The SSML text is parsed into segments, each segment's text is normalized,
    the segments are synthesized and combined, and the result is optionally
    enhanced, prosody-adjusted and normalized according to the supplied
    configuration objects.
    """

    def __init__(
        self,
        ssml_content: str,
        infer_config: InferConfig,
        adjust_config: AdjustConfig,
        enhancer_config: EnhancerConfig,
    ) -> None:
        """Store the SSML text and the configuration objects.

        Args:
            ssml_content: Raw SSML document to synthesize.
            infer_config: Inference settings (batch size, eos, splitter
                threshold are read by ``enqueue``).
            adjust_config: Post-processing settings (speed rate, pitch,
                volume gain, normalize flag and headroom are read by
                ``enqueue``).
            enhancer_config: Enhancer settings (enabled flag, nfe, solver,
                lambd, tau are read by ``enqueue``).

        Raises:
            AssertionError: If any argument is not of the expected type.
        """
        assert isinstance(ssml_content, str), "ssml_content must be a string."
        assert isinstance(
            infer_config, InferConfig
        ), "infer_config must be an InferConfig object."
        assert isinstance(
            adjust_config, AdjustConfig
        ), "adjust_config must be an AdjustConfig object."
        assert isinstance(
            enhancer_config, EnhancerConfig
        ), "enhancer_config must be an EnhancerConfig object."

        self.ssml_content = ssml_content
        self.infer_config = infer_config
        # NOTE: the attribute name is misspelled ("adjest"); it is kept as-is
        # because code outside this class may already read it.
        self.adjest_config = adjust_config
        self.enhancer_config = enhancer_config

        self.validate()

    def validate(self):
        """Hook for parameter validation; currently performs no checks."""
        # TODO params checker
        pass

    def enqueue(self) -> tuple[np.ndarray, int]:
        """Synthesize the SSML content and post-process the resulting audio.

        Returns:
            A ``(audio_data, sample_rate)`` tuple.

        Raises:
            HTTPException: With status 422 when parsing yields no segments.
        """
        infer_config = self.infer_config
        adjust_config = self.adjest_config
        enhancer_config = self.enhancer_config

        segments = create_ssml_parser().parse(self.ssml_content)
        # Reject an empty (or missing) parse result before doing any work on it.
        if not segments:
            raise HTTPException(
                status_code=422, detail="The SSML text is empty or parsing failed."
            )

        for seg in segments:
            seg["text"] = text_normalize(seg["text"], is_end=True)

        synthesize = SynthesizeSegments(
            batch_size=infer_config.batch_size,
            eos=infer_config.eos,
            spliter_thr=infer_config.spliter_threshold,
        )
        audio_segments = synthesize.synthesize_segments(segments)
        combined_audio = combine_audio_segments(audio_segments)

        sample_rate, audio_data = audio.pydub_to_np(combined_audio)

        if enhancer_config.enabled:
            # The enhancer returns its own sample rate, which is used from
            # here on.
            audio_data, sample_rate = apply_audio_enhance_full(
                audio_data=audio_data,
                sr=sample_rate,
                nfe=enhancer_config.nfe,
                solver=enhancer_config.solver,
                lambd=enhancer_config.lambd,
                tau=enhancer_config.tau,
            )

        audio_data = audio.apply_prosody_to_audio_data(
            audio_data=audio_data,
            rate=adjust_config.speed_rate,
            pitch=adjust_config.pitch,
            volume=adjust_config.volume_gain_db,
            sr=sample_rate,
        )

        if adjust_config.normalize:
            sample_rate, audio_data = audio.apply_normalize(
                audio_data=audio_data, headroom=adjust_config.headroom, sr=sample_rate
            )

        return audio_data, sample_rate