File size: 1,466 Bytes
f1a1bc0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from typing import Dict
from pathlib import Path
import tempfile
import torch
import torchaudio
import librosa

SAMPLE_RATE = 16000

class EndpointHandler():
    """Inference endpoint wrapper around the MARS5 English TTS model.

    Loads the model once at construction time and exposes voice-cloned
    text-to-speech synthesis via ``__call__``.
    """

    def __init__(self, path=""):
        # Load the MARS5 model and its config class from torch.hub.
        # `path` is accepted for endpoint-handler API compatibility but is unused.
        self.mars5, self.config_class = torch.hub.load(
            'Camb-ai/mars5-tts', 'mars5_english', trust_repo=True
        )

    def __call__(self, data: Dict[str, str]) -> Dict[str, str]:
        """
        Synthesize speech from text, cloning the voice of a reference clip.

        Args:
            data (Dict[str, str]): Must contain:
                - "text": the text to synthesize.
                - "audio_file": path to the reference audio file.
                - "transcript": transcript of the reference audio
                  (used for deep cloning).

        Returns:
            Dict[str, str]: {"synthesized_audio": path to the generated
            .wav file on disk}.

        Raises:
            KeyError: if a required key is missing from ``data``.
        """
        text = data["text"]
        audio_file = data["audio_file"]
        transcript = data["transcript"]

        # Load the reference audio resampled to the model's native rate.
        wav, _sr = librosa.load(audio_file, sr=self.mars5.sr, mono=True)
        wav = torch.from_numpy(wav)

        # deep_clone=True uses the transcript for higher-fidelity cloning.
        cfg = self.config_class(
            deep_clone=True,
            rep_penalty_window=100,
            top_k=100,
            temperature=0.7,
            freq_penalty=3,
        )

        # Generate the synthesized audio (autoregressive codes are unused here).
        ar_codes, wav_out = self.mars5.tts(text, wav, transcript, cfg=cfg)

        # NamedTemporaryFile(delete=False) replaces the deprecated, race-prone
        # tempfile.mktemp: the file is created atomically, so no other process
        # can claim the name between path generation and the write below.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = Path(tmp.name)
        # Pass a str: older torchaudio releases do not accept pathlib.Path.
        torchaudio.save(str(output_path), wav_out.unsqueeze(0), self.mars5.sr)

        return {"synthesized_audio": str(output_path)}