File size: 5,494 Bytes
3ed3b5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58d9769
3ed3b5a
 
 
 
 
 
 
 
 
 
58d9769
3ed3b5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60bd17d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os
import time
import logging
import soundfile as sf
import numpy as np
from abc import ABC, abstractmethod
from typing import Tuple, Generator, Optional

# Module-level logger named after this module; no handlers or levels are set here
logger = logging.getLogger(__name__)

class TTSEngineBase(ABC):
    """Base class for all TTS engines

    This abstract class defines the interface that all TTS engines must implement.
    It also provides common utility methods for file handling and audio generation.
    """

    def __init__(self, lang_code: str = 'z'):
        """Initialize the TTS engine

        Args:
            lang_code (str): Language code ('a' for US English, 'b' for British English,
                           'j' for Japanese, 'z' for Mandarin Chinese)
                           Note: Not all engines support all language codes
        """
        self.lang_code = lang_code
        logger.info(f"Initializing {self.__class__.__name__} with language code: {lang_code}")

    @abstractmethod
    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Optional[str]:
        """Generate speech from text

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.)
                         Note: Not all engines support all voices
            speed (float): Speech speed multiplier (0.5 to 2.0)
                           Note: Not all engines support speed adjustment

        Returns:
            Optional[str]: Path to the generated audio file, or None if generation fails
        """
        pass

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate speech from text and yield each segment

        The default implementation renders the whole utterance to a file via
        ``generate_speech``, reads that file back and yields it as one chunk.
        Engines with real streaming support are expected to override this.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID to use
            speed (float): Speech speed multiplier

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment. Nothing is
                   yielded when ``generate_speech`` reports failure.
        """
        # Default implementation: generate full audio and yield as a single chunk
        output_path = self.generate_speech(text, voice, speed)

        # generate_speech signals failure with None; reading a missing path would
        # only raise an obscure error, so end the stream cleanly instead
        if not output_path:
            logger.error(f"{self.__class__.__name__}.generate_speech returned no output path; nothing to stream")
            return

        audio_data, sample_rate = sf.read(output_path)
        yield sample_rate, audio_data

    def _create_output_dir(self) -> str:
        """Create output directory for audio files

        The directory is relative to the current working directory and is
        created on demand; an already existing directory is not an error.

        Returns:
            str: Path to the output directory
        """
        output_dir = "temp/outputs"
        os.makedirs(output_dir, exist_ok=True)
        return output_dir

    def _generate_output_path(self, prefix: str = "output") -> str:
        """Generate a unique output path for audio files

        The file name combines the prefix, the current Unix time in whole
        seconds and a short random suffix. The suffix is what keeps paths
        distinct when several files are requested within the same second.

        Args:
            prefix (str): Prefix for the output filename

        Returns:
            str: Path to the output file
        """
        # Imported locally so this method does not depend on the module header
        import uuid

        output_dir = self._create_output_dir()
        timestamp = int(time.time())
        # A second-resolution timestamp alone collides for rapid successive calls
        unique_suffix = uuid.uuid4().hex[:8]
        return f"{output_dir}/{prefix}_{timestamp}_{unique_suffix}.wav"


class DummyTTSEngine(TTSEngineBase):
    """Fallback TTS engine that produces plain sine tones instead of speech

    Used when no other engine is available. The text, voice and speed
    arguments never influence the generated audio.
    """

    def __init__(self, lang_code: str = 'z'):
        """Initialize the base engine and warn that the dummy fallback is active

        Args:
            lang_code (str): Language code forwarded to the base class
        """
        super().__init__(lang_code)
        logger.warning("Using dummy TTS implementation as no other engines are available")

    def generate_speech(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
        """Write a 3 second, 440 Hz sine tone to a WAV file

        Args:
            text (str): Input text (only its length is logged)
            voice (str): Voice ID (not used)
            speed (float): Speed multiplier (not used)

        Returns:
            str: Path to the generated dummy audio file
        """
        logger.info(f"Generating dummy speech for text length: {len(text)}")

        # Reserve the destination file name before building the signal
        wav_path = self._generate_output_path("dummy")

        # Build the tone: 24 kHz sample rate, 3 seconds, amplitude 0.3
        rate = 24000
        seconds = 3.0
        sample_count = int(rate * seconds)
        time_axis = np.linspace(0, seconds, sample_count, False)
        samples = np.sin(2 * np.pi * 440 * time_axis) * 0.3

        # Persist the tone to disk
        logger.info(f"Saving dummy audio to {wav_path}")
        sf.write(wav_path, samples, rate)
        logger.info(f"Dummy audio generation complete: {wav_path}")

        return wav_path

    def generate_speech_stream(self, text: str, voice: str = 'af_heart', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Yield three one-second sine tones at 440, 660 and 880 Hz

        Args:
            text (str): Input text (only its length is logged)
            voice (str): Voice ID (not used)
            speed (float): Speed multiplier (not used)

        Yields:
            tuple: (sample_rate, audio_data) pairs for each dummy segment
        """
        logger.info(f"Generating dummy speech stream for text length: {len(text)}")

        rate = 24000
        seconds = 1.0  # length of every chunk

        # The time axis is the same for every chunk, so build it once
        time_axis = np.linspace(0, seconds, int(rate * seconds), False)

        # One chunk per frequency, each 220 Hz above the previous one
        for step in range(3):
            frequency = 440 + (step * 220)
            chunk = np.sin(2 * np.pi * frequency * time_axis) * 0.3
            yield rate, chunk