Spaces:

DroolingPanda
/

teachingAssistant

Running

File size: 6,677 Bytes

import logging
import numpy as np
import soundfile as sf
from typing import Optional, Generator, Tuple

from utils.tts_base import TTSBase

# Configure logging
logger = logging.getLogger(__name__)

# Flag to track CosyVoice2 availability; stays False unless the optional
# imports below succeed.
COSYVOICE2_AVAILABLE = False
# Sample rate (Hz) used when writing audio files and when yielding streamed audio.
DEFAULT_SAMPLE_RATE = 24000

# Try to import CosyVoice2 dependencies
try:
    import torch
    # NOTE(review): the `cosyvoice2.model` import path (and the Dia-like API
    # used further down in this module) is an assumption -- confirm it against
    # the CosyVoice2 package that is actually installed.
    from cosyvoice2.model import CosyVoice2
    COSYVOICE2_AVAILABLE = True
    logger.info("CosyVoice2 TTS engine is available")
except ImportError as e:
    # ModuleNotFoundError is a subclass of ImportError, so this single handler
    # covers both a missing package and a broken import inside it. The flag
    # keeps its initial False value because the assignment above never ran.
    logger.warning(f"CosyVoice2 TTS engine is not available: {str(e)}")


def _get_model():
    """Construct a CosyVoice2 model on demand

    Every failure is logged and reported to the caller as None; no exception
    escapes this function.

    Returns:
        CosyVoice2 or None: A newly constructed model, or None when the engine
        is flagged unavailable or construction fails
    """
    if not COSYVOICE2_AVAILABLE:
        logger.warning("CosyVoice2 TTS engine is not available")
        return None

    try:
        import torch
        from cosyvoice2.model import CosyVoice2

        # Build the model with the library's default pretrained configuration
        loaded_model = CosyVoice2.from_pretrained()
        logger.info("CosyVoice2 model successfully loaded")
        return loaded_model
    except ImportError as err:
        failure = f"Failed to import CosyVoice2 dependencies: {str(err)}"
    except FileNotFoundError as err:
        failure = f"Failed to load CosyVoice2 model files: {str(err)}"
    except Exception as err:
        failure = f"Failed to initialize CosyVoice2 model: {str(err)}"

    # Only reached when one of the handlers above ran
    logger.error(failure)
    return None


class CosyVoice2TTS(TTSBase):
    """CosyVoice2 TTS engine implementation
    
    This engine uses the CosyVoice2 model for TTS generation. The model is
    loaded lazily on the first synthesis request and then kept on the instance.
    """
    
    # Keyword arguments passed to ``model.generate`` by every synthesis path.
    # NOTE(review): these mirror the Dia API; confirm that the installed
    # CosyVoice2 package accepts them.
    _GENERATION_KWARGS = {
        "max_tokens": None,
        "cfg_scale": 3.0,
        "temperature": 1.3,
        "top_p": 0.95,
        "use_torch_compile": False,
        "verbose": False,
    }
    
    def __init__(self, lang_code: str = 'z'):
        """Initialize the CosyVoice2 TTS engine
        
        Args:
            lang_code (str): Language code for the engine
        """
        super().__init__(lang_code)
        # Loaded on first use by _ensure_model()
        self.model = None
    
    def _ensure_model(self) -> bool:
        """Ensure the model is loaded
        
        Returns:
            bool: True if model is available, False otherwise
        """
        if self.model is None:
            self.model = _get_model()
        
        return self.model is not None
    
    def _is_ready(self) -> bool:
        """Check that the engine is importable and the model is loaded
        
        Logs an error describing which precondition failed.
        
        Returns:
            bool: True when synthesis can proceed, False otherwise
        """
        # Check if CosyVoice2 is available
        if not COSYVOICE2_AVAILABLE:
            logger.error("CosyVoice2 TTS engine is not available")
            return False
        
        # Ensure model is loaded
        if not self._ensure_model():
            logger.error("Failed to load CosyVoice2 model")
            return False
        
        return True
    
    def _run_model(self, text: str):
        """Run one inference pass of the loaded model over ``text``
        
        Exceptions raised by the model are NOT caught here so that each public
        entry point can log them with its own message.
        
        Args:
            text (str): Input text to synthesize
            
        Returns:
            The audio object returned by ``model.generate``, or None if the
            model returned None (an error is logged in that case)
        """
        import torch
        
        with torch.inference_mode():
            audio = self.model.generate(text, **self._GENERATION_KWARGS)
        
        if audio is None:
            logger.error("CosyVoice2 model returned None for audio output")
            return None
        
        logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(audio)})")
        return audio
    
    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
        """Generate speech using CosyVoice2 TTS engine
        
        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (accepted for interface compatibility; not used)
            speed (float): Speech speed multiplier (accepted for interface
                compatibility; not used)
            
        Returns:
            Optional[str]: Path to the generated audio file or None if generation fails
        """
        logger.info(f"Generating speech with CosyVoice2 for text length: {len(text)}")
        
        if not self._is_ready():
            return None
        
        try:
            # Generate unique output path before the (expensive) inference so a
            # path failure is reported without running the model
            output_path = self._generate_output_path(prefix="cosyvoice2")
            
            audio = self._run_model(text)
            if audio is None:
                return None
            
            sf.write(output_path, audio, DEFAULT_SAMPLE_RATE)
            logger.info(f"CosyVoice2 audio generation complete: {output_path}")
            return output_path
        except Exception as e:
            # Boundary handler: callers receive None instead of an exception
            logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
            return None
    
    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate speech stream using CosyVoice2 TTS engine
        
        The whole utterance is synthesized in one pass, so at most one
        segment is yielded.
        
        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (accepted for interface compatibility; not used)
            speed (float): Speech speed multiplier (accepted for interface
                compatibility; not used)
            
        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        logger.info(f"Generating speech stream with CosyVoice2 for text length: {len(text)}")
        
        if not self._is_ready():
            return
        
        try:
            audio = self._run_model(text)
        except Exception as e:
            logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
            return
        
        # The yield is deliberately outside the try block: an exception thrown
        # into the generator by its consumer must not be swallowed and
        # mislabelled as a synthesis error.
        if audio is not None:
            yield DEFAULT_SAMPLE_RATE, audio