shriniket73 committed on
Commit: c6bc3fd
Parent: 196cb0c

Update app.py

Files changed (1):
  app.py  +68 -57
app.py CHANGED
@@ -21,24 +21,26 @@ from TTS.api import TTS
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+
 class TTSRequest(BaseModel):
     text: str
 
+
 class OptimizedTTSService:
     def __init__(self):
         logger.info("Initializing Optimized TTS Service...")
-
+
         try:
             # Set TTS home directory and accept license
-            os.environ['HOME'] = '/tmp/home'
-            os.environ['TTS_HOME'] = '/tmp/tts_home'
+            os.environ["HOME"] = "/tmp/home"
+            os.environ["TTS_HOME"] = "/tmp/tts_home"
             os.environ["COQUI_TOS_AGREED"] = "1"  # Accept TTS license
-
+
             # Set number of threads for PyTorch
             n_threads = max(2, multiprocessing.cpu_count() - 1)
             torch.set_num_threads(n_threads)
             logger.info(f"Using {n_threads} CPU threads")
-
+
             # Initialize TTS with error handling
             try:
                 model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
@@ -48,40 +50,40 @@ class OptimizedTTSService:
             except Exception as e:
                 logger.error(f"Failed to load TTS model: {str(e)}")
                 raise
-
+
             # Load latents
             try:
                 logger.info("Loading voice latents...")
                 latents_path = "models/goggins_latents.pt"
                 if not os.path.exists(latents_path):
                     raise FileNotFoundError(f"Latents file not found at {latents_path}")
-
-                self.latents = torch.load(latents_path, map_location='cpu')
+
+                self.latents = torch.load(latents_path, map_location="cpu")
                 logger.info("Latents loaded successfully")
             except Exception as e:
                 logger.error(f"Failed to load latents: {str(e)}")
                 raise
-
+
             # Initialize thread pool for parallel processing
             self.executor = ThreadPoolExecutor(max_workers=n_threads)
-
+
             # Configure model for inference
             self.model = self.tts.synthesizer.tts_model
             self.model.eval()
-
+
             # Initialize device
             self.device = torch.device("cpu")
             logger.info(f"Using device: {self.device}")
-
+
             # Initialize cache
             self._setup_cache()
-
+
             logger.info("Service initialization complete!")
-
+
         except Exception as e:
             logger.error(f"Failed to initialize TTS service: {str(e)}")
             raise
-
+
     def _setup_cache(self):
         """Setup caching mechanisms with error handling"""
         try:
@@ -93,25 +95,23 @@ class OptimizedTTSService:
         except Exception as e:
             logger.error(f"Failed to setup cache: {str(e)}")
             raise
-
+
     def _process_chunk(self, chunk: str) -> np.ndarray:
         """Process a single chunk of text with improved error handling"""
         try:
             # Convert latents to tensors
             speaker_embedding = torch.tensor(
-                self.latents['speaker_embedding'],
+                self.latents["speaker_embedding"],
                 dtype=torch.float32,
-                device=self.device
+                device=self.device,
             )
             gpt_cond_latent = torch.tensor(
-                self.latents['gpt_cond_latent'],
-                dtype=torch.float32,
-                device=self.device
+                self.latents["gpt_cond_latent"], dtype=torch.float32, device=self.device
             )
-
+
             # Get optimized parameters based on chunk length
             params = self._get_params_for_length(len(chunk))
-
+
             # Generate speech
             with torch.no_grad():
                 wav = self.model.inference(
@@ -119,76 +119,77 @@ class OptimizedTTSService:
                     language="en",
                     gpt_cond_latent=gpt_cond_latent,
                     speaker_embedding=speaker_embedding,
-                    **params
+                    **params,
                 )
-
+
             return wav["wav"]
-
+
         except Exception as e:
             logger.error(f"Error processing chunk '{chunk[:50]}...': {str(e)}")
             raise
-
+
     def _get_params_for_length(self, chunk_length: int) -> Dict:
         """Get optimized parameters based on text length"""
         if chunk_length <= 80:
             return {
-                'temperature': 0.75,
-                'length_penalty': 0.8,
-                'repetition_penalty': 1.8,
-                'top_k': 40,
-                'top_p': 0.80
+                "temperature": 0.75,
+                "length_penalty": 0.8,
+                "repetition_penalty": 1.8,
+                "top_k": 40,
+                "top_p": 0.80,
             }
         elif chunk_length <= 150:
             return {
-                'temperature': 0.85,
-                'length_penalty': 1.0,
-                'repetition_penalty': 2.0,
-                'top_k': 50,
-                'top_p': 0.85
+                "temperature": 0.85,
+                "length_penalty": 1.0,
+                "repetition_penalty": 2.0,
+                "top_k": 50,
+                "top_p": 0.85,
             }
         else:
             return {
-                'temperature': 0.95,
-                'length_penalty': 1.2,
-                'repetition_penalty': 2.2,
-                'top_k': 60,
-                'top_p': 0.90
+                "temperature": 0.95,
+                "length_penalty": 1.2,
+                "repetition_penalty": 2.2,
+                "top_k": 60,
+                "top_p": 0.90,
             }
-
+
     def generate_speech(self, text: str) -> np.ndarray:
         """Generate speech with improved error handling"""
         try:
             # Clean and validate input
             if not text or not text.strip():
                 raise ValueError("Empty text input")
-
+
             text = text.strip()
             if len(text) > 1000:  # Add reasonable limit
                 raise ValueError("Text too long (max 1000 characters)")
-
+
             # Process single chunk for short text
             if len(text) <= 150:
                 return self._process_chunk(text)
-
+
             # Split longer text into chunks
-            chunks = text.split('. ')
-            chunks = [chunk.strip() + '.' for chunk in chunks if chunk.strip()]
-
+            chunks = text.split(". ")
+            chunks = [chunk.strip() + "." for chunk in chunks if chunk.strip()]
+
             # Process chunks
             wavs = []
             for i, chunk in enumerate(chunks, 1):
                 logger.info(f"Processing chunk {i}/{len(chunks)}: {chunk[:50]}...")
                 wav = self._process_chunk(chunk)
                 wavs.append(wav)
-
+
             # Concatenate results
             final_wav = np.concatenate(wavs)
             return final_wav
-
+
         except Exception as e:
             logger.error(f"Error in generate_speech: {str(e)}")
             raise
 
+
 # Initialize FastAPI app
 app = FastAPI(title="Goggins TTS API")
 
@@ -204,6 +205,7 @@ app.add_middleware(
 # Initialize service
 service = None
 
+
 @app.on_event("startup")
 async def startup_event():
     global service
@@ -213,36 +215,45 @@ async def startup_event():
         logger.error(f"Failed to initialize service: {str(e)}")
         raise
 
+
 @app.post("/generate")
 async def generate_speech(request: TTSRequest):
     """Generate speech from text with detailed timing"""
     try:
         total_start = time.time()
         logger.info(f"\nReceived request for text: {request.text[:50]}...")
-
+
         # Model processing time
         model_start = time.time()
         wav = service.generate_speech(request.text)
         model_time = time.time() - model_start
-
+
         # Audio conversion time
         conversion_start = time.time()
         buffer = io.BytesIO()
         np.save(buffer, wav.astype(np.float32))
         audio_base64 = base64.b64encode(buffer.getvalue()).decode()
         conversion_time = time.time() - conversion_start
-
+
         # Total processing time
         total_time = time.time() - total_start
-
+
         timing_info = {
             "total_processing_time": round(total_time, 2),
             "model_processing_time": round(model_time, 2),
-            "audio_conversion_time": round(conversion_time, 2)
+            "audio_conversion_time": round(conversion_time, 2),
         }
-
+
         logger.info(f"Timing breakdown: {timing_info}")
 
+        # Add the missing return statement
+        return {"status": "success", "audio": audio_base64, "timing": timing_info}
+
+    except Exception as e:
+        logger.error(f"Error in generate_speech endpoint: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+
 @app.get("/health")
 async def health_check():
     """Health check endpoint"""
 