Spaces:

pgits
/

stt-gpu-service-python-v4

Runtime error

Peter Michael Gits Claude commited on Sep 4, 2025

Commit

dd577b3

1 Parent(s): 393f5a7

Switch to smaller Moshiko model for T4 GPU utilization

v1.4.0 - MAJOR: Switch from full Moshi to smaller Moshiko model
1. Changed model repo from DEFAULT_REPO to kyutai/moshiko-pytorch-bf16
2. Moshiko requires ~16GB VRAM vs full Moshi ~24GB (should fit T4 Small)
3. Updated all logging and UI text to reflect Moshiko model
4. Maintains GPU utilization instead of CPU fallback
5. Smaller model optimized for consumer GPUs while maintaining quality

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show

app.py +18 -16

app.py CHANGED Viewed

@@ -16,7 +16,7 @@ from fastapi.responses import JSONResponse, HTMLResponse
 import uvicorn
 # Version tracking
-VERSION = "1.3.14"
 COMMIT_SHA = "TBD"
 # Configure logging
@@ -53,42 +53,44 @@ async def load_moshi_models():
             from huggingface_hub import hf_hub_download
             from moshi.models import loaders, LMGen
-            # Load Mimi (audio codec)
-            logger.info("Loading Mimi audio codec...")
-            mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME, cache_dir='/app/hf_cache')
             mimi = loaders.get_mimi(mimi_weight, device=device)
             mimi.set_num_codebooks(8)  # Limited to 8 for Moshi
-            logger.info("✅ Mimi loaded successfully")
             # Clear cache after Mimi loading
             if device == "cuda":
                 torch.cuda.empty_cache()
                 logger.info(f"GPU memory after Mimi: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
-            # Load Moshi (language model)
-            logger.info("Loading Moshi language model...")
-            moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME, cache_dir='/app/hf_cache')
             # Try loading with memory-efficient settings
             try:
                 moshi = loaders.get_moshi_lm(moshi_weight, device=device)
                 lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
-                logger.info("✅ Moshi loaded successfully")
             except RuntimeError as cuda_error:
                 if "CUDA out of memory" in str(cuda_error):
-                    logger.warning(f"CUDA out of memory, trying CPU fallback: {cuda_error}")
                     # Move Mimi to CPU as well for consistency
                     mimi = loaders.get_mimi(mimi_weight, device="cpu")
                     mimi.set_num_codebooks(8)
                     device = "cpu"
                     moshi = loaders.get_moshi_lm(moshi_weight, device="cpu")
                     lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
-                    logger.info("✅ Moshi loaded successfully on CPU (fallback)")
                     logger.info("✅ Mimi also moved to CPU for device consistency")
                 else:
                     raise
-            logger.info("🎉 All Moshi models loaded successfully!")
             return True
         except ImportError as import_error:
@@ -118,7 +120,7 @@ def transcribe_audio_moshi(audio_data: np.ndarray, sample_rate: int = 24000) ->
     try:
         if mimi == "mock":
             duration = len(audio_data) / sample_rate
-            return f"Mock Moshi STT: {duration:.2f}s audio at {sample_rate}Hz"
         # Ensure 24kHz audio for Moshi
         if sample_rate != 24000:
@@ -176,8 +178,8 @@ async def lifespan(app: FastAPI):
 # FastAPI app with lifespan
 app = FastAPI(
-    title="STT GPU Service Python v4 - Cache Fixed",
-    description="Real-time WebSocket STT streaming with Moshi PyTorch implementation (Cache Fixed)",
     version=VERSION,
     lifespan=lifespan
 )
@@ -190,7 +192,7 @@ async def health_check():
         "timestamp": time.time(),
         "version": VERSION,
         "commit_sha": COMMIT_SHA,
-        "message": "Moshi STT WebSocket Service - Cache directory fixed",
         "space_name": "stt-gpu-service-python-v4",
         "mimi_loaded": mimi is not None and mimi != "mock",
         "moshi_loaded": moshi is not None and moshi != "mock",

 import uvicorn
 # Version tracking
+VERSION = "1.4.0"
 COMMIT_SHA = "TBD"
 # Configure logging
             from huggingface_hub import hf_hub_download
             from moshi.models import loaders, LMGen
+            # Load Mimi (audio codec) - using smaller Moshiko model
+            logger.info("Loading Mimi audio codec for Moshiko...")
+            # Use Moshiko model repo instead of default
+            MOSHIKO_REPO = "kyutai/moshiko-pytorch-bf16"
+            mimi_weight = hf_hub_download(MOSHIKO_REPO, loaders.MIMI_NAME, cache_dir='/app/hf_cache')
             mimi = loaders.get_mimi(mimi_weight, device=device)
             mimi.set_num_codebooks(8)  # Limited to 8 for Moshi
+            logger.info("✅ Mimi loaded successfully (Moshiko variant)")
             # Clear cache after Mimi loading
             if device == "cuda":
                 torch.cuda.empty_cache()
                 logger.info(f"GPU memory after Mimi: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
+            # Load Moshiko (smaller language model)
+            logger.info("Loading Moshiko language model...")
+            moshi_weight = hf_hub_download(MOSHIKO_REPO, loaders.MOSHI_NAME, cache_dir='/app/hf_cache')
             # Try loading with memory-efficient settings
             try:
                 moshi = loaders.get_moshi_lm(moshi_weight, device=device)
                 lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
+                logger.info("✅ Moshiko loaded successfully on GPU")
             except RuntimeError as cuda_error:
                 if "CUDA out of memory" in str(cuda_error):
+                    logger.warning(f"Moshiko CUDA out of memory, trying CPU fallback: {cuda_error}")
                     # Move Mimi to CPU as well for consistency
                     mimi = loaders.get_mimi(mimi_weight, device="cpu")
                     mimi.set_num_codebooks(8)
                     device = "cpu"
                     moshi = loaders.get_moshi_lm(moshi_weight, device="cpu")
                     lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)
+                    logger.info("✅ Moshiko loaded successfully on CPU (fallback)")
                     logger.info("✅ Mimi also moved to CPU for device consistency")
                 else:
                     raise
+            logger.info("🎉 All Moshiko models loaded successfully!")
             return True
         except ImportError as import_error:
     try:
         if mimi == "mock":
             duration = len(audio_data) / sample_rate
+            return f"Mock Moshiko STT: {duration:.2f}s audio at {sample_rate}Hz"
         # Ensure 24kHz audio for Moshi
         if sample_rate != 24000:
 # FastAPI app with lifespan
 app = FastAPI(
+    title="STT GPU Service Python v4 - Moshiko Model",
+    description="Real-time WebSocket STT streaming with Moshiko PyTorch implementation (Smaller model for T4 GPU)",
     version=VERSION,
     lifespan=lifespan
 )
         "timestamp": time.time(),
         "version": VERSION,
         "commit_sha": COMMIT_SHA,
+        "message": "Moshiko STT WebSocket Service - Smaller model for T4 GPU",
         "space_name": "stt-gpu-service-python-v4",
         "mimi_loaded": mimi is not None and mimi != "mock",
         "moshi_loaded": moshi is not None and moshi != "mock",