allanctan committed
Commit 94dc091 · 1 Parent(s): ea2c226

test: for websocket interface

README.md CHANGED
@@ -10,3 +10,9 @@ short_description: better-ed mini
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+curl -X POST https://allanctan-ai.hf.space/be-mini-ai/transcribe \
+  -F "file=@voice\a_projectil_is.wav"
+
+allanctan-ai/be-mini-ai
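
For reference, a rough Python equivalent of the curl call in the README change above. The `requests` package and the forward-slash file path are assumptions; the URL and the {"text": ...} response shape come from the README and the /transcribe handler in main.py in this commit:

import requests

# Hypothetical client call mirroring the curl example above
url = "https://allanctan-ai.hf.space/be-mini-ai/transcribe"
with open("voice/a_projectil_is.wav", "rb") as f:  # assumed path separator
    response = requests.post(url, files={"file": f})
print(response.json())  # expected shape: {"text": "<transcription>"}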
__pycache__/parse.cpython-313.pyc ADDED
Binary file (2.34 kB).
 
all_questions_with_audio.json ADDED
The diff for this file is too large to render.
 
main.py CHANGED
@@ -1,15 +1,50 @@
-from fastapi import FastAPI, UploadFile, File
+from fastapi import FastAPI, UploadFile, File, WebSocket, WebSocketDisconnect
+from fastapi.middleware.cors import CORSMiddleware
 from unsloth import FastVisionModel
 import torch
 import shutil
 import os
+import json
+import base64
+import tempfile
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/tmp/torchinductor"
 
 app = FastAPI()
 
+# Add CORS for WebSocket
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Load model at startup (same as your original)
 model, processor = FastVisionModel.from_pretrained("unsloth/gemma-3n-e2b-it", load_in_4bit=True)
 model.generation_config.cache_implementation = "static"
 
+@app.get("/")
+async def root():
+    return {"message": "API is running"}
+
+@app.get("/health")
+async def health_check():
+    try:
+        return {
+            "status": "healthy",
+            "model_loaded": model is not None,
+            "processor_loaded": processor is not None,
+            "device": str(model.device) if model else "none"
+        }
+    except Exception as e:
+        return {"status": "unhealthy", "error": str(e)}
+
 @app.post("/transcribe/")
 async def transcribe_audio(file: UploadFile = File(...)):
     filepath = f"/tmp/{file.filename}"
@@ -29,6 +64,101 @@ async def transcribe_audio(file: UploadFile = File(...)):
         tokenize=True, return_dict=True, return_tensors="pt"
     ).to(model.device, dtype=model.dtype)
 
-    outputs = model.generate(**input_ids, max_new_tokens=16)
+    outputs = model.generate(**input_ids, max_new_tokens=64, do_sample=False, temperature=0.1)
     result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+    result = result.split("model\n")[-1].split("<end_of_turn>")[0].strip()
+
+    # Cleanup
+    if os.path.exists(filepath):
+        os.remove(filepath)
+
     return {"text": result}
+
+# Simple WebSocket endpoint
+@app.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket):
+    await websocket.accept()
+    logger.info("WebSocket client connected")
+
+    try:
+        while True:
+            # Receive message
+            data = await websocket.receive_text()
+            message = json.loads(data)
+            logger.info(f"Received message: {message}")
+
+            # Handle audio data
+            if "audio_data" in message:
+                audio_b64 = message["audio_data"]
+                mime_type = message.get("mime_type", "audio/wav")
+
+                try:
+                    # Use your exact transcribe logic
+                    transcription = await transcribe_base64_audio(audio_b64, mime_type)
+
+                    # Send response
+                    response = {
+                        "type": "transcription",
+                        "text": transcription
+                    }
+                    await websocket.send_text(json.dumps(response))
+
+                except Exception as e:
+                    logger.error(f"Transcription error: {e}")
+                    await websocket.send_text(json.dumps({
+                        "type": "error",
+                        "message": str(e)
+                    }))
+
+            # Handle ping/pong
+            elif message.get("type") == "ping":
+                await websocket.send_text(json.dumps({"type": "pong"}))
+
+            else:
+                await websocket.send_text(json.dumps({
+                    "type": "error",
+                    "message": "Unknown message format"
+                }))
+
+    except WebSocketDisconnect:
+        logger.info("WebSocket client disconnected")
+    except Exception as e:
+        logger.error(f"WebSocket error: {e}")
+
+async def transcribe_base64_audio(audio_b64: str, mime_type: str) -> str:
+    """Use your exact transcribe logic but with base64 audio data"""
+
+    # Convert base64 to file (same as your transcribe logic)
+    audio_data = base64.b64decode(audio_b64)
+
+    # Create temp file
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
+        temp_file.write(audio_data)
+        filepath = temp_file.name
+
+    try:
+        # Your exact transcribe logic
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "audio", "audio": filepath},
+                {"type": "text", "text": "Transcribe this audio"},
+            ]
+        }]
+
+        input_ids = processor.apply_chat_template(
+            messages, add_generation_prompt=True,
+            tokenize=True, return_dict=True, return_tensors="pt"
+        ).to(model.device, dtype=model.dtype)
+
+        outputs = model.generate(**input_ids, max_new_tokens=64, do_sample=False, temperature=0.1)
+        result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        print(result)
+        result = result.split("model\n")[-1].split("<end_of_turn>")[0].strip()
+
+        return result
+
+    finally:
+        # Cleanup temp file
+        if os.path.exists(filepath):
+            os.remove(filepath)
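
The new /ws handler above expects a JSON text message carrying base64 audio in "audio_data" (with an optional "mime_type") and replies with {"type": "transcription", "text": ...}, {"type": "error", ...}, or {"type": "pong"} for pings. A minimal client sketch, assuming the third-party `websockets` package and a wss URL at the Space's /ws path (both assumptions, not part of this commit):

import asyncio
import base64
import json

import websockets  # assumed dependency: pip install websockets

async def transcribe_over_ws(wav_path, url="wss://allanctan-ai.hf.space/ws"):
    # Encode the audio file as base64 to match the handler's "audio_data" field
    with open(wav_path, "rb") as f:
        audio_b64 = base64.b64encode(f.read()).decode("ascii")

    async with websockets.connect(url) as ws:
        await ws.send(json.dumps({"audio_data": audio_b64, "mime_type": "audio/wav"}))
        # The server answers with a transcription, an error payload, or a pong
        return json.loads(await ws.recv())

if __name__ == "__main__":
    print(asyncio.run(transcribe_over_ws("voice/a_projectil_is.wav")))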
test_start.py ADDED
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+"""
+Test script for the start() function
+"""
+
+from parse import start
+
+def test_start_function():
+    """Test the start function with different speakers"""
+
+    print("🧪 Testing start() function with different speakers\n")
+
+    # Test with 'question' speaker (first level questions)
+    print("📒 Results for speaker 'question':")
+    results = start('question')
+    print(f"Found {len(results)} questions")
+    for i, result in enumerate(results[:3]):  # Show first 3
+        print(f"  {i+1}. {result['message']}")
+        print(f"     Audio: {result['audio'] if result['audio'] else 'No audio'}")
+
+    print("\n" + "-"*50)
+
+    # Test with a non-existent speaker
+    print("📒 Results for speaker 'non_existent':")
+    results = start('non_existent')
+    print(f"Found {len(results)} results")
+
+    print("\n" + "-"*50)
+
+    # Test with empty speaker
+    print("📒 Results for speaker '':")
+    results = start('')
+    print(f"Found {len(results)} results")
+
+if __name__ == "__main__":
+    test_start_function()
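
parse.py itself is not in this commit (only its compiled __pycache__/parse.cpython-313.pyc and all_questions_with_audio.json are added), so the test above documents the expected interface: start(speaker) returns a list of dicts with 'message' and 'audio' keys. A purely hypothetical stub consistent with that usage, assuming the questions JSON is a list of entries with a 'speaker' field (the file's real schema is not shown here):

import json

def start(speaker):
    """Hypothetical stand-in for parse.start(), filtering questions by speaker."""
    with open("all_questions_with_audio.json") as f:
        questions = json.load(f)  # assumed: a list of dicts
    # Assumed fields: 'speaker', 'message', and an optional 'audio' path
    return [
        {"message": q.get("message"), "audio": q.get("audio")}
        for q in questions
        if q.get("speaker") == speaker
    ]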
working.py ADDED
@@ -0,0 +1,59 @@
+from fastapi import FastAPI, UploadFile, File
+from unsloth import FastVisionModel
+import torch
+import shutil
+import os
+os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/tmp/torchinductor"
+
+app = FastAPI()
+
+model, processor = FastVisionModel.from_pretrained("unsloth/gemma-3n-e2b-it", load_in_4bit=True)
+model.generation_config.cache_implementation = "static"
+
+@app.get("/")
+async def root():
+    return {"message": "API is running"}
+
+@app.get("/health")
+async def health_check():
+    try:
+        return {
+            "status": "healthy",
+            "model_loaded": model is not None,
+            "processor_loaded": processor is not None,
+            "device": str(model.device) if model else "none"
+        }
+    except Exception as e:
+        return {"status": "unhealthy", "error": str(e)}
+
+@app.post("/transcribe/")
+async def transcribe_audio(file: UploadFile = File(...)):
+    filepath = f"/tmp/{file.filename}"
+    with open(filepath, "wb") as buffer:
+        shutil.copyfileobj(file.file, buffer)
+
+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": "audio", "audio": filepath},
+            {"type": "text", "text": "Transcribe this audio"},
+        ]
+    }]
+
+    input_ids = processor.apply_chat_template(
+        messages, add_generation_prompt=True,
+        tokenize=True, return_dict=True, return_tensors="pt"
+    ).to(model.device, dtype=model.dtype)
+
+    # Generate output from the model
+    outputs = model.generate(**input_ids, max_new_tokens=64, do_sample=False,
+                             temperature=0.1)
+
+    # decode and print the output as text
+    result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+
+    # Extract only transcription
+    result = result.split("model\n")[-1].split("<end_of_turn>")[0].strip()
+    return {"text": result}
+
+