abhi02072005 committed on
Commit
80aa632
·
1 Parent(s): adff71c

Add all backend files with Docker support and ffmpeg configuration

Files changed (11)
  1. .gitignore +48 -0
  2. Dockerfile +32 -0
  3. agent.py +143 -0
  4. custom_wrapper.py +55 -0
  5. link.py +669 -0
  6. link2.py +828 -0
  7. qsec.py +31 -0
  8. real.py +1572 -0
  9. reel.py +1573 -0
  10. requirements.txt +33 -0
  11. sound_agent.py +198 -0
.gitignore ADDED
@@ -0,0 +1,48 @@
+ # FFmpeg binaries (will be installed via Docker)
+ ffmpeg-7.1-essentials_build/
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ venv/
+ ENV/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # Environment variables
+ .env
+ .env.local
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Temporary files
+ *.tmp
+ *.log
+ temp/
+ tmp/
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ FROM python:3.10-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies including ffmpeg
+ RUN apt-get update && apt-get install -y \
+     ffmpeg \
+     libsm6 \
+     libxext6 \
+     libxrender-dev \
+     libgomp1 \
+     libglib2.0-0 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first for better caching
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application files
+ COPY . .
+
+ # Create necessary directories
+ RUN mkdir -p audio temp
+
+ # Expose port for FastAPI
+ EXPOSE 8000
+
+ # Run the FastAPI application
+ CMD ["uvicorn", "link2:app", "--host", "0.0.0.0", "--port", "8000"]
agent.py ADDED
@@ -0,0 +1,143 @@
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_core.output_parsers import PydanticOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+ from dotenv import load_dotenv
+ from custom_wrapper import OpenRouterChat
+ from pydantic import BaseModel, Field
+ from typing import List
+ import os
+ import json
+ import cv2
+ import base64
+ from PIL import Image
+ import io
+
+ load_dotenv()
+ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+
+
+ class AudioSuggestionOutput(BaseModel):
+     audio_suggestions: List[str] = Field(default_factory=list, description="Suggested audio names for footsteps")
+     environment_description: str = Field(description="Description of the environment and ground surface")
+     reasoning: str = Field(description="Explanation for the audio suggestions")
+
+
+ llm = OpenRouterChat(
+     api_key=OPENROUTER_API_KEY,
+     model="meta-llama/llama-3.2-90b-vision-instruct",
+     temperature=0.7,
+     max_tokens=1024
+ )
+
+ parser = PydanticOutputParser(pydantic_object=AudioSuggestionOutput)
+
+
+ def extract_first_frame(video_path):
+     """Extract the first frame from a video file"""
+     try:
+         cap = cv2.VideoCapture(video_path)
+         if not cap.isOpened():
+             raise ValueError(f"Cannot open video file: {video_path}")
+
+         success, frame = cap.read()
+         cap.release()
+
+         if not success:
+             raise ValueError("Cannot read the first frame from video")
+
+         return frame
+     except Exception as e:
+         print(f"Error extracting first frame: {e}")
+         return None
+
+
+ def image_to_base64(image):
+     """Convert OpenCV image to base64 string"""
+     try:
+         # Convert BGR to RGB
+         image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+         # Convert to PIL Image
+         pil_image = Image.fromarray(image_rgb)
+
+         # Convert to base64
+         buffered = io.BytesIO()
+         pil_image.save(buffered, format="JPEG", quality=85)
+         img_str = base64.b64encode(buffered.getvalue()).decode()
+
+         return img_str
+     except Exception as e:
+         print(f"Error converting image to base64: {e}")
+         return None
+
+
+ prompt = ChatPromptTemplate.from_template("""
+ You are an expert sound designer and environmental analyst.
+ Analyze the provided image and suggest appropriate audio names for footsteps based on the environment, ground surface, and surroundings.
+
+ Image Data: {image_data}
+
+ Please analyze:
+ 1. The type of ground/surface (concrete, grass, wood, carpet, gravel, etc.)
+ 2. The environment (indoor, outdoor, urban, natural, etc.)
+ 3. Weather conditions if visible (wet, dry, snowy, etc.)
+ 4. Any other relevant factors that would affect footstep sounds
+ 5. Audio suggestion's name must be friendly for a youtube search
+ 6. Name without extensions
+
+ Provide 3-5 specific, descriptive audio file name suggestions for footsteps in this environment.
+ The names should be clear, concise, and follow standard audio naming conventions.
+
+ {format_instructions}
+ """)
+
+ chain = (
+     {"image_data": RunnablePassthrough(), "format_instructions": lambda x: parser.get_format_instructions()}
+     | prompt
+     | llm
+     | parser
+ )
+
+
+ def analyze_image_and_suggest_audio(image_base64):
+     """Analyze the image and suggest audio names for footsteps"""
+     try:
+         result = chain.invoke(image_base64)
+         return result.dict()
+     except Exception as e:
+         print("Error during image analysis:", e)
+         return None
+
+
+ def process_video_for_footstep_audio(video_path):
+     # Extract first frame from video
+     print("Extracting first frame from video...")
+     first_frame = extract_first_frame(video_path)
+
+     if first_frame is None:
+         return {"error": "Failed to extract first frame from video"}
+
+     # Convert image to base64
+     print("Converting image to base64...")
+     image_base64 = image_to_base64(first_frame)
+
+     if image_base64 is None:
+         return {"error": "Failed to convert image to base64"}
+
+     # Analyze image and get audio suggestions
+     print("Analyzing image and generating audio suggestions...")
+     result = analyze_image_and_suggest_audio(image_base64)
+
+     # Save results
+     if result:
+         output_file = "found_img1/gemini2.json"
+         os.makedirs(os.path.dirname(output_file), exist_ok=True)
+
+         with open(output_file, "w") as f:
+             json.dump(result, f, indent=2)
+
+         print(f"Results saved to {output_file}")
+
+     return result['audio_suggestions'][0]
+
+
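Usage note: a minimal sketch of driving agent.py end to end, assuming a valid OPENROUTER_API_KEY in .env; the clip name is a placeholder, not a file from this repo.

# Hypothetical driver for agent.py; the video path is illustrative only.
from agent import process_video_for_footstep_audio

suggestion = process_video_for_footstep_audio("sample.mp4")  # placeholder clip
print("Suggested footstep audio:", suggestion)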
custom_wrapper.py ADDED
@@ -0,0 +1,55 @@
+ import requests
+ from typing import List, Optional
+ from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
+ from langchain_core.outputs import ChatResult, ChatGeneration
+ from langchain_core.language_models import BaseChatModel
+ from pydantic import BaseModel, Field
+
+
+ class OpenRouterChat(BaseChatModel):
+     api_key: str = Field(...)
+     model: str = "mistralai/mistral-7b-instruct:free"
+     temperature: float = 0.7
+
+     @property
+     def _llm_type(self) -> str:
+         return "openrouter-chat"
+
+     def _format_message(self, message: BaseMessage) -> dict:
+         role = "user"
+         if isinstance(message, HumanMessage):
+             role = "user"
+         elif isinstance(message, AIMessage):
+             role = "assistant"
+         else:
+             raise ValueError(f"Unsupported message type: {type(message)}")
+         return {"role": role, "content": message.content}
+
+     def _generate(self, messages: List[BaseMessage], stop: Optional[List[str]] = None) -> ChatResult:
+         headers = {
+             "Authorization": f"Bearer {self.api_key}",
+             "Content-Type": "application/json",
+             "HTTP-Referer": "https://yourdomain.com",
+             "X-Title": "LangChainOpenRouterWrapper"
+         }
+
+         payload = {
+             "model": self.model,
+             "messages": [self._format_message(m) for m in messages],
+             "temperature": self.temperature
+         }
+
+         response = requests.post(
+             "https://openrouter.ai/api/v1/chat/completions",
+             headers=headers,
+             json=payload,
+         )
+
+         if response.status_code != 200:
+             raise Exception(f"OpenRouter API error {response.status_code}: {response.text}")
+
+         content = response.json()["choices"][0]["message"]["content"]
+
+         return ChatResult(
+             generations=[ChatGeneration(message=AIMessage(content=content))]
+         )
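Usage note: a minimal sketch of calling the wrapper directly through the standard LangChain invoke interface; the API key and prompt below are placeholders.

# Hypothetical direct call to OpenRouterChat; key and prompt are placeholders.
from langchain_core.messages import HumanMessage
from custom_wrapper import OpenRouterChat

llm = OpenRouterChat(api_key="sk-or-...", temperature=0.3)  # placeholder key
reply = llm.invoke([HumanMessage(content="Name a footstep sound for a gravel path.")])
print(reply.content)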
link.py ADDED
@@ -0,0 +1,669 @@
+ from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
+ from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+ from typing import Optional, List, Dict, Any
+ import cv2
+ import numpy as np
+ import mediapipe as mp
+ from pathlib import Path
+ import json
+ import subprocess
+ import os
+ import soundfile as sf
+ from datetime import datetime
+ import tempfile
+ import pandas as pd
+ import shutil
+ import asyncio
+ from concurrent.futures import ThreadPoolExecutor
+ import base64
+ from io import BytesIO
+
+ # Suppress warnings
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+ import absl.logging
+
+ absl.logging.set_verbosity(absl.logging.ERROR)
+
+ # Mock streamlit before importing real.py
+ import sys
+
+
+ class MockStreamlit:
+     def __getattr__(self, name):
+         def mock_func(*args, **kwargs):
+             pass
+
+         return mock_func
+
+
+ sys.modules['streamlit'] = MockStreamlit()
+
+ # Import working classes and functions from real.py
+ from reel import (
+     HybridFootstepDetectionPipeline,
+     PersonTracker,
+     AudioGenerator,
+     create_annotated_video,
+     merge_audio_with_video
+ )
+
+ # Import your custom modules
+ from agent import process_video_for_footstep_audio
+ from sound_agent import main_sound
+ from qsec import extract_second_audio_librosa
+
+ app = FastAPI(title="Footstep Detection API", version="1.0.0")
+
+ # CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Thread pool for CPU-intensive tasks
+ executor = ThreadPoolExecutor(max_workers=4)
+
+
+ # ==================== Pydantic Models ====================
+
+ class ProcessingConfig(BaseModel):
+     sensitivity: str = "medium"
+     yolo_conf: float = 0.5
+     use_hybrid: bool = True
+     create_annotated: bool = True
+     add_audio: bool = True
+     surface_type: str = "concrete"
+
+
+ class FootstepEvent(BaseModel):
+     frame: int
+     timecode: str
+     foot: str
+     event: str
+     time_seconds: float
+     confidence: float
+
+
+ class ProcessingResult(BaseModel):
+     task_id: str
+     status: str
+     progress: float
+     events: Optional[List[FootstepEvent]] = None
+     total_frames: Optional[int] = None
+     fps: Optional[float] = None
+     detection_stats: Optional[Dict[str, Any]] = None
+     error: Optional[str] = None
+
+
+ class LiveDetectionConfig(BaseModel):
+     sensitivity: str = "medium"
+     yolo_conf: float = 0.5
+
+
+ # ==================== Storage ====================
+
+ # In-memory storage for tasks
+ tasks_storage = {}
+ video_storage = {}
+
+
+ def get_ffmpeg_path():
+     """Get FFmpeg path"""
+     possible_paths = [
+         "ffmpeg",  # Try system ffmpeg first (Docker/Linux)
+         r"C:\Users\abhiv\OneDrive\Desktop\agentic ai\SoundFeet\ffmpeg-7.1-essentials_build\bin\ffmpeg.exe",  # Local Windows
+         "./ffmpeg-7.1-essentials_build/bin/ffmpeg.exe",  # Relative path
+     ]
+
+     for path in possible_paths:
+         if path == "ffmpeg":
+             try:
+                 result = subprocess.run([path, '-version'], capture_output=True, timeout=5)
+                 if result.returncode == 0:
+                     return path
+             except:
+                 continue
+         else:
+             if os.path.exists(path):
+                 return path
+     return None
+
+
+ FFMPEG_PATH = get_ffmpeg_path()
+
+
+ # ==================== API Endpoints ====================
+
+ @app.get("/")
+ async def root():
+     return {"message": "Footstep Detection API", "version": "1.0.0"}
+
+
+ @app.post("/api/upload-video")
+ async def upload_video(
+     file: UploadFile = File(...),
+     config: Optional[str] = None
+ ):
+     """Upload video and create task"""
+     if not file.content_type.startswith('video/'):
+         raise HTTPException(status_code=400, detail="File must be a video")
+
+     # Generate task ID
+     task_id = f"task_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{os.urandom(4).hex()}"
+
+     # Save video to temp file
+     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
+     content = await file.read()
+     temp_file.write(content)
+     temp_file.close()
+
+     # Parse config
+     if config:
+         try:
+             config_dict = json.loads(config)
+         except:
+             config_dict = {}
+     else:
+         config_dict = {}
+
+     processing_config = ProcessingConfig(**config_dict)
+
+     # Create task
+     tasks_storage[task_id] = {
+         'task_id': task_id,
+         'status': 'uploaded',
+         'progress': 0.0,
+         'video_path': temp_file.name,
+         'config': processing_config.dict(),
+         'created_at': datetime.now().isoformat()
+     }
+
+     return {
+         "task_id": task_id,
+         "status": "uploaded",
+         "message": "Video uploaded successfully"
+     }
+
+
+ @app.post("/api/process/{task_id}")
+ async def process_video(task_id: str, background_tasks: BackgroundTasks):
+     """Start processing video"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     if task['status'] == 'processing':
+         return {"message": "Task is already being processed"}
+
+     task['status'] = 'processing'
+     task['progress'] = 0.0
+
+     background_tasks.add_task(process_video_task, task_id)
+
+     return {
+         "task_id": task_id,
+         "status": "processing",
+         "message": "Video processing started"
+     }
+
+
+ def process_video_task(task_id: str):
+     """Background task for video processing"""
+     try:
+         task = tasks_storage[task_id]
+         config = task['config']
+         video_path = task['video_path']
+
+         # Get video info first
+         cap = cv2.VideoCapture(video_path)
+         fps = cap.get(cv2.CAP_PROP_FPS)
+         cap.release()
+
+         # Initialize pipeline using real.py's class
+         pipeline = HybridFootstepDetectionPipeline(
+             fps=fps,
+             sensitivity=config['sensitivity'],
+             yolo_conf=config['yolo_conf']
+         )
+
+         # Process video using real.py's method
+         def progress_callback(progress):
+             task['progress'] = progress
+
+         results = pipeline.process_video(video_path, progress_callback)
+
+         # Update task
+         task['status'] = 'completed'
+         task['progress'] = 1.0
+         task['results'] = results
+         task['completed_at'] = datetime.now().isoformat()
+
+     except Exception as e:
+         task['status'] = 'failed'
+         task['error'] = str(e)
+         task['failed_at'] = datetime.now().isoformat()
+
+
+ @app.get("/api/status/{task_id}")
+ async def get_task_status(task_id: str):
+     """Get task status and progress"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     response = {
+         "task_id": task_id,
+         "status": task['status'],
+         "progress": task['progress']
+     }
+
+     if task['status'] == 'completed' and 'results' in task:
+         response['results'] = task['results']
+     elif task['status'] == 'failed':
+         response['error'] = task.get('error')
+
+     return response
+
+
+ @app.post("/api/generate-video/{task_id}")
+ async def generate_video(task_id: str, background_tasks: BackgroundTasks):
+     """Generate annotated video"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     if task['status'] != 'completed':
+         raise HTTPException(status_code=400, detail="Processing not completed")
+
+     if not task.get('results'):
+         raise HTTPException(status_code=400, detail="No results available")
+
+     background_tasks.add_task(generate_video_task, task_id)
+
+     return {
+         "task_id": task_id,
+         "message": "Video generation started"
+     }
+
+
+ def generate_video_task(task_id: str):
+     """Background task for video generation using real.py's create_annotated_video"""
+     try:
+         print(f"[DEBUG] Starting video generation for {task_id}")
+         task = tasks_storage[task_id]
+         results = task['results']
+         video_path = task['video_path']
+         config = task['config']
+
+         task['video_generating'] = True
+         task['video_ready'] = False
+
+         print(f"[DEBUG] Creating annotated video for {task_id}")
+
+         # Generate output path
+         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='_annotated.mp4')
+         annotated_path = temp_file.name
+         temp_file.close()
+
+         print(f"[DEBUG] Output video path: {annotated_path}")
+         print(f"[DEBUG] Input video path: {video_path}")
+
+         # Use real.py's create_annotated_video function
+         def progress_callback(progress):
+             task['video_progress'] = progress
+             if int(progress * 100) % 10 == 0:
+                 print(f"[DEBUG] Video generation progress: {progress * 100:.1f}%")
+
+         success = create_annotated_video(
+             input_path=video_path,
+             events=results['events'],
+             output_path=annotated_path,
+             use_hybrid=config.get('use_hybrid', True),
+             progress_callback=progress_callback
+         )
+
+         if not success:
+             raise Exception("Video annotation failed")
+
+         # Verify the file was created
+         if not os.path.exists(annotated_path):
+             raise Exception(f"Annotated video file was not created at {annotated_path}")
+
+         file_size = os.path.getsize(annotated_path)
+         print(f"[DEBUG] Annotated video file size: {file_size} bytes")
+
+         if file_size == 0:
+             raise Exception("Annotated video file is empty")
+
+         # Update task
+         task['annotated_video'] = annotated_path
+         task['video_ready'] = True
+         task['video_generating'] = False
+         task['video_progress'] = 1.0
+
+         print(f"[DEBUG] Video generation completed for {task_id}")
+         print(f"[DEBUG] Video file exists: {os.path.exists(annotated_path)}")
+
+     except Exception as e:
+         print(f"[ERROR] Video generation failed for {task_id}: {str(e)}")
+         import traceback
+         traceback.print_exc()
+         task['video_error'] = str(e)
+         task['video_ready'] = False
+         task['video_generating'] = False
+
+
+ @app.get("/api/video-status/{task_id}")
+ async def get_video_status(task_id: str):
+     """Check if video is ready for download"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     return {
+         "task_id": task_id,
+         "video_ready": task.get('video_ready', False),
+         "video_generating": task.get('video_generating', False),
+         "video_progress": task.get('video_progress', 0.0),
+         "video_error": task.get('video_error', None)
+     }
+
+
+ @app.get("/api/download-video/{task_id}")
+ async def download_video(task_id: str):
+     """Download annotated video"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     print(f"[DEBUG] Download request for {task_id}")
+     print(f"[DEBUG] Video ready: {task.get('video_ready')}")
+     print(f"[DEBUG] Annotated video path: {task.get('annotated_video')}")
+
+     if not task.get('video_ready'):
+         raise HTTPException(status_code=400, detail="Video not ready")
+
+     video_path = task.get('annotated_video')
+
+     if not video_path:
+         raise HTTPException(status_code=404, detail="Video path not set")
+
+     if not os.path.exists(video_path):
+         raise HTTPException(status_code=404, detail=f"Video file not found at {video_path}")
+
+     return FileResponse(
+         video_path,
+         media_type="video/mp4",
+         filename=f"annotated_{task_id}.mp4"
+     )
+
+
+ @app.get("/api/export-csv/{task_id}")
+ async def export_csv(task_id: str):
+     """Export results as CSV"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     if task['status'] != 'completed' or 'results' not in task:
+         raise HTTPException(status_code=400, detail="No results available")
+
+     events = task['results']['events']
+     df = pd.DataFrame(events)
+
+     csv_buffer = BytesIO()
+     df.to_csv(csv_buffer, index=False)
+     csv_buffer.seek(0)
+
+     return StreamingResponse(
+         csv_buffer,
+         media_type="text/csv",
+         headers={"Content-Disposition": f"attachment; filename=footsteps_{task_id}.csv"}
+     )
+
+
+ @app.get("/api/export-json/{task_id}")
+ async def export_json(task_id: str):
+     """Export results as JSON"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     if task['status'] != 'completed' or 'results' not in task:
+         raise HTTPException(status_code=400, detail="No results available")
+
+     return JSONResponse(content=task['results'])
+
+
+ @app.post("/api/generate-audio-video/{task_id}")
+ async def generate_audio_video(task_id: str, background_tasks: BackgroundTasks):
+     """Generate annotated video with footstep audio"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     if task['status'] != 'completed':
+         raise HTTPException(status_code=400, detail="Processing not completed")
+
+     if not task.get('results'):
+         raise HTTPException(status_code=400, detail="No results available")
+
+     background_tasks.add_task(generate_audio_video_task, task_id)
+
+     return {
+         "task_id": task_id,
+         "message": "Audio video generation started"
+     }
+
+
+ def generate_audio_video_task(task_id: str):
+     """Background task for generating video with audio using real.py's functions"""
+     try:
+         print(f"[DEBUG] Starting audio video generation for {task_id}")
+         task = tasks_storage[task_id]
+         results = task['results']
+         video_path = task['video_path']
+         config = task['config']
+
+         task['audio_video_generating'] = True
+         task['audio_video_ready'] = False
+
+         # Step 1: Generate audio track
+         print(f"[DEBUG] Generating audio track...")
+         audio_gen = AudioGenerator()
+
+         # Get audio file for surface type
+         '''surface_type = config.get('surface_type', 'concrete')
+         aud_name = process_video_for_footstep_audio(str(video_path))
+         aud_path = main_sound(aud_name)
+         aud_path = aud_path['default'].replace(".%(ext)s", ".mp3")'''
+
+         aud_path = "audio/Footsteps on Gravel Path Outdoor.mp3"
+
+         duration = results['total_frames'] / results['fps']
+         audio_track = audio_gen.create_audio_track(
+             results['events'],
+             aud_path,
+             duration
+         )
+
+         task['audio_video_progress'] = 0.3
+
+         # Step 2: Create annotated video
+         print(f"[DEBUG] Creating annotated video...")
+         temp_video = tempfile.NamedTemporaryFile(delete=False, suffix='_temp.mp4')
+         temp_video_path = temp_video.name
+         temp_video.close()
+
+         def video_progress(progress):
+             task['audio_video_progress'] = 0.3 + (progress * 0.4)  # 30-70%
+
+         success = create_annotated_video(
+             input_path=video_path,
+             events=results['events'],
+             output_path=temp_video_path,
+             use_hybrid=config.get('use_hybrid', True),
+             progress_callback=video_progress
+         )
+
+         if not success:
+             raise Exception("Video annotation failed")
+
+         task['audio_video_progress'] = 0.7
+
+         # Step 3: Merge audio with video
+         print(f"[DEBUG] Merging audio with video...")
+         final_output = tempfile.NamedTemporaryFile(delete=False, suffix='_final.mp4')
+         final_output_path = final_output.name
+         final_output.close()
+
+         merge_success = merge_audio_with_video(
+             temp_video_path,
+             audio_track,
+             44100,
+             final_output_path
+         )
+
+         if not merge_success:
+             raise Exception("Audio merge failed")
+
+         # Cleanup temp video
+         if os.path.exists(temp_video_path):
+             os.remove(temp_video_path)
+
+         # Verify final file
+         if not os.path.exists(final_output_path):
+             raise Exception(f"Final video file was not created at {final_output_path}")
+
+         file_size = os.path.getsize(final_output_path)
+         print(f"[DEBUG] Final video file size: {file_size} bytes")
+
+         if file_size == 0:
+             raise Exception("Final video file is empty")
+
+         # Update task
+         task['audio_video_path'] = final_output_path
+         task['audio_video_ready'] = True
+         task['audio_video_generating'] = False
+         task['audio_video_progress'] = 1.0
+
+         print(f"[DEBUG] Audio video generation completed for {task_id}")
+
+     except Exception as e:
+         print(f"[ERROR] Audio video generation failed for {task_id}: {str(e)}")
+         import traceback
+         traceback.print_exc()
+         task['audio_video_error'] = str(e)
+         task['audio_video_ready'] = False
+         task['audio_video_generating'] = False
+
+
+ @app.get("/api/audio-video-status/{task_id}")
+ async def get_audio_video_status(task_id: str):
+     """Check if audio video is ready for download"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     return {
+         "task_id": task_id,
+         "audio_video_ready": task.get('audio_video_ready', False),
+         "audio_video_generating": task.get('audio_video_generating', False),
+         "audio_video_progress": task.get('audio_video_progress', 0.0),
+         "audio_video_error": task.get('audio_video_error', None)
+     }
+
+
+ @app.get("/api/download-audio-video/{task_id}")
+ async def download_audio_video(task_id: str):
+     """Download video with audio"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     if not task.get('audio_video_ready'):
+         raise HTTPException(status_code=400, detail="Audio video not ready")
+
+     video_path = task.get('audio_video_path')
+
+     if not video_path:
+         raise HTTPException(status_code=404, detail="Video path not set")
+
+     if not os.path.exists(video_path):
+         raise HTTPException(status_code=404, detail=f"Video file not found at {video_path}")
+
+     return FileResponse(
+         video_path,
+         media_type="video/mp4",
+         filename=f"footsteps_with_audio_{task_id}.mp4"
+     )
+
+
+ @app.post("/api/live/capture-floor")
+ async def capture_floor_frame(file: UploadFile = File(...)):
+     """Capture floor frame for live mode"""
+     if not file.content_type.startswith('image/'):
+         raise HTTPException(status_code=400, detail="File must be an image")
+
+     session_id = f"live_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{os.urandom(4).hex()}"
+
+     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
+     content = await file.read()
+     temp_file.write(content)
+     temp_file.close()
+
+     tasks_storage[session_id] = {
+         'type': 'live',
+         'floor_frame': temp_file.name,
+         'created_at': datetime.now().isoformat()
+     }
+
+     return {
+         "session_id": session_id,
+         "message": "Floor frame captured"
+     }
+
+
+ @app.post("/api/live/detect-frame/{session_id}")
+ async def detect_frame(session_id: str, file: UploadFile = File(...)):
+     """Detect footsteps in a single frame"""
+     if session_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Session not found")
+
+     if not file.content_type.startswith('image/'):
+         raise HTTPException(status_code=400, detail="File must be an image")
+
+     # Read frame
+     content = await file.read()
+     nparr = np.frombuffer(content, np.uint8)
+     frame = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+
+     # TODO: Implement real-time detection
+     # This would use the LiveFootstepDetector class from real.py
+
+     return {
+         "session_id": session_id,
+         "detected": False,
+         "message": "Frame processed"
+     }
+
+ '''
+ if __name__ == "__main__":
+     import uvicorn
+
+     uvicorn.run(app, host="0.0.0.0", port=8000)'''
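Usage note: a minimal client sketch against link.py's upload/process/status endpoints, assuming the API is served locally on port 8000; the base URL and clip name are placeholders.

# Hypothetical client for the link.py API; base URL and file name are placeholders.
import time
import requests

BASE = "http://localhost:8000"
with open("sample.mp4", "rb") as f:  # placeholder input clip
    upload = requests.post(f"{BASE}/api/upload-video",
                           files={"file": ("sample.mp4", f, "video/mp4")}).json()
task_id = upload["task_id"]
requests.post(f"{BASE}/api/process/{task_id}")
while True:
    status = requests.get(f"{BASE}/api/status/{task_id}").json()
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(2)
print(status["status"])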
link2.py ADDED
@@ -0,0 +1,828 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
2
+ from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from pydantic import BaseModel
5
+ from typing import Optional, List, Dict, Any
6
+ import cv2
7
+ import numpy as np
8
+ import mediapipe as mp
9
+ from pathlib import Path
10
+ import json
11
+ import subprocess
12
+ import os
13
+ import soundfile as sf
14
+ from datetime import datetime
15
+ import tempfile
16
+ import pandas as pd
17
+ import shutil
18
+ import asyncio
19
+ from concurrent.futures import ThreadPoolExecutor
20
+ import base64
21
+ from io import BytesIO
22
+
23
+ # Suppress warnings
24
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
25
+ import absl.logging
26
+
27
+ absl.logging.set_verbosity(absl.logging.ERROR)
28
+
29
+ # Mock streamlit before importing real.py
30
+ import sys
31
+
32
+
33
+ class MockStreamlit:
34
+ def __getattr__(self, name):
35
+ def mock_func(*args, **kwargs):
36
+ pass
37
+
38
+ return mock_func
39
+
40
+
41
+ sys.modules['streamlit'] = MockStreamlit()
42
+
43
+ # Import working classes and functions from real.py
44
+ from real import (
45
+ HybridFootstepDetectionPipeline,
46
+ PersonTracker,
47
+ AudioGenerator,
48
+ LiveFootstepDetector,
49
+ create_annotated_video,
50
+ merge_audio_with_video
51
+ )
52
+
53
+ # Import your custom modules
54
+ from agent import process_video_for_footstep_audio
55
+ from sound_agent import main_sound
56
+ from qsec import extract_second_audio_librosa
57
+
58
+ app = FastAPI(title="Footstep Detection API", version="1.0.0")
59
+
60
+ # CORS middleware
61
+ app.add_middleware(
62
+ CORSMiddleware,
63
+ allow_origins=["*"],
64
+ allow_credentials=True,
65
+ allow_methods=["*"],
66
+ allow_headers=["*"],
67
+ )
68
+
69
+ # Thread pool for CPU-intensive tasks
70
+ executor = ThreadPoolExecutor(max_workers=4)
71
+
72
+
73
+ # ==================== Pydantic Models ====================
74
+
75
+ class ProcessingConfig(BaseModel):
76
+ sensitivity: str = "medium"
77
+ yolo_conf: float = 0.5
78
+ use_hybrid: bool = True
79
+ create_annotated: bool = True
80
+ add_audio: bool = True
81
+ surface_type: str = "concrete"
82
+
83
+
84
+ class FootstepEvent(BaseModel):
85
+ frame: int
86
+ timecode: str
87
+ foot: str
88
+ event: str
89
+ time_seconds: float
90
+ confidence: float
91
+
92
+
93
+ class ProcessingResult(BaseModel):
94
+ task_id: str
95
+ status: str
96
+ progress: float
97
+ events: Optional[List[FootstepEvent]] = None
98
+ total_frames: Optional[int] = None
99
+ fps: Optional[float] = None
100
+ detection_stats: Optional[Dict[str, Any]] = None
101
+ error: Optional[str] = None
102
+
103
+
104
+ class LiveDetectionConfig(BaseModel):
105
+ sensitivity: str = "medium"
106
+ yolo_conf: float = 0.5
107
+
108
+
109
+ # ==================== Storage ====================
110
+
111
+ # In-memory storage for tasks
112
+ tasks_storage = {}
113
+ video_storage = {}
114
+
115
+
116
+ def get_ffmpeg_path():
117
+ """Get FFmpeg path"""
118
+ possible_paths = [
119
+ "ffmpeg", # Try system ffmpeg first (Docker/Linux)
120
+ r"C:\Users\abhiv\OneDrive\Desktop\agentic ai\SoundFeet\ffmpeg-7.1-essentials_build\bin\ffmpeg.exe", # Local Windows
121
+ "./ffmpeg-7.1-essentials_build/bin/ffmpeg.exe", # Relative path
122
+ ]
123
+
124
+ for path in possible_paths:
125
+ if path == "ffmpeg":
126
+ try:
127
+ result = subprocess.run([path, '-version'], capture_output=True, timeout=5)
128
+ if result.returncode == 0:
129
+ return path
130
+ except:
131
+ continue
132
+ else:
133
+ if os.path.exists(path):
134
+ return path
135
+ return None
136
+
137
+
138
+ FFMPEG_PATH = get_ffmpeg_path()
139
+
140
+
141
+ # ==================== API Endpoints ====================
142
+
143
+ @app.get("/")
144
+ async def root():
145
+ return {"message": "Footstep Detection API", "version": "1.0.0"}
146
+
147
+
148
+ @app.post("/api/upload-video")
149
+ async def upload_video(
150
+ file: UploadFile = File(...),
151
+ config: Optional[str] = None
152
+ ):
153
+ """Upload video and create task"""
154
+ if not file.content_type.startswith('video/'):
155
+ raise HTTPException(status_code=400, detail="File must be a video")
156
+
157
+ # Generate task ID
158
+ task_id = f"task_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{os.urandom(4).hex()}"
159
+
160
+ # Save video to temp file
161
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
162
+ content = await file.read()
163
+ temp_file.write(content)
164
+ temp_file.close()
165
+
166
+ # Parse config
167
+ if config:
168
+ try:
169
+ config_dict = json.loads(config)
170
+ except:
171
+ config_dict = {}
172
+ else:
173
+ config_dict = {}
174
+
175
+ processing_config = ProcessingConfig(**config_dict)
176
+
177
+ # Create task
178
+ tasks_storage[task_id] = {
179
+ 'task_id': task_id,
180
+ 'status': 'uploaded',
181
+ 'progress': 0.0,
182
+ 'video_path': temp_file.name,
183
+ 'config': processing_config.dict(),
184
+ 'created_at': datetime.now().isoformat()
185
+ }
186
+
187
+ return {
188
+ "task_id": task_id,
189
+ "status": "uploaded",
190
+ "message": "Video uploaded successfully"
191
+ }
192
+
193
+
194
+ @app.post("/api/process/{task_id}")
195
+ async def process_video(task_id: str, background_tasks: BackgroundTasks):
196
+ """Start processing video"""
197
+ if task_id not in tasks_storage:
198
+ raise HTTPException(status_code=404, detail="Task not found")
199
+
200
+ task = tasks_storage[task_id]
201
+
202
+ if task['status'] == 'processing':
203
+ return {"message": "Task is already being processed"}
204
+
205
+ task['status'] = 'processing'
206
+ task['progress'] = 0.0
207
+
208
+ background_tasks.add_task(process_video_task, task_id)
209
+
210
+ return {
211
+ "task_id": task_id,
212
+ "status": "processing",
213
+ "message": "Video processing started"
214
+ }
215
+
216
+
217
+ def process_video_task(task_id: str):
218
+ """Background task for video processing"""
219
+ try:
220
+ task = tasks_storage[task_id]
221
+ config = task['config']
222
+ video_path = task['video_path']
223
+
224
+ # Get video info first
225
+ cap = cv2.VideoCapture(video_path)
226
+ fps = cap.get(cv2.CAP_PROP_FPS)
227
+ cap.release()
228
+
229
+ # Initialize pipeline using real.py's class
230
+ pipeline = HybridFootstepDetectionPipeline(
231
+ fps=fps,
232
+ sensitivity=config['sensitivity'],
233
+ yolo_conf=config['yolo_conf']
234
+ )
235
+
236
+ # Process video using real.py's method
237
+ def progress_callback(progress):
238
+ task['progress'] = progress
239
+
240
+ results = pipeline.process_video(video_path, progress_callback)
241
+
242
+ # Update task
243
+ task['status'] = 'completed'
244
+ task['progress'] = 1.0
245
+ task['results'] = results
246
+ task['completed_at'] = datetime.now().isoformat()
247
+
248
+ except Exception as e:
249
+ task['status'] = 'failed'
250
+ task['error'] = str(e)
251
+ task['failed_at'] = datetime.now().isoformat()
252
+
253
+
254
+ @app.get("/api/status/{task_id}")
255
+ async def get_task_status(task_id: str):
256
+ """Get task status and progress"""
257
+ if task_id not in tasks_storage:
258
+ raise HTTPException(status_code=404, detail="Task not found")
259
+
260
+ task = tasks_storage[task_id]
261
+
262
+ response = {
263
+ "task_id": task_id,
264
+ "status": task['status'],
265
+ "progress": task['progress']
266
+ }
267
+
268
+ if task['status'] == 'completed' and 'results' in task:
269
+ response['results'] = task['results']
270
+ elif task['status'] == 'failed':
271
+ response['error'] = task.get('error')
272
+
273
+ return response
274
+
275
+
276
+ @app.post("/api/generate-video/{task_id}")
277
+ async def generate_video(task_id: str, background_tasks: BackgroundTasks):
278
+ """Generate annotated video"""
279
+ if task_id not in tasks_storage:
280
+ raise HTTPException(status_code=404, detail="Task not found")
281
+
282
+ task = tasks_storage[task_id]
283
+
284
+ if task['status'] != 'completed':
285
+ raise HTTPException(status_code=400, detail="Processing not completed")
286
+
287
+ if not task.get('results'):
288
+ raise HTTPException(status_code=400, detail="No results available")
289
+
290
+ background_tasks.add_task(generate_video_task, task_id)
291
+
292
+ return {
293
+ "task_id": task_id,
294
+ "message": "Video generation started"
295
+ }
296
+
297
+
298
+ def generate_video_task(task_id: str):
299
+ """Background task for video generation using real.py's create_annotated_video"""
300
+ try:
301
+ print(f"[DEBUG] Starting video generation for {task_id}")
302
+ task = tasks_storage[task_id]
303
+ results = task['results']
304
+ video_path = task['video_path']
305
+ config = task['config']
306
+
307
+ task['video_generating'] = True
308
+ task['video_ready'] = False
309
+
310
+ print(f"[DEBUG] Creating annotated video for {task_id}")
311
+
312
+ # Generate output path
313
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='_annotated.mp4')
314
+ annotated_path = temp_file.name
315
+ temp_file.close()
316
+
317
+ print(f"[DEBUG] Output video path: {annotated_path}")
318
+ print(f"[DEBUG] Input video path: {video_path}")
319
+
320
+ # Use real.py's create_annotated_video function
321
+ def progress_callback(progress):
322
+ task['video_progress'] = progress
323
+ if int(progress * 100) % 10 == 0:
324
+ print(f"[DEBUG] Video generation progress: {progress * 100:.1f}%")
325
+
326
+ success = create_annotated_video(
327
+ input_path=video_path,
328
+ events=results['events'],
329
+ output_path=annotated_path,
330
+ use_hybrid=config.get('use_hybrid', True),
331
+ progress_callback=progress_callback
332
+ )
333
+
334
+ if not success:
335
+ raise Exception("Video annotation failed")
336
+
337
+ # Verify the file was created
338
+ if not os.path.exists(annotated_path):
339
+ raise Exception(f"Annotated video file was not created at {annotated_path}")
340
+
341
+ file_size = os.path.getsize(annotated_path)
342
+ print(f"[DEBUG] Annotated video file size: {file_size} bytes")
343
+
344
+ if file_size == 0:
345
+ raise Exception("Annotated video file is empty")
346
+
347
+ # Update task
348
+ task['annotated_video'] = annotated_path
349
+ task['video_ready'] = True
350
+ task['video_generating'] = False
351
+ task['video_progress'] = 1.0
352
+
353
+ print(f"[DEBUG] Video generation completed for {task_id}")
354
+ print(f"[DEBUG] Video file exists: {os.path.exists(annotated_path)}")
355
+
356
+ except Exception as e:
357
+ print(f"[ERROR] Video generation failed for {task_id}: {str(e)}")
358
+ import traceback
359
+ traceback.print_exc()
360
+ task['video_error'] = str(e)
361
+ task['video_ready'] = False
362
+ task['video_generating'] = False
363
+
364
+
365
+ @app.get("/api/video-status/{task_id}")
366
+ async def get_video_status(task_id: str):
367
+ """Check if video is ready for download"""
368
+ if task_id not in tasks_storage:
369
+ raise HTTPException(status_code=404, detail="Task not found")
370
+
371
+ task = tasks_storage[task_id]
372
+
373
+ return {
374
+ "task_id": task_id,
375
+ "video_ready": task.get('video_ready', False),
376
+ "video_generating": task.get('video_generating', False),
377
+ "video_progress": task.get('video_progress', 0.0),
378
+ "video_error": task.get('video_error', None)
379
+ }
380
+
381
+
382
+ @app.get("/api/download-video/{task_id}")
383
+ async def download_video(task_id: str):
384
+ """Download annotated video"""
385
+ if task_id not in tasks_storage:
386
+ raise HTTPException(status_code=404, detail="Task not found")
387
+
388
+ task = tasks_storage[task_id]
389
+
390
+ print(f"[DEBUG] Download request for {task_id}")
391
+ print(f"[DEBUG] Video ready: {task.get('video_ready')}")
392
+ print(f"[DEBUG] Annotated video path: {task.get('annotated_video')}")
393
+
394
+ if not task.get('video_ready'):
395
+ raise HTTPException(status_code=400, detail="Video not ready")
396
+
397
+ video_path = task.get('annotated_video')
398
+
399
+ if not video_path:
400
+ raise HTTPException(status_code=404, detail="Video path not set")
401
+
402
+ if not os.path.exists(video_path):
403
+ raise HTTPException(status_code=404, detail=f"Video file not found at {video_path}")
404
+
405
+ return FileResponse(
406
+ video_path,
407
+ media_type="video/mp4",
408
+ filename=f"annotated_{task_id}.mp4"
409
+ )
410
+
411
+
412
+ @app.get("/api/export-csv/{task_id}")
413
+ async def export_csv(task_id: str):
414
+ """Export results as CSV"""
415
+ if task_id not in tasks_storage:
416
+ raise HTTPException(status_code=404, detail="Task not found")
417
+
418
+ task = tasks_storage[task_id]
419
+
420
+ if task['status'] != 'completed' or 'results' not in task:
421
+ raise HTTPException(status_code=400, detail="No results available")
422
+
423
+ events = task['results']['events']
424
+ df = pd.DataFrame(events)
425
+
426
+ csv_buffer = BytesIO()
427
+ df.to_csv(csv_buffer, index=False)
428
+ csv_buffer.seek(0)
429
+
430
+ return StreamingResponse(
431
+ csv_buffer,
432
+ media_type="text/csv",
433
+ headers={"Content-Disposition": f"attachment; filename=footsteps_{task_id}.csv"}
434
+ )
435
+
436
+
437
+ @app.get("/api/export-json/{task_id}")
438
+ async def export_json(task_id: str):
439
+ """Export results as JSON"""
440
+ if task_id not in tasks_storage:
441
+ raise HTTPException(status_code=404, detail="Task not found")
442
+
443
+ task = tasks_storage[task_id]
444
+
445
+ if task['status'] != 'completed' or 'results' not in task:
446
+ raise HTTPException(status_code=400, detail="No results available")
447
+
448
+ return JSONResponse(content=task['results'])
449
+
450
+
451
+ @app.post("/api/generate-audio-video/{task_id}")
452
+ async def generate_audio_video(task_id: str, background_tasks: BackgroundTasks):
453
+ """Generate annotated video with footstep audio"""
454
+ if task_id not in tasks_storage:
455
+ raise HTTPException(status_code=404, detail="Task not found")
456
+
457
+ task = tasks_storage[task_id]
458
+
459
+ if task['status'] != 'completed':
460
+ raise HTTPException(status_code=400, detail="Processing not completed")
461
+
462
+ if not task.get('results'):
463
+ raise HTTPException(status_code=400, detail="No results available")
464
+
465
+ background_tasks.add_task(generate_audio_video_task, task_id)
466
+
467
+ return {
468
+ "task_id": task_id,
469
+ "message": "Audio video generation started"
470
+ }
471
+
472
+
473
+ def generate_audio_video_task(task_id: str):
474
+ """Background task for generating video with audio using real.py's functions"""
475
+ try:
476
+ print(f"[DEBUG] Starting audio video generation for {task_id}")
477
+ task = tasks_storage[task_id]
478
+ results = task['results']
479
+ video_path = task['video_path']
480
+ config = task['config']
481
+
482
+ task['audio_video_generating'] = True
483
+ task['audio_video_ready'] = False
484
+
485
+ # Step 1: Generate audio track
486
+ print(f"[DEBUG] Generating audio track...")
487
+ audio_gen = AudioGenerator()
488
+
489
+ # Get audio file for surface type
490
+ surface_type = config.get('surface_type', 'concrete')
491
+ '''aud_name = process_video_for_footstep_audio(str(video_path))
492
+ aud_path = main_sound(aud_name)
493
+ aud_path = aud_path['default'].replace(".%(ext)s", ".mp3")'''
494
+
495
+ aud_path = "audio/Footsteps on Gravel Path Outdoor.mp3"
496
+
497
+ duration = results['total_frames'] / results['fps']
498
+ audio_track = audio_gen.create_audio_track(
499
+ results['events'],
500
+ aud_path,
501
+ duration
502
+ )
503
+
504
+ task['audio_video_progress'] = 0.3
505
+
506
+ # Step 2: Create annotated video
507
+ print(f"[DEBUG] Creating annotated video...")
508
+ temp_video = tempfile.NamedTemporaryFile(delete=False, suffix='_temp.mp4')
509
+ temp_video_path = temp_video.name
510
+ temp_video.close()
511
+
512
+ def video_progress(progress):
513
+ task['audio_video_progress'] = 0.3 + (progress * 0.4) # 30-70%
514
+
515
+ success = create_annotated_video(
516
+ input_path=video_path,
517
+ events=results['events'],
518
+ output_path=temp_video_path,
519
+ use_hybrid=config.get('use_hybrid', True),
520
+ progress_callback=video_progress
521
+ )
522
+
523
+ if not success:
524
+ raise Exception("Video annotation failed")
525
+
526
+ task['audio_video_progress'] = 0.7
527
+
528
+ # Step 3: Merge audio with video
529
+ print(f"[DEBUG] Merging audio with video...")
530
+ final_output = tempfile.NamedTemporaryFile(delete=False, suffix='_final.mp4')
531
+ final_output_path = final_output.name
532
+ final_output.close()
533
+
534
+ merge_success = merge_audio_with_video(
535
+ temp_video_path,
536
+ audio_track,
537
+ 44100,
538
+ final_output_path
539
+ )
540
+
541
+ if not merge_success:
542
+ raise Exception("Audio merge failed")
543
+
544
+ # Cleanup temp video
545
+ if os.path.exists(temp_video_path):
546
+ os.remove(temp_video_path)
547
+
548
+ # Verify final file
549
+ if not os.path.exists(final_output_path):
550
+ raise Exception(f"Final video file was not created at {final_output_path}")
551
+
552
+ file_size = os.path.getsize(final_output_path)
553
+ print(f"[DEBUG] Final video file size: {file_size} bytes")
554
+
555
+ if file_size == 0:
556
+ raise Exception("Final video file is empty")
557
+
558
+ # Update task
559
+ task['audio_video_path'] = final_output_path
560
+ task['audio_video_ready'] = True
561
+ task['audio_video_generating'] = False
562
+ task['audio_video_progress'] = 1.0
563
+
564
+ print(f"[DEBUG] Audio video generation completed for {task_id}")
565
+
566
+ except Exception as e:
567
+ print(f"[ERROR] Audio video generation failed for {task_id}: {str(e)}")
568
+ import traceback
569
+ traceback.print_exc()
570
+ task['audio_video_error'] = str(e)
571
+ task['audio_video_ready'] = False
572
+ task['audio_video_generating'] = False
573
+
574
+
575
+ @app.get("/api/audio-video-status/{task_id}")
576
+ async def get_audio_video_status(task_id: str):
577
+ """Check if audio video is ready for download"""
578
+ if task_id not in tasks_storage:
579
+ raise HTTPException(status_code=404, detail="Task not found")
580
+
581
+ task = tasks_storage[task_id]
582
+
583
+ return {
584
+ "task_id": task_id,
585
+ "audio_video_ready": task.get('audio_video_ready', False),
586
+ "audio_video_generating": task.get('audio_video_generating', False),
587
+ "audio_video_progress": task.get('audio_video_progress', 0.0),
588
+ "audio_video_error": task.get('audio_video_error', None)
589
+ }
590
+
591
+
592
+ @app.get("/api/download-audio-video/{task_id}")
593
+ async def download_audio_video(task_id: str):
594
+ """Download video with audio"""
595
+ if task_id not in tasks_storage:
596
+ raise HTTPException(status_code=404, detail="Task not found")
597
+
598
+ task = tasks_storage[task_id]
599
+
600
+ if not task.get('audio_video_ready'):
601
+ raise HTTPException(status_code=400, detail="Audio video not ready")
602
+
603
+ video_path = task.get('audio_video_path')
604
+
605
+ if not video_path:
606
+ raise HTTPException(status_code=404, detail="Video path not set")
607
+
608
+ if not os.path.exists(video_path):
609
+ raise HTTPException(status_code=404, detail=f"Video file not found at {video_path}")
610
+
611
+ return FileResponse(
612
+ video_path,
613
+ media_type="video/mp4",
614
+ filename=f"footsteps_with_audio_{task_id}.mp4"
615
+ )
616
+
617
+
618
+ @app.post("/api/live/capture-floor")
619
+ async def capture_floor_frame(file: UploadFile = File(...)):
620
+ """Capture floor frame for live mode"""
621
+ if not file.content_type.startswith('image/'):
622
+ raise HTTPException(status_code=400, detail="File must be an image")
623
+
624
+ session_id = f"live_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{os.urandom(4).hex()}"
625
+
626
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
627
+ content = await file.read()
628
+ temp_file.write(content)
629
+ temp_file.close()
630
+
631
+ tasks_storage[session_id] = {
632
+ 'type': 'live',
633
+ 'floor_frame': temp_file.name,
634
+ 'created_at': datetime.now().isoformat()
635
+ }
636
+
637
+ return {
638
+ "session_id": session_id,
639
+ "message": "Floor frame captured"
640
+ }
641
+
642
+
643
+ @app.post("/api/live/detect-frame/{session_id}")
644
+ async def detect_frame(session_id: str, file: UploadFile = File(...)):
645
+ """Detect footsteps in a single frame using LiveFootstepDetector"""
646
+ if session_id not in tasks_storage:
647
+ raise HTTPException(status_code=404, detail="Session not found")
648
+
649
+ if not file.content_type.startswith('image/'):
650
+ raise HTTPException(status_code=400, detail="File must be an image")
651
+
652
+ session = tasks_storage[session_id]
653
+
654
+ # Read frame
655
+ content = await file.read()
656
+ nparr = np.frombuffer(content, np.uint8)
657
+ frame = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
658
+
659
+ if frame is None:
660
+ raise HTTPException(status_code=400, detail="Failed to decode frame")
661
+
662
+ # Initialize detector if not already done
663
+ if 'detector' not in session:
664
+ try:
665
+ # Get audio path from session or use default
666
+ audio_path = session.get('audio_path', 'backend/audio/UrbanFootstepsConcrete.mp3')
667
+ sensitivity = session.get('sensitivity', 'medium')
668
+ yolo_conf = session.get('yolo_conf', 0.5)
669
+
670
+ # Check if audio file exists
671
+ if not os.path.exists(audio_path):
672
+ # Try alternative paths
673
+ alt_paths = [
674
+ 'audio/UrbanFootstepsConcrete.mp3',
675
+ 'backend/audio/Footsteps on Gravel Path Outdoor.mp3',
676
+ 'audio/Footsteps on Gravel Path Outdoor.mp3'
677
+ ]
678
+ audio_found = False
679
+ for alt_path in alt_paths:
680
+ if os.path.exists(alt_path):
681
+ audio_path = alt_path
682
+ audio_found = True
683
+ break
684
+ if not audio_found:
685
+ raise HTTPException(status_code=404,
686
+ detail=f"Audio file not found. Searched paths: {audio_path}, {alt_paths}")
687
+
688
+ # Create detector instance
689
+ detector = LiveFootstepDetector(
690
+ audio_path=audio_path,
691
+ sensitivity=sensitivity,
692
+ yolo_conf=yolo_conf
693
+ )
694
+ detector.start() # Start audio playback thread
695
+
696
+ session['detector'] = detector
697
+ session['detector_started'] = datetime.now().isoformat()
698
+
699
+ except Exception as e:
700
+ raise HTTPException(status_code=500, detail=f"Failed to initialize detector: {str(e)}")
701
+
702
+ detector = session['detector']
703
+
704
+ # Process frame with detector
705
+ try:
706
+ processed_frame, detected_foot = detector.process_frame(frame)
707
+
708
+ # Encode processed frame back to JPEG
709
+ _, buffer = cv2.imencode('.jpg', processed_frame)
710
+ frame_base64 = base64.b64encode(buffer).decode('utf-8')
711
+
712
+ response = {
713
+ "session_id": session_id,
714
+ "detected": detected_foot is not None,
715
+ "foot": detected_foot, # 'LEFT', 'RIGHT', or None
716
+ "frame": frame_base64, # Processed frame with annotations
717
+ "message": f"{detected_foot} STRIKE!" if detected_foot else "Frame processed"
718
+ }
719
+
720
+ # Update session stats
721
+ if 'detection_count' not in session:
722
+ session['detection_count'] = 0
723
+ if detected_foot:
724
+ session['detection_count'] += 1
725
+ session['last_detection'] = {
726
+ 'foot': detected_foot,
727
+ 'timestamp': datetime.now().isoformat()
728
+ }
729
+
730
+ return response
731
+
732
+ except Exception as e:
733
+ raise HTTPException(status_code=500, detail=f"Frame processing error: {str(e)}")
734
+
735
+
736
+ @app.post("/api/live/generate-audio/{session_id}")
737
+ async def generate_audio(session_id: str):
738
+ """Generate audio for live detection based on floor analysis"""
739
+ if session_id not in tasks_storage:
740
+ raise HTTPException(status_code=404, detail="Session not found")
741
+
742
+ session = tasks_storage[session_id]
743
+
744
+ if 'floor_frame' not in session:
745
+ raise HTTPException(status_code=400, detail="No floor frame found")
746
+
747
+ # For now, we'll use a default audio path based on common floor types
748
+ # In a real implementation, this could use LLM vision to analyze the floor
749
+ # and select the appropriate audio file
750
+
751
+ # Default audio paths to try
752
+ audio_paths = [
753
+ 'audio/Footsteps on Gravel Path Outdoor.mp3'
754
+ ]
755
+
756
+ audio_path = None
757
+ for path in audio_paths:
758
+ if os.path.exists(path):
759
+ audio_path = path
760
+ break
761
+
762
+ if not audio_path:
763
+ raise HTTPException(
764
+ status_code=404,
765
+ detail=f"No audio file found. Please ensure audio files exist in backend/audio/ directory. Searched: {audio_paths}"
766
+ )
767
+
768
+ # Store audio path in session for later use
769
+ session['audio_path'] = audio_path
770
+ session['audio_ready'] = True
771
+ session['surface_type'] = 'gravel' # Matches the default gravel clip selected above; could be refined with LLM floor analysis
772
+
773
+ return {
774
+ "session_id": session_id,
775
+ "message": "Audio generated successfully",
776
+ "surface_type": session['surface_type'],
777
+ "audio_ready": True
778
+ }
779
+
780
+
781
+ @app.post("/api/live/stop-session/{session_id}")
782
+ async def stop_live_session(session_id: str):
783
+ """Stop live detection session and cleanup resources"""
784
+ if session_id not in tasks_storage:
785
+ raise HTTPException(status_code=404, detail="Session not found")
786
+
787
+ session = tasks_storage[session_id]
788
+
789
+ # Stop detector if exists
790
+ if 'detector' in session:
791
+ try:
792
+ detector = session['detector']
793
+ detector.stop()
794
+ del session['detector']
795
+ except Exception as e:
796
+ print(f"Error stopping detector: {e}")
797
+
798
+ # Cleanup floor frame
799
+ if 'floor_frame' in session:
800
+ try:
801
+ if os.path.exists(session['floor_frame']):
802
+ os.remove(session['floor_frame'])
803
+ except Exception as e:
804
+ print(f"Error removing floor frame: {e}")
805
+
806
+ # Get stats before deletion
807
+ detection_count = session.get('detection_count', 0)
808
+ last_detection = session.get('last_detection', None)
809
+
810
+ # Remove session
811
+ del tasks_storage[session_id]
812
+
813
+ return {
814
+ "session_id": session_id,
815
+ "message": "Session stopped",
816
+ "stats": {
817
+ "detection_count": detection_count,
818
+ "last_detection": last_detection
819
+ }
820
+ }
821
+
822
+
823
+ '''if __name__ == "__main__":
824
+ import uvicorn
825
+
826
+ uvicorn.run(app, host="0.0.0.0", port=8000)
827
+ '''
828
+
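For quick reference, a minimal client-side sketch (Python, using requests) of how the live-detection endpoints above might be exercised. Assumptions: the API is reachable at http://localhost:8000, SESSION_ID comes from the session-creation endpoint defined earlier in link2.py, a floor frame has already been uploaded for that session, and the detect-frame route follows the same /api/live/... pattern as the decorators shown above (its own decorator sits just outside this hunk). The file names and session id are placeholders.

import requests

BASE = "http://localhost:8000"          # assumed host/port
SESSION_ID = "replace-with-session-id"  # placeholder, returned by the session-creation endpoint

# 1) Ask the server to select/prepare a footstep clip for this session
resp = requests.post(f"{BASE}/api/live/generate-audio/{SESSION_ID}")
print(resp.json())  # expects audio_ready=True and a surface_type

# 2) Send one camera frame for footstep detection
with open("frame.jpg", "rb") as f:  # placeholder image
    resp = requests.post(
        f"{BASE}/api/live/detect-frame/{SESSION_ID}",  # assumed route for detect_frame()
        files={"file": ("frame.jpg", f, "image/jpeg")},
    )
data = resp.json()
print(data["detected"], data.get("foot"))  # e.g. True, "LEFT"

# 3) Stop the session and release the detector
resp = requests.post(f"{BASE}/api/live/stop-session/{SESSION_ID}")
print(resp.json()["stats"])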
qsec.py ADDED
@@ -0,0 +1,31 @@
1
+ import numpy as np
2
+ import librosa
3
+
4
+ def extract_second_audio_librosa(file_path, target_second=0, sample_rate=22050):
5
+ try:
6
+ # Load audio file
7
+ audio_data, sr = librosa.load(file_path, sr=sample_rate)
8
+
9
+ # Calculate start and end samples for the target second
10
+ start_sample = target_second * sr
11
+ end_sample = (target_second + 1) * sr
12
+
13
+ # Ensure we don't go beyond the audio length
14
+ if start_sample >= len(audio_data):
15
+ raise ValueError(f"Target second {target_second} is beyond audio length")
16
+
17
+ end_sample = min(end_sample, len(audio_data))
18
+
19
+ # Extract the second
20
+ second_audio = audio_data[start_sample:end_sample]
21
+
22
+ # If the audio is shorter than 1 second, pad with zeros
23
+ if len(second_audio) < sr:
24
+ second_audio = np.pad(second_audio, (0, sr - len(second_audio)))
25
+
26
+ return second_audio, sr
27
+
28
+ except Exception as e:
29
+ print(f"Error processing audio: {e}")
30
+ return None, None
31
+
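A small usage sketch for qsec.extract_second_audio_librosa: it returns exactly one second of mono audio starting at target_second (zero-padded if the file ends early), or (None, None) on failure. The clip and output paths below are placeholders.

import soundfile as sf

from qsec import extract_second_audio_librosa

chunk, sr = extract_second_audio_librosa("footsteps.mp3", target_second=5, sample_rate=44100)
if chunk is not None:
    print(f"{len(chunk)} samples at {sr} Hz")    # len(chunk) == sr, i.e. one second
    sf.write("footstep_second5.wav", chunk, sr)  # save the extracted second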
real.py ADDED
@@ -0,0 +1,1572 @@
1
+ import pandas as pd
2
+ import streamlit as st
3
+ import cv2
4
+ import numpy as np
5
+ import mediapipe as mp
6
+ from pathlib import Path
7
+ from scipy.signal import find_peaks, savgol_filter
8
+ import json
9
+ import subprocess
10
+ import os
11
+ import soundfile as sf
12
+ from datetime import datetime
13
+ import tempfile
14
+ from ultralytics import YOLO
15
+ from agent import process_video_for_footstep_audio
16
+ from sound_agent import main_sound
17
+ from qsec import extract_second_audio_librosa
18
+ import threading
19
+ import queue
20
+ import time
21
+ from PIL import Image
22
+ import io
23
+
24
+ # Suppress TensorFlow warnings
25
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
26
+ import absl.logging
27
+
28
+ absl.logging.set_verbosity(absl.logging.ERROR)
29
+
30
+
31
+ def get_ffmpeg_path():
32
+ """Get FFmpeg path with multiple fallback options"""
33
+ possible_paths = [
34
+ "ffmpeg", # Try system ffmpeg first (Docker/Linux)
35
+ r"C:\Users\abhiv\OneDrive\Desktop\agentic ai\SoundFeet\ffmpeg-7.1-essentials_build\bin\ffmpeg.exe", # Local Windows path
36
+ "./ffmpeg-7.1-essentials_build/bin/ffmpeg.exe", # Relative path
37
+ ]
38
+
39
+ for path in possible_paths:
40
+ if path == "ffmpeg":
41
+ try:
42
+ result = subprocess.run([path, '-version'], capture_output=True, timeout=5)
43
+ if result.returncode == 0:
44
+ return path
45
+ except Exception:
46
+ continue
47
+ else:
48
+ if os.path.exists(path):
49
+ return path
50
+ return "ffmpeg" # Default to system ffmpeg
51
+
52
+
53
+ FFMPEG_PATH = get_ffmpeg_path()
54
+
55
+ # Streamlit Configuration
56
+ st.set_page_config(
57
+ page_title="Hybrid YOLO-MediaPipe Footstep Detection",
58
+ page_icon="🎬",
59
+ layout="wide",
60
+ initial_sidebar_state="expanded"
61
+ )
62
+
63
+ st.markdown("""
64
+ <style>
65
+ .main-header {
66
+ font-size: 2.5rem;
67
+ font-weight: 700;
68
+ background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
69
+ -webkit-background-clip: text;
70
+ -webkit-text-fill-color: transparent;
71
+ margin-bottom: 2rem;
72
+ }
73
+ .metric-card {
74
+ background: #f0f2f6;
75
+ padding: 1rem;
76
+ border-radius: 0.5rem;
77
+ border-left: 4px solid #667eea;
78
+ }
79
+ .success-box {
80
+ padding: 1rem;
81
+ background: #d4edda;
82
+ border: 1px solid #c3e6cb;
83
+ border-radius: 0.5rem;
84
+ color: #155724;
85
+ }
86
+ .hybrid-badge {
87
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
88
+ color: white;
89
+ padding: 0.5rem 1rem;
90
+ border-radius: 20px;
91
+ display: inline-block;
92
+ font-weight: 600;
93
+ margin: 1rem 0;
94
+ }
95
+ .live-indicator {
96
+ background: #dc3545;
97
+ color: white;
98
+ padding: 0.5rem 1rem;
99
+ border-radius: 20px;
100
+ display: inline-block;
101
+ font-weight: 600;
102
+ animation: pulse 1.5s infinite;
103
+ }
104
+ @keyframes pulse {
105
+ 0%, 100% { opacity: 1; }
106
+ 50% { opacity: 0.5; }
107
+ }
108
+ .ready-badge {
109
+ background: #28a745;
110
+ color: white;
111
+ padding: 0.5rem 1rem;
112
+ border-radius: 20px;
113
+ display: inline-block;
114
+ font-weight: 600;
115
+ }
116
+ </style>
117
+ """, unsafe_allow_html=True)
118
+
119
+
120
+ class LiveFootstepDetector:
121
+ """Real-time footstep detection for live camera feed"""
122
+
123
+ def __init__(self, audio_path, sensitivity='medium', yolo_conf=0.5):
124
+ self.audio_path = audio_path
125
+ self.sensitivity = sensitivity
126
+ self.yolo_conf = yolo_conf
127
+ self.running = False
128
+ self.audio_ready = False
129
+
130
+ # Load footstep audio
131
+ try:
132
+ self.footstep_audio, self.sample_rate = extract_second_audio_librosa(
133
+ file_path=audio_path,
134
+ target_second=5,
135
+ sample_rate=44100
136
+ )
137
+ self.audio_ready = True
138
+ except Exception as e:
139
+ st.error(f"Failed to load audio: {str(e)}")
140
+ self.audio_ready = False
141
+
142
+ # Initialize detection models
143
+ try:
144
+ self.yolo_model = YOLO('yolov8n.pt')
145
+ self.mp_pose = mp.solutions.pose
146
+ self.pose = self.mp_pose.Pose(
147
+ static_image_mode=False,
148
+ model_complexity=1,
149
+ smooth_landmarks=True,
150
+ min_detection_confidence=0.5,
151
+ min_tracking_confidence=0.5
152
+ )
153
+ except Exception as e:
154
+ st.error(f"Failed to initialize models: {str(e)}")
155
+ return
156
+
157
+ # Landmark indices
158
+ self.LEFT_HEEL = 29
159
+ self.RIGHT_HEEL = 30
160
+
161
+ # Detection thresholds
162
+ self.thresholds = {
163
+ 'low': {'prominence': 0.02, 'velocity_threshold': 0.015},
164
+ 'medium': {'prominence': 0.015, 'velocity_threshold': 0.012},
165
+ 'high': {'prominence': 0.01, 'velocity_threshold': 0.010}
166
+ }[sensitivity]
167
+
168
+ # Tracking state
169
+ self.prev_left_y = None
170
+ self.prev_right_y = None
171
+ self.prev_time = None
172
+ self.left_buffer = []
173
+ self.right_buffer = []
174
+ self.buffer_size = 10
175
+
176
+ # Audio playback
177
+ self.audio_queue = queue.Queue()
178
+ self.audio_thread = None
179
+
180
+ def start_audio_playback(self):
181
+ """Start audio playback thread"""
182
+ if not self.audio_ready:
183
+ return
184
+
185
+ def play_audio():
186
+ import pyaudio
187
+ p = pyaudio.PyAudio()
188
+ stream = p.open(
189
+ format=pyaudio.paFloat32,
190
+ channels=1,
191
+ rate=self.sample_rate,
192
+ output=True
193
+ )
194
+
195
+ while self.running:
196
+ try:
197
+ foot = self.audio_queue.get(timeout=0.1)
198
+ # Play footstep sound
199
+ stream.write(self.footstep_audio.astype(np.float32).tobytes())
200
+ except queue.Empty:
201
+ continue
202
+ except Exception as e:
203
+ print(f"Audio playback error: {e}")
204
+
205
+ stream.stop_stream()
206
+ stream.close()
207
+ p.terminate()
208
+
209
+ self.audio_thread = threading.Thread(target=play_audio, daemon=True)
210
+ self.audio_thread.start()
211
+
212
+ def detect_heel_strike(self, current_y, prev_y, foot_buffer):
213
+ """Detect heel strike based on vertical velocity and position"""
214
+ if prev_y is None:
215
+ return False
216
+
217
+ # Calculate vertical velocity (downward is positive)
218
+ velocity = current_y - prev_y
219
+
220
+ # Add to buffer
221
+ foot_buffer.append(current_y)
222
+ if len(foot_buffer) > self.buffer_size:
223
+ foot_buffer.pop(0)
224
+
225
+ if len(foot_buffer) < 5:
226
+ return False
227
+
228
+ # Detect strike: downward movement followed by stabilization
229
+ # Current position is low (heel on ground)
230
+ # Recent movement was downward
231
+ # Velocity is slowing (strike impact)
232
+ recent_velocities = [foot_buffer[i + 1] - foot_buffer[i]
233
+ for i in range(len(foot_buffer) - 1)]
234
+
235
+ avg_velocity = np.mean(recent_velocities[-3:]) if len(recent_velocities) >= 3 else 0
236
+
237
+ is_strike = (
238
+ current_y > 0.7 and # Heel is low in frame
239
+ velocity > self.thresholds['velocity_threshold'] and # Moving down
240
+ avg_velocity < velocity * 0.5 # Velocity decreasing (impact)
241
+ )
242
+
243
+ return is_strike
244
+
245
+ def process_frame(self, frame):
246
+ """Process single frame and detect footsteps"""
247
+ if not self.audio_ready:
248
+ return frame, None
249
+
250
+ detected_foot = None
251
+
252
+ try:
253
+ # YOLO detection
254
+ results = self.yolo_model(frame, conf=self.yolo_conf, classes=[0], verbose=False)
255
+
256
+ person_detected = False
257
+ bbox = None
258
+
259
+ for result in results:
260
+ boxes = result.boxes
261
+ if len(boxes) > 0:
262
+ person_detected = True
263
+ box = boxes[0] # Take first person
264
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
265
+ bbox = (int(x1), int(y1), int(x2), int(y2))
266
+
267
+ # Draw YOLO bbox
268
+ cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
269
+ (255, 255, 0), 2)
270
+ break
271
+
272
+ # MediaPipe pose estimation
273
+ if person_detected and bbox:
274
+ # Crop to person region with padding
275
+ x1, y1, x2, y2 = bbox
276
+ pad = 20
277
+ x1 = max(0, x1 - pad)
278
+ y1 = max(0, y1 - pad)
279
+ x2 = min(frame.shape[1], x2 + pad)
280
+ y2 = min(frame.shape[0], y2 + pad)
281
+
282
+ cropped = frame[y1:y2, x1:x2]
283
+ rgb_frame = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
284
+ pose_results = self.pose.process(rgb_frame)
285
+
286
+ if pose_results.pose_landmarks:
287
+ landmarks = pose_results.pose_landmarks.landmark
288
+
289
+ # Get heel positions (adjusted to full frame)
290
+ left_heel = landmarks[self.LEFT_HEEL]
291
+ right_heel = landmarks[self.RIGHT_HEEL]
292
+
293
+ left_y = (left_heel.y * (y2 - y1) + y1) / frame.shape[0]
294
+ right_y = (right_heel.y * (y2 - y1) + y1) / frame.shape[0]
295
+
296
+ # Detect strikes
297
+ left_strike = self.detect_heel_strike(
298
+ left_y, self.prev_left_y, self.left_buffer
299
+ )
300
+ right_strike = self.detect_heel_strike(
301
+ right_y, self.prev_right_y, self.right_buffer
302
+ )
303
+
304
+ if left_strike:
305
+ detected_foot = 'LEFT'
306
+ self.audio_queue.put('LEFT')
307
+ elif right_strike:
308
+ detected_foot = 'RIGHT'
309
+ self.audio_queue.put('RIGHT')
310
+
311
+ # Update previous positions
312
+ self.prev_left_y = left_y
313
+ self.prev_right_y = right_y
314
+
315
+ # Draw skeleton on full frame
316
+ for landmark in landmarks:
317
+ x = int((landmark.x * (x2 - x1) + x1))
318
+ y = int((landmark.y * (y2 - y1) + y1))
319
+ cv2.circle(frame, (x, y), 3, (0, 255, 0), -1)
320
+
321
+ # Highlight heels
322
+ left_heel_x = int((left_heel.x * (x2 - x1) + x1))
323
+ left_heel_y = int((left_heel.y * (y2 - y1) + y1))
324
+ right_heel_x = int((right_heel.x * (x2 - x1) + x1))
325
+ right_heel_y = int((right_heel.y * (y2 - y1) + y1))
326
+
327
+ cv2.circle(frame, (left_heel_x, left_heel_y), 8, (0, 255, 0), -1)
328
+ cv2.circle(frame, (right_heel_x, right_heel_y), 8, (0, 100, 255), -1)
329
+
330
+ if detected_foot:
331
+ # Show strike indicator
332
+ heel_x = left_heel_x if detected_foot == 'LEFT' else right_heel_x
333
+ heel_y = left_heel_y if detected_foot == 'LEFT' else right_heel_y
334
+ color = (0, 255, 0) if detected_foot == 'LEFT' else (0, 100, 255)
335
+
336
+ cv2.circle(frame, (heel_x, heel_y), 30, color, 3)
337
+ cv2.putText(frame, f"{detected_foot} STRIKE!",
338
+ (heel_x - 50, heel_y - 40),
339
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
340
+
341
+ # Draw status
342
+ status_text = "READY" if self.audio_ready else "NO AUDIO"
343
+ status_color = (0, 255, 0) if self.audio_ready else (0, 0, 255)
344
+ cv2.rectangle(frame, (10, 10), (150, 50), (0, 0, 0), -1)
345
+ cv2.putText(frame, status_text, (20, 35),
346
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, status_color, 2)
347
+
348
+ except Exception as e:
349
+ print(f"Frame processing error: {e}")
350
+
351
+ return frame, detected_foot
352
+
353
+ def start(self):
354
+ """Start the detector"""
355
+ self.running = True
356
+ self.start_audio_playback()
357
+
358
+ def stop(self):
359
+ """Stop the detector"""
360
+ self.running = False
361
+ if self.audio_thread:
362
+ self.audio_thread.join(timeout=2)
363
+
364
+
365
+ class HybridFootstepDetectionPipeline:
366
+ """
367
+ Hybrid Detection Pipeline for video files:
368
+ 1. YOLO detects person bounding boxes
369
+ 2. MediaPipe estimates pose on detected regions
370
+ 3. Track footsteps with improved accuracy
371
+ """
372
+
373
+ def __init__(self, fps=30, sensitivity='medium', yolo_conf=0.5):
374
+ self.fps = fps
375
+ self.sensitivity = sensitivity
376
+ self.yolo_conf = yolo_conf
377
+
378
+ # Initialize YOLO detector
379
+ try:
380
+ self.yolo_model = YOLO('yolov8n.pt')
381
+ st.success("βœ… YOLO detector loaded successfully")
382
+ except Exception as e:
383
+ st.warning(f"⚠️ YOLO loading issue: {str(e)}. Downloading model...")
384
+ try:
385
+ self.yolo_model = YOLO('yolov8n.pt')
386
+ st.success("βœ… YOLO detector loaded successfully")
387
+ except Exception as e2:
388
+ st.error(f"❌ Failed to load YOLO: {str(e2)}")
389
+ self.yolo_model = None
390
+
391
+ # Initialize MediaPipe pose estimator
392
+ try:
393
+ self.mp_pose = mp.solutions.pose
394
+ self.pose = self.mp_pose.Pose(
395
+ static_image_mode=False,
396
+ model_complexity=1,
397
+ smooth_landmarks=True,
398
+ min_detection_confidence=0.5,
399
+ min_tracking_confidence=0.5
400
+ )
401
+ st.success("βœ… MediaPipe pose estimator loaded successfully")
402
+ except Exception as e:
403
+ st.error(f"❌ Failed to initialize MediaPipe: {str(e)}")
404
+ self.pose = None
405
+
406
+ # Landmark indices
407
+ self.LEFT_HEEL = 29
408
+ self.RIGHT_HEEL = 30
409
+ self.LEFT_ANKLE = 27
410
+ self.RIGHT_ANKLE = 28
411
+
412
+ # Detection thresholds
413
+ self.thresholds = {
414
+ 'low': {'prominence': 0.02, 'min_interval': 0.4},
415
+ 'medium': {'prominence': 0.015, 'min_interval': 0.3},
416
+ 'high': {'prominence': 0.01, 'min_interval': 0.25}
417
+ }[sensitivity]
418
+
419
+ # Tracking state
420
+ self.person_tracker = PersonTracker()
421
+
422
+ def detect_person_yolo(self, frame):
423
+ """Detect person using YOLO"""
424
+ if self.yolo_model is None:
425
+ return []
426
+
427
+ try:
428
+ results = self.yolo_model(frame, conf=self.yolo_conf, classes=[0], verbose=False)
429
+
430
+ person_boxes = []
431
+ for result in results:
432
+ boxes = result.boxes
433
+ for box in boxes:
434
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
435
+ conf = box.conf[0].cpu().numpy()
436
+ person_boxes.append((int(x1), int(y1), int(x2), int(y2), float(conf)))
437
+
438
+ return person_boxes
439
+ except Exception as e:
440
+ st.warning(f"YOLO detection failed: {str(e)}")
441
+ return []
442
+
443
+ def estimate_pose_mediapipe(self, frame, bbox=None):
444
+ """Estimate pose using MediaPipe on specified region"""
445
+ if self.pose is None:
446
+ return None
447
+
448
+ try:
449
+ if bbox is not None:
450
+ x1, y1, x2, y2 = bbox
451
+ pad = 20
452
+ x1 = max(0, x1 - pad)
453
+ y1 = max(0, y1 - pad)
454
+ x2 = min(frame.shape[1], x2 + pad)
455
+ y2 = min(frame.shape[0], y2 + pad)
456
+
457
+ cropped = frame[y1:y2, x1:x2]
458
+ if cropped.size == 0:
459
+ return None
460
+
461
+ rgb_frame = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
462
+ results = self.pose.process(rgb_frame)
463
+
464
+ if results.pose_landmarks:
465
+ for landmark in results.pose_landmarks.landmark:
466
+ landmark.x = (landmark.x * (x2 - x1) + x1) / frame.shape[1]
467
+ landmark.y = (landmark.y * (y2 - y1) + y1) / frame.shape[0]
468
+
469
+ return results
470
+ else:
471
+ rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
472
+ return self.pose.process(rgb_frame)
473
+
474
+ except Exception as e:
475
+ return None
476
+
477
+ def process_video(self, video_path, progress_callback=None):
478
+ """Process video with hybrid YOLO-MediaPipe pipeline"""
479
+
480
+ if self.yolo_model is None or self.pose is None:
481
+ st.error("❌ Detection models not available")
482
+ return None
483
+
484
+ cap = cv2.VideoCapture(str(video_path))
485
+ if not cap.isOpened():
486
+ st.error("❌ Could not open video file")
487
+ return None
488
+
489
+ fps = cap.get(cv2.CAP_PROP_FPS)
490
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
491
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
492
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
493
+
494
+ if fps <= 0 or total_frames <= 0:
495
+ st.error("❌ Invalid video properties")
496
+ cap.release()
497
+ return None
498
+
499
+ left_positions = []
500
+ right_positions = []
501
+ detection_confidence = []
502
+ frame_idx = 0
503
+
504
+ yolo_detections = 0
505
+ pose_detections = 0
506
+
507
+ st.info(f"πŸ”„ Processing with Hybrid Pipeline: {total_frames} frames")
508
+
509
+ try:
510
+ while cap.isOpened():
511
+ ret, frame = cap.read()
512
+ if not ret:
513
+ break
514
+
515
+ person_boxes = self.detect_person_yolo(frame)
516
+
517
+ if person_boxes:
518
+ yolo_detections += 1
519
+ best_box = self.person_tracker.select_best_person(person_boxes, frame_idx)
520
+ bbox = best_box[:4]
521
+ results = self.estimate_pose_mediapipe(frame, bbox)
522
+
523
+ if results and results.pose_landmarks:
524
+ pose_detections += 1
525
+ landmarks = results.pose_landmarks.landmark
526
+
527
+ left_y = landmarks[self.LEFT_HEEL].y
528
+ right_y = landmarks[self.RIGHT_HEEL].y
529
+ conf = (landmarks[self.LEFT_HEEL].visibility +
530
+ landmarks[self.RIGHT_HEEL].visibility) / 2
531
+
532
+ left_positions.append(left_y)
533
+ right_positions.append(right_y)
534
+ detection_confidence.append(conf)
535
+ else:
536
+ left_positions.append(np.nan)
537
+ right_positions.append(np.nan)
538
+ detection_confidence.append(0.0)
539
+ else:
540
+ results = self.estimate_pose_mediapipe(frame, bbox=None)
541
+
542
+ if results and results.pose_landmarks:
543
+ pose_detections += 1
544
+ landmarks = results.pose_landmarks.landmark
545
+
546
+ left_positions.append(landmarks[self.LEFT_HEEL].y)
547
+ right_positions.append(landmarks[self.RIGHT_HEEL].y)
548
+ detection_confidence.append(0.5)
549
+ else:
550
+ left_positions.append(np.nan)
551
+ right_positions.append(np.nan)
552
+ detection_confidence.append(0.0)
553
+
554
+ frame_idx += 1
555
+
556
+ if progress_callback and frame_idx % 10 == 0:
557
+ progress = min(frame_idx / total_frames, 1.0)
558
+ progress_callback(progress)
559
+
560
+ except Exception as e:
561
+ st.error(f"❌ Video processing error: {str(e)}")
562
+ cap.release()
563
+ return None
564
+
565
+ cap.release()
566
+
567
+ st.info(
568
+ f"πŸ“Š YOLO detections: {yolo_detections}/{total_frames} frames ({yolo_detections / total_frames * 100:.1f}%)")
569
+ st.info(
570
+ f"πŸ“Š Pose detections: {pose_detections}/{total_frames} frames ({pose_detections / total_frames * 100:.1f}%)")
571
+
572
+ if len(left_positions) == 0:
573
+ st.error("❌ No frames processed successfully")
574
+ return None
575
+
576
+ try:
577
+ left_series = pd.Series(left_positions).interpolate(method='linear')
578
+ left_series = left_series.bfill().ffill()
579
+ left_positions = left_series.values
580
+
581
+ right_series = pd.Series(right_positions).interpolate(method='linear')
582
+ right_series = right_series.bfill().ffill()
583
+ right_positions = right_series.values
584
+
585
+ if len(left_positions) > 5:
586
+ window = min(11, len(left_positions) if len(left_positions) % 2 == 1 else len(left_positions) - 1)
587
+ if window >= 3:
588
+ left_positions = savgol_filter(left_positions, window, 2)
589
+ right_positions = savgol_filter(right_positions, window, 2)
590
+
591
+ left_strikes = self._detect_strikes(left_positions, fps)
592
+ right_strikes = self._detect_strikes(right_positions, fps)
593
+
594
+ events = []
595
+
596
+ for frame in left_strikes:
597
+ events.append({
598
+ 'frame': int(frame),
599
+ 'timecode': self._frames_to_smpte(frame, fps),
600
+ 'foot': 'LEFT',
601
+ 'event': 'HEEL_STRIKE',
602
+ 'time_seconds': frame / fps,
603
+ 'confidence': detection_confidence[int(frame)] if int(frame) < len(detection_confidence) else 0.5
604
+ })
605
+
606
+ for frame in right_strikes:
607
+ events.append({
608
+ 'frame': int(frame),
609
+ 'timecode': self._frames_to_smpte(frame, fps),
610
+ 'foot': 'RIGHT',
611
+ 'event': 'HEEL_STRIKE',
612
+ 'time_seconds': frame / fps,
613
+ 'confidence': detection_confidence[int(frame)] if int(frame) < len(detection_confidence) else 0.5
614
+ })
615
+
616
+ events = sorted(events, key=lambda x: x['frame'])
617
+
618
+ return {
619
+ 'events': events,
620
+ 'fps': fps,
621
+ 'total_frames': total_frames,
622
+ 'width': width,
623
+ 'height': height,
624
+ 'left_positions': left_positions.tolist() if hasattr(left_positions, 'tolist') else left_positions,
625
+ 'right_positions': right_positions.tolist() if hasattr(right_positions, 'tolist') else right_positions,
626
+ 'detection_stats': {
627
+ 'yolo_detections': yolo_detections,
628
+ 'pose_detections': pose_detections,
629
+ 'total_frames': total_frames
630
+ }
631
+ }
632
+
633
+ except Exception as e:
634
+ st.error(f"❌ Data processing error: {str(e)}")
635
+ return None
636
+
637
+ def _detect_strikes(self, positions, fps):
638
+ """Detect heel strikes from position data"""
639
+ try:
640
+ peaks, _ = find_peaks(
641
+ positions,
642
+ prominence=self.thresholds['prominence'],
643
+ distance=int(fps * self.thresholds['min_interval']),
644
+ height=0.7
645
+ )
646
+ return peaks
647
+ except Exception as e:
648
+ st.warning(f"Peak detection failed: {str(e)}")
649
+ return np.array([])
650
+
651
+ def _frames_to_smpte(self, frame, fps):
652
+ """Convert frame number to SMPTE timecode"""
653
+ total_seconds = frame / fps
654
+ hours = int(total_seconds // 3600)
655
+ minutes = int((total_seconds % 3600) // 60)
656
+ seconds = int(total_seconds % 60)
657
+ frames = int((total_seconds * fps) % fps)
658
+ return f"{hours:02d}:{minutes:02d}:{seconds:02d}:{frames:02d}"
659
+
660
+
661
+ class PersonTracker:
662
+ """Track person across frames for consistency"""
663
+
664
+ def __init__(self, iou_threshold=0.3):
665
+ self.tracked_box = None
666
+ self.last_frame = -1
667
+ self.iou_threshold = iou_threshold
668
+
669
+ def calculate_iou(self, box1, box2):
670
+ """Calculate IoU between two bounding boxes"""
671
+ x1_1, y1_1, x2_1, y2_1 = box1[:4]
672
+ x1_2, y1_2, x2_2, y2_2 = box2[:4]
673
+
674
+ xi1 = max(x1_1, x1_2)
675
+ yi1 = max(y1_1, y1_2)
676
+ xi2 = min(x2_1, x2_2)
677
+ yi2 = min(y2_1, y2_2)
678
+
679
+ inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
680
+
681
+ box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
682
+ box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
683
+
684
+ union_area = box1_area + box2_area - inter_area
685
+
686
+ return inter_area / union_area if union_area > 0 else 0
687
+
688
+ def select_best_person(self, person_boxes, frame_idx):
689
+ """Select best person box for tracking consistency"""
690
+ if not person_boxes:
691
+ return None
692
+
693
+ if self.tracked_box is not None and frame_idx - self.last_frame < 10:
694
+ max_iou = 0
695
+ best_box = None
696
+
697
+ for box in person_boxes:
698
+ iou = self.calculate_iou(self.tracked_box, box)
699
+ if iou > max_iou:
700
+ max_iou = iou
701
+ best_box = box
702
+
703
+ if max_iou > self.iou_threshold:
704
+ self.tracked_box = best_box
705
+ self.last_frame = frame_idx
706
+ return best_box
707
+
708
+ best_box = max(person_boxes, key=lambda x: (x[2] - x[0]) * (x[3] - x[1]) * x[4])
709
+ self.tracked_box = best_box
710
+ self.last_frame = frame_idx
711
+ return best_box
712
+
713
+
714
+ class AudioGenerator:
715
+ """Generate footstep audio"""
716
+
717
+ def __init__(self, sample_rate=44100):
718
+ self.sample_rate = sample_rate
719
+
720
+ def generate_footstep(self, aud_path):
721
+ arr, rate = extract_second_audio_librosa(
722
+ file_path=aud_path,
723
+ target_second=5,
724
+ sample_rate=self.sample_rate
725
+ )
726
+ return arr
727
+
728
+ def create_audio_track(self, events, aud_path, duration=0.3):
729
+ total_samples = int(duration * self.sample_rate)
730
+ audio_track = np.zeros(total_samples, dtype=np.float32)
731
+
732
+ for i, event in enumerate(events):
733
+ step_sound = self.generate_footstep(aud_path)
734
+ pitch_shift = 1.0 + (i % 5 - 2) * 0.03
735
+ indices = np.arange(len(step_sound)) * pitch_shift
736
+ indices = np.clip(indices, 0, len(step_sound) - 1).astype(int)
737
+ step_sound = step_sound[indices]
738
+
739
+ start_sample = int(event['time_seconds'] * self.sample_rate)
740
+ end_sample = min(start_sample + len(step_sound), total_samples)
741
+ sound_len = end_sample - start_sample
742
+
743
+ if sound_len > 0:
744
+ audio_track[start_sample:end_sample] += step_sound[:sound_len]
745
+
746
+ max_val = np.max(np.abs(audio_track))
747
+ if max_val > 0:
748
+ audio_track = audio_track / max_val * 0.8
749
+
750
+ return audio_track
751
+
752
+
753
+ def create_annotated_video(input_path, events, output_path, use_hybrid=True, progress_callback=None):
754
+ """Create annotated video with hybrid detection visualization"""
755
+
756
+ try:
757
+ cap = cv2.VideoCapture(str(input_path))
758
+ if not cap.isOpened():
759
+ st.error("❌ Could not open input video file")
760
+ return False
761
+
762
+ fps = cap.get(cv2.CAP_PROP_FPS)
763
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
764
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
765
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
766
+
767
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
768
+ out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))
769
+
770
+ if not out.isOpened():
771
+ st.error("❌ Could not create output video file")
772
+ cap.release()
773
+ return False
774
+
775
+ event_frames = {e['frame']: e for e in events}
776
+
777
+ if use_hybrid:
778
+ yolo_model = YOLO('yolov8n.pt')
779
+ mp_pose = mp.solutions.pose
780
+ pose = mp_pose.Pose(
781
+ static_image_mode=False,
782
+ model_complexity=1,
783
+ smooth_landmarks=True,
784
+ min_detection_confidence=0.5,
785
+ min_tracking_confidence=0.5
786
+ )
787
+ else:
788
+ yolo_model = None
789
+ mp_pose = mp.solutions.pose
790
+ pose = mp_pose.Pose(
791
+ static_image_mode=False,
792
+ model_complexity=1,
793
+ smooth_landmarks=True,
794
+ min_detection_confidence=0.5,
795
+ min_tracking_confidence=0.5
796
+ )
797
+
798
+ frame_idx = 0
799
+
800
+ while cap.isOpened():
801
+ ret, frame = cap.read()
802
+ if not ret:
803
+ break
804
+
805
+ try:
806
+ if use_hybrid and yolo_model:
807
+ results = yolo_model(frame, conf=0.5, classes=[0], verbose=False)
808
+ for result in results:
809
+ boxes = result.boxes
810
+ for box in boxes:
811
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
812
+ conf = box.conf[0].cpu().numpy()
813
+
814
+ cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)),
815
+ (255, 255, 0), 2)
816
+ cv2.putText(frame, f'YOLO: {conf:.2f}',
817
+ (int(x1), int(y1) - 10),
818
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2)
819
+
820
+ results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
821
+
822
+ if results.pose_landmarks:
823
+ mp.solutions.drawing_utils.draw_landmarks(
824
+ frame,
825
+ results.pose_landmarks,
826
+ mp_pose.POSE_CONNECTIONS,
827
+ landmark_drawing_spec=mp.solutions.drawing_utils.DrawingSpec(
828
+ color=(0, 255, 0), thickness=2, circle_radius=2
829
+ ),
830
+ connection_drawing_spec=mp.solutions.drawing_utils.DrawingSpec(
831
+ color=(255, 255, 255), thickness=2
832
+ )
833
+ )
834
+
835
+ if frame_idx in event_frames:
836
+ event = event_frames[frame_idx]
837
+
838
+ banner_height = 100
839
+ cv2.rectangle(frame, (0, 0), (width, banner_height), (0, 0, 0), -1)
840
+
841
+ text = f"{event['foot']} HEEL STRIKE"
842
+ color = (0, 255, 0) if event['foot'] == 'LEFT' else (0, 100, 255)
843
+
844
+ cv2.putText(frame, text, (50, 50),
845
+ cv2.FONT_HERSHEY_SIMPLEX, 1.5, color, 3)
846
+
847
+ if 'confidence' in event:
848
+ conf_text = f"Conf: {event['confidence']:.2f}"
849
+ cv2.putText(frame, conf_text, (50, 85),
850
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
851
+
852
+ circle_x = 50 if event['foot'] == 'LEFT' else width - 50
853
+ cv2.circle(frame, (circle_x, height - 100), 40, color, -1)
854
+
855
+ if use_hybrid:
856
+ cv2.rectangle(frame, (width - 250, 10), (width - 10, 50), (102, 126, 234), -1)
857
+ cv2.putText(frame, "HYBRID MODE", (width - 240, 35),
858
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
859
+
860
+ time_seconds = frame_idx / fps
861
+ hours = int(time_seconds // 3600)
862
+ minutes = int((time_seconds % 3600) // 60)
863
+ seconds = int(time_seconds % 60)
864
+ frame_num = int((time_seconds * fps) % fps)
865
+ timecode = f"TC: {hours:02d}:{minutes:02d}:{seconds:02d}:{frame_num:02d}"
866
+
867
+ cv2.rectangle(frame, (0, height - 80), (400, height), (0, 0, 0), -1)
868
+ cv2.putText(frame, timecode, (10, height - 30),
869
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
870
+ cv2.putText(frame, f"Frame: {frame_idx}/{total_frames}", (10, height - 55),
871
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
872
+
873
+ out.write(frame)
874
+ frame_idx += 1
875
+
876
+ if progress_callback and frame_idx % 5 == 0:
877
+ progress = min(frame_idx / total_frames, 1.0)
878
+ progress_callback(progress)
879
+
880
+ except Exception as e:
881
+ st.warning(f"⚠️ Error processing frame {frame_idx}: {str(e)}")
882
+ frame_idx += 1
883
+ continue
884
+
885
+ cap.release()
886
+ out.release()
887
+ pose.close()
888
+
889
+ return True
890
+
891
+ except Exception as e:
892
+ st.error(f"❌ Video annotation failed: {str(e)}")
893
+ try:
894
+ cap.release()
895
+ out.release()
896
+ pose.close()
897
+ except:
898
+ pass
899
+ return False
900
+
901
+
902
+ def merge_audio_with_video(video_path, audio_track, sample_rate, output_path):
903
+ """Merge audio with video using FFmpeg"""
904
+
905
+ temp_audio = tempfile.mktemp(suffix='.wav')
906
+ sf.write(temp_audio, audio_track, sample_rate)
907
+
908
+ ffmpeg_cmd = FFMPEG_PATH if FFMPEG_PATH else "ffmpeg"
909
+
910
+ cmd = [
911
+ ffmpeg_cmd, '-y',
912
+ '-i', str(video_path),
913
+ '-i', temp_audio,
914
+ '-map', '0:v', '-map', '1:a',
915
+ '-c:v', 'libx264', '-preset', 'medium',
916
+ '-c:a', 'aac', '-b:a', '192k',
917
+ '-shortest',
918
+ str(output_path)
919
+ ]
920
+
921
+ try:
922
+ if FFMPEG_PATH is None:
923
+ st.warning("FFmpeg not found. Using fallback method.")
924
+ return None
925
+
926
+ result = subprocess.run(cmd, check=True, capture_output=True, text=True, timeout=300)  # libx264 re-encoding can take well over 30 s
927
+ return True
928
+
929
+ except subprocess.CalledProcessError as e:
930
+ st.error(f"FFmpeg error: {e.stderr}")
931
+ return False
932
+ except subprocess.TimeoutExpired:
933
+ st.error("FFmpeg timed out")
934
+ return False
935
+ finally:
936
+ if os.path.exists(temp_audio):
937
+ os.remove(temp_audio)
938
+
939
+
940
+ def live_streaming_mode():
941
+ """Live streaming mode with frame capture and real-time detection"""
942
+
943
+ st.markdown('<h2>πŸ“Ή Live Streaming Mode</h2>', unsafe_allow_html=True)
944
+ st.info("πŸŽ₯ This mode allows real-time footstep detection with your device camera")
945
+
946
+ # Initialize session state
947
+ if 'floor_frame_captured' not in st.session_state:
948
+ st.session_state.floor_frame_captured = False
949
+ if 'audio_downloaded' not in st.session_state:
950
+ st.session_state.audio_downloaded = False
951
+ if 'live_audio_path' not in st.session_state:
952
+ st.session_state.live_audio_path = None
953
+ if 'live_detector' not in st.session_state:
954
+ st.session_state.live_detector = None
955
+ if 'camera_active' not in st.session_state:
956
+ st.session_state.camera_active = False
957
+
958
+ # Step 1: Capture floor frame
959
+ st.markdown("### Step 1: Capture Floor Frame πŸ“Έ")
960
+ st.write("Capture a single frame showing the floor surface for audio analysis")
961
+
962
+ col1, col2 = st.columns([2, 1])
963
+
964
+ with col1:
965
+ # Camera input for frame capture
966
+ camera_image = st.camera_input("Capture floor image", key="floor_capture")
967
+
968
+ if camera_image is not None and not st.session_state.floor_frame_captured:
969
+ # Save captured frame
970
+ image = Image.open(camera_image)
971
+ temp_frame_path = tempfile.mktemp(suffix='.jpg')
972
+ image.save(temp_frame_path)
973
+ st.session_state.floor_frame_path = temp_frame_path
974
+
975
+ # Display captured frame
976
+ st.image(image, caption="Captured Floor Frame", use_container_width=True)
977
+
978
+ if st.button("βœ… Confirm Floor Capture", type="primary", use_container_width=True):
979
+ st.session_state.floor_frame_captured = True
980
+ st.success("βœ… Floor frame captured successfully!")
981
+ st.rerun()
982
+
983
+ with col2:
984
+ if st.session_state.floor_frame_captured:
985
+ st.markdown('<div class="success-box">βœ… Floor Captured</div>', unsafe_allow_html=True)
986
+ else:
987
+ st.info("πŸ“Έ Capture floor frame to proceed")
988
+
989
+ # Step 2: Analyze and download audio
990
+ if st.session_state.floor_frame_captured and not st.session_state.audio_downloaded:
991
+ st.markdown("---")
992
+ st.markdown("### Step 2: Analyze Floor & Download Audio πŸ”Š")
993
+
994
+ col1, col2 = st.columns([2, 1])
995
+
996
+ with col1:
997
+ if st.button("πŸ” Analyze Floor & Generate Audio", type="primary", use_container_width=True):
998
+ with st.spinner("πŸ”„ Analyzing floor surface and generating audio..."):
999
+ try:
1000
+ # Create temporary video from frame for processing
1001
+ temp_video = tempfile.mktemp(suffix='.mp4')
1002
+
1003
+ # Create 1-second video from the captured frame
1004
+ img = cv2.imread(st.session_state.floor_frame_path)
1005
+ height, width = img.shape[:2]
1006
+
1007
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
1008
+ out = cv2.VideoWriter(temp_video, fourcc, 30, (width, height))
1009
+
1010
+ # Write 30 frames (1 second at 30fps)
1011
+ for _ in range(30):
1012
+ out.write(img)
1013
+ out.release()
1014
+
1015
+ # Process video for footstep audio
1016
+ st.info("🎡 Generating footstep audio based on floor analysis...")
1017
+ aud_name = process_video_for_footstep_audio(temp_video)
1018
+ aud_dict = main_sound(aud_name)
1019
+ aud_path = aud_dict['default'].replace(".%(ext)s", ".mp3")
1020
+
1021
+ st.session_state.live_audio_path = aud_path
1022
+ st.session_state.audio_downloaded = True
1023
+
1024
+ # Clean up temp video
1025
+ if os.path.exists(temp_video):
1026
+ os.remove(temp_video)
1027
+
1028
+ st.success("βœ… Audio generated successfully!")
1029
+ st.balloons()
1030
+ st.rerun()
1031
+
1032
+ except Exception as e:
1033
+ st.error(f"❌ Error generating audio: {str(e)}")
1034
+
1035
+ with col2:
1036
+ st.info("🎡 Audio will be generated based on floor type")
1037
+
1038
+ # Step 3: Initialize live detector
1039
+ if st.session_state.audio_downloaded and st.session_state.live_detector is None:
1040
+ st.markdown("---")
1041
+ st.markdown("### Step 3: Initialize Live Detection πŸš€")
1042
+
1043
+ col1, col2 = st.columns([2, 1])
1044
+
1045
+ with col1:
1046
+ sensitivity = st.select_slider(
1047
+ "Detection Sensitivity",
1048
+ options=['low', 'medium', 'high'],
1049
+ value='medium'
1050
+ )
1051
+
1052
+ yolo_conf = st.slider(
1053
+ "YOLO Confidence",
1054
+ min_value=0.1,
1055
+ max_value=0.9,
1056
+ value=0.5,
1057
+ step=0.05
1058
+ )
1059
+
1060
+ if st.button("🎬 Initialize Live Detector", type="primary", use_container_width=True):
1061
+ with st.spinner("βš™οΈ Initializing detector..."):
1062
+ try:
1063
+ detector = LiveFootstepDetector(
1064
+ audio_path=st.session_state.live_audio_path,
1065
+ sensitivity=sensitivity,
1066
+ yolo_conf=yolo_conf
1067
+ )
1068
+ st.session_state.live_detector = detector
1069
+ st.success("βœ… Live detector initialized!")
1070
+ st.rerun()
1071
+ except Exception as e:
1072
+ st.error(f"❌ Failed to initialize detector: {str(e)}")
1073
+
1074
+ with col2:
1075
+ st.info("πŸ€– Configure detection parameters")
1076
+
1077
+ # Step 4: Start live detection
1078
+ if st.session_state.live_detector is not None:
1079
+ st.markdown("---")
1080
+ st.markdown('<div class="ready-badge">βœ… SYSTEM READY</div>', unsafe_allow_html=True)
1081
+ st.markdown("### Step 4: Live Detection 🎯")
1082
+
1083
+ col1, col2 = st.columns([3, 1])
1084
+
1085
+ with col1:
1086
+ st.write("πŸ“Ή **Camera is ready for live footstep detection**")
1087
+ st.write("🚢 Walk in front of the camera and hear footsteps in real-time!")
1088
+
1089
+ # Start/Stop controls
1090
+ col_a, col_b = st.columns(2)
1091
+
1092
+ with col_a:
1093
+ if not st.session_state.camera_active:
1094
+ if st.button("▢️ Start Live Detection", type="primary", use_container_width=True):
1095
+ st.session_state.camera_active = True
1096
+ st.session_state.live_detector.start()
1097
+ st.rerun()
1098
+
1099
+ with col_b:
1100
+ if st.session_state.camera_active:
1101
+ if st.button("⏹️ Stop Detection", type="secondary", use_container_width=True):
1102
+ st.session_state.camera_active = False
1103
+ st.session_state.live_detector.stop()
1104
+ st.rerun()
1105
+
1106
+ with col2:
1107
+ if st.session_state.camera_active:
1108
+ st.markdown('<div class="live-indicator">πŸ”΄ LIVE</div>', unsafe_allow_html=True)
1109
+ else:
1110
+ st.info("⏸️ Paused")
1111
+
1112
+ # Live video feed
1113
+ if st.session_state.camera_active:
1114
+ st.markdown("---")
1115
+
1116
+ FRAME_WINDOW = st.image([])
1117
+
1118
+ cap = cv2.VideoCapture(0)
1119
+
1120
+ if not cap.isOpened():
1121
+ st.error("❌ Cannot access camera. Please check permissions.")
1122
+ st.session_state.camera_active = False
1123
+ else:
1124
+ st.info("πŸ“Ή Live feed active - Walk to generate footsteps!")
1125
+
1126
+ # Statistics
1127
+ step_counter = st.empty()
1128
+ left_steps = 0
1129
+ right_steps = 0
1130
+
1131
+ try:
1132
+ while st.session_state.camera_active:
1133
+ ret, frame = cap.read()
1134
+
1135
+ if not ret:
1136
+ st.error("❌ Failed to read from camera")
1137
+ break
1138
+
1139
+ # Process frame
1140
+ processed_frame, detected_foot = st.session_state.live_detector.process_frame(frame)
1141
+
1142
+ # Update counters
1143
+ if detected_foot == 'LEFT':
1144
+ left_steps += 1
1145
+ elif detected_foot == 'RIGHT':
1146
+ right_steps += 1
1147
+
1148
+ # Display frame
1149
+ FRAME_WINDOW.image(cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB))
1150
+
1151
+ # Update statistics
1152
+ step_counter.metric("Total Steps Detected", left_steps + right_steps,
1153
+ f"L: {left_steps} | R: {right_steps}")
1154
+
1155
+ # Check if user stopped
1156
+ if not st.session_state.camera_active:
1157
+ break
1158
+
1159
+ time.sleep(0.033) # ~30 FPS
1160
+
1161
+ except Exception as e:
1162
+ st.error(f"❌ Error during live detection: {str(e)}")
1163
+
1164
+ finally:
1165
+ cap.release()
1166
+ st.session_state.live_detector.stop()
1167
+
1168
+ # Reset button
1169
+ st.markdown("---")
1170
+ if st.button("πŸ”„ Reset All", use_container_width=True):
1171
+ st.session_state.floor_frame_captured = False
1172
+ st.session_state.audio_downloaded = False
1173
+ st.session_state.live_audio_path = None
1174
+ st.session_state.live_detector = None
1175
+ st.session_state.camera_active = False
1176
+ st.rerun()
1177
+
1178
+
1179
+ def video_upload_mode():
1180
+ """Original video upload mode"""
1181
+
1182
+ st.markdown('<h2>πŸ“€ Video Upload Mode</h2>', unsafe_allow_html=True)
1183
+
1184
+ # Sidebar configuration
1185
+ sensitivity = st.sidebar.select_slider(
1186
+ "Footstep Sensitivity",
1187
+ options=['low', 'medium', 'high'],
1188
+ value='medium',
1189
+ help="Higher sensitivity detects more subtle footsteps"
1190
+ )
1191
+
1192
+ yolo_conf = st.sidebar.slider(
1193
+ "YOLO Confidence",
1194
+ min_value=0.1,
1195
+ max_value=0.9,
1196
+ value=0.5,
1197
+ step=0.05,
1198
+ help="Confidence threshold for YOLO person detection"
1199
+ )
1200
+
1201
+ surface_type = st.sidebar.selectbox(
1202
+ "Surface Type",
1203
+ ['concrete', 'wood', 'grass', 'gravel', 'metal'],
1204
+ help="Select surface for audio generation"
1205
+ )
1206
+
1207
+ use_hybrid = st.sidebar.checkbox(
1208
+ "Enable Hybrid Mode",
1209
+ value=True,
1210
+ help="Use YOLO for person detection + MediaPipe for pose estimation"
1211
+ )
1212
+
1213
+ create_annotated = st.sidebar.checkbox("Create Annotated Video", value=True)
1214
+ add_audio = st.sidebar.checkbox("Add Footstep Audio", value=True)
1215
+
1216
+ # File uploader
1217
+ uploaded_file = st.file_uploader(
1218
+ "πŸ“€ Upload Video File",
1219
+ type=['mp4', 'avi', 'mov', 'mkv'],
1220
+ help="Upload a video file to detect footsteps"
1221
+ )
1222
+
1223
+ if uploaded_file:
1224
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp_file:
1225
+ tmp_file.write(uploaded_file.read())
1226
+ video_path = tmp_file.name
1227
+
1228
+ col1, col2 = st.columns([2, 1])
1229
+
1230
+ with col1:
1231
+ st.subheader("πŸ“Ή Input Video")
1232
+ st.video(video_path)
1233
+
1234
+ with col2:
1235
+ st.subheader("ℹ️ Video Info")
1236
+ cap = cv2.VideoCapture(video_path)
1237
+ video_info = {
1238
+ "Duration": f"{cap.get(cv2.CAP_PROP_FRAME_COUNT) / cap.get(cv2.CAP_PROP_FPS):.2f}s",
1239
+ "FPS": f"{cap.get(cv2.CAP_PROP_FPS):.2f}",
1240
+ "Resolution": f"{int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))}x{int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))}",
1241
+ "Frames": int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
1242
+ }
1243
+ cap.release()
1244
+
1245
+ for key, value in video_info.items():
1246
+ st.metric(key, value)
1247
+
1248
+ if use_hybrid:
1249
+ st.success("πŸ€– Hybrid Mode Active")
1250
+ else:
1251
+ st.info("πŸ“Š MediaPipe Only")
1252
+
1253
+ st.markdown("---")
1254
+
1255
+ if st.button("πŸš€ Process Video", type="primary", use_container_width=True):
1256
+
1257
+ if use_hybrid:
1258
+ st.info("πŸ”„ Running Hybrid YOLO-MediaPipe Pipeline...")
1259
+ pipeline = HybridFootstepDetectionPipeline(
1260
+ fps=float(video_info["FPS"]),
1261
+ sensitivity=sensitivity,
1262
+ yolo_conf=yolo_conf
1263
+ )
1264
+ else:
1265
+ st.info("πŸ”„ Running MediaPipe-Only Pipeline...")
1266
+ pipeline = HybridFootstepDetectionPipeline(
1267
+ fps=float(video_info["FPS"]),
1268
+ sensitivity=sensitivity,
1269
+ yolo_conf=yolo_conf
1270
+ )
1271
+
1272
+ with st.spinner("πŸ” Detecting footsteps..."):
1273
+ progress_bar = st.progress(0)
1274
+ status_text = st.empty()
1275
+
1276
+ def update_progress(val):
1277
+ progress_bar.progress(val)
1278
+ status_text.text(f"Processing: {int(val * 100)}%")
1279
+
1280
+ results = pipeline.process_video(video_path, update_progress)
1281
+ st.session_state['results'] = results
1282
+ st.session_state['video_path'] = video_path
1283
+ st.session_state['use_hybrid'] = use_hybrid
1284
+
1285
+ progress_bar.empty()
1286
+ status_text.empty()
1287
+
1288
+ if results:
1289
+ st.markdown('<div class="success-box">βœ… Footstep detection complete!</div>',
1290
+ unsafe_allow_html=True)
1291
+ st.success(f"Detected **{len(results['events'])}** footstep events")
1292
+
1293
+ if 'detection_stats' in results:
1294
+ stats = results['detection_stats']
1295
+ col1, col2, col3 = st.columns(3)
1296
+ col1.metric("YOLO Detections",
1297
+ f"{stats['yolo_detections']}/{stats['total_frames']}")
1298
+ col2.metric("Pose Detections",
1299
+ f"{stats['pose_detections']}/{stats['total_frames']}")
1300
+ col3.metric("Success Rate",
1301
+ f"{stats['pose_detections'] / stats['total_frames'] * 100:.1f}%")
1302
+
1303
+ # Display results (existing code continues...)
1304
+ if 'results' in st.session_state:
1305
+ results = st.session_state['results']
1306
+
1307
+ st.markdown("---")
1308
+ st.subheader("πŸ“Š Detection Results")
1309
+
1310
+ col1, col2, col3, col4 = st.columns(4)
1311
+
1312
+ left_count = len([e for e in results['events'] if e['foot'] == 'LEFT'])
1313
+ right_count = len([e for e in results['events'] if e['foot'] == 'RIGHT'])
1314
+ avg_cadence = len(results['events']) / (results['total_frames'] / results['fps']) * 60
1315
+ avg_conf = np.mean([e.get('confidence', 0.5) for e in results['events']])
1316
+
1317
+ col1.metric("Total Events", len(results['events']))
1318
+ col2.metric("Left Foot", left_count)
1319
+ col3.metric("Right Foot", right_count)
1320
+ col4.metric("Avg Confidence", f"{avg_conf:.2f}")
1321
+
1322
+ st.metric("Average Cadence", f"{avg_cadence:.1f} steps/min")
1323
+
1324
+ st.subheader("πŸ“‹ Detected Events")
1325
+ events_df = pd.DataFrame(results['events'])
1326
+
1327
+ if not events_df.empty:
1328
+ st.dataframe(
1329
+ events_df.style.apply(
1330
+ lambda x: ['background-color: #e8f5e9' if x.foot == 'LEFT'
1331
+ else 'background-color: #fff3e0' for _ in x],
1332
+ axis=1
1333
+ ),
1334
+ use_container_width=True,
1335
+ height=300
1336
+ )
1337
+
1338
+ st.subheader("πŸ’Ύ Export Options")
1339
+
1340
+ col1, col2, col3 = st.columns(3)
1341
+
1342
+ with col1:
1343
+ csv = events_df.to_csv(index=False)
1344
+ st.download_button(
1345
+ "πŸ“„ Download CSV",
1346
+ csv,
1347
+ f"footsteps_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
1348
+ "text/csv",
1349
+ use_container_width=True
1350
+ )
1351
+
1352
+ with col2:
1353
+ json_data = json.dumps(results['events'], indent=2)
1354
+ st.download_button(
1355
+ "πŸ“‹ Download JSON",
1356
+ json_data,
1357
+ f"footsteps_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
1358
+ "application/json",
1359
+ use_container_width=True
1360
+ )
1361
+
1362
+ with col3:
1363
+ timecode_text = "\n".join([
1364
+ f"{e['timecode']}\t{e['foot']}\t{e['event']}\t{e.get('confidence', 0.5):.2f}"
1365
+ for e in results['events']
1366
+ ])
1367
+ st.download_button(
1368
+ "⏱️ Download Timecode",
1369
+ timecode_text,
1370
+ f"timecode_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
1371
+ "text/plain",
1372
+ use_container_width=True
1373
+ )
1374
+
1375
+ st.markdown("---")
1376
+ st.subheader("πŸŽ₯ Generate Output Video")
1377
+
1378
+ col1, col2 = st.columns(2)
1379
+
1380
+ with col1:
1381
+ if create_annotated and st.button("Create Annotated Video", use_container_width=True):
1382
+ with st.spinner("Creating annotated video..."):
1383
+ annotated_path = tempfile.mktemp(suffix='_annotated.mp4')
1384
+ progress_bar = st.progress(0)
1385
+
1386
+ success = create_annotated_video(
1387
+ st.session_state['video_path'],
1388
+ results['events'],
1389
+ annotated_path,
1390
+ use_hybrid=st.session_state.get('use_hybrid', False),
1391
+ progress_callback=lambda v: progress_bar.progress(v)
1392
+ )
1393
+
1394
+ if success:
1395
+ st.session_state['annotated_video'] = annotated_path
1396
+ progress_bar.empty()
1397
+ st.success("βœ… Annotated video ready!")
1398
+ else:
1399
+ st.error("❌ Failed to create annotated video")
1400
+
1401
+ with col2:
1402
+ if add_audio and st.button("Generate with Audio", use_container_width=True):
1403
+ with st.spinner("Generating audio and merging..."):
1404
+ audio_gen = AudioGenerator()
1405
+ aud_name = process_video_for_footstep_audio(str(st.session_state['video_path']))
1406
+ aud_path = main_sound(aud_name)
1407
+ aud_path = aud_path['default'].replace(".%(ext)s", ".mp3")
1408
+ duration = results['total_frames'] / results['fps']
1409
+ audio_track = audio_gen.create_audio_track(
1410
+ results['events'],
1411
+ aud_path,
1412
+ duration
1413
+ )
1414
+
1415
+ temp_video = tempfile.mktemp(suffix='_temp.mp4')
1416
+ progress_bar = st.progress(0)
1417
+
1418
+ create_annotated_video(
1419
+ st.session_state['video_path'],
1420
+ results['events'],
1421
+ temp_video,
1422
+ use_hybrid=st.session_state.get('use_hybrid', False),
1423
+ progress_callback=lambda v: progress_bar.progress(v * 0.7)
1424
+ )
1425
+
1426
+ final_output = tempfile.mktemp(suffix='_final.mp4')
1427
+ success = merge_audio_with_video(
1428
+ temp_video,
1429
+ audio_track,
1430
+ 44100,
1431
+ final_output
1432
+ )
1433
+
1434
+ progress_bar.progress(1.0)
1435
+ progress_bar.empty()
1436
+
1437
+ if success:
1438
+ st.session_state['final_video'] = final_output
1439
+ st.success("βœ… Video with audio ready!")
1440
+ else:
1441
+ st.error("❌ Failed to merge audio")
1442
+
1443
+ if 'annotated_video' in st.session_state:
1444
+ st.markdown("---")
1445
+ st.subheader("πŸ“Ί Annotated Video")
1446
+ st.video(st.session_state['annotated_video'])
1447
+
1448
+ with open(st.session_state['annotated_video'], 'rb') as f:
1449
+ st.download_button(
1450
+ "πŸ“₯ Download Annotated Video",
1451
+ f,
1452
+ f"annotated_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4",
1453
+ "video/mp4",
1454
+ use_container_width=True
1455
+ )
1456
+
1457
+ if 'final_video' in st.session_state:
1458
+ st.markdown("---")
1459
+ st.subheader("πŸ”Š Final Video with Audio")
1460
+ st.video(st.session_state['final_video'])
1461
+
1462
+ with open(st.session_state['final_video'], 'rb') as f:
1463
+ st.download_button(
1464
+ "πŸ“₯ Download Final Video",
1465
+ f,
1466
+ f"final_with_audio_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4",
1467
+ "video/mp4",
1468
+ use_container_width=True
1469
+ )
1470
+
1471
+
1472
+ def main():
1473
+ st.markdown('<h1 class="main-header">🎬 Hybrid YOLO-MediaPipe Footstep Detection</h1>',
1474
+ unsafe_allow_html=True)
1475
+ st.markdown('<div class="hybrid-badge">🚀 YOLO Person Detection + MediaPipe Pose Estimation</div>',
1476
+ unsafe_allow_html=True)
1477
+ st.markdown("### Advanced AI-Powered Foley Tool with Dual-Stage Detection Pipeline")
1478
+
1479
+ # Mode selection
1480
+ st.markdown("---")
1481
+ st.markdown("## 🎯 Select Mode")
1482
+
1483
+ col1, col2 = st.columns(2)
1484
+
1485
+ with col1:
1486
+ if st.button("πŸ“€ Video Upload Mode", use_container_width=True, type="primary"):
1487
+ st.session_state.mode = 'upload'
1488
+
1489
+ with col2:
1490
+ if st.button("πŸ“Ή Live Streaming Mode", use_container_width=True, type="primary"):
1491
+ st.session_state.mode = 'live'
1492
+
1493
+ # Initialize mode
1494
+ if 'mode' not in st.session_state:
1495
+ st.session_state.mode = 'upload'
1496
+
1497
+ st.markdown("---")
1498
+
1499
+ # Display selected mode
1500
+ if st.session_state.mode == 'upload':
1501
+ video_upload_mode()
1502
+ else:
1503
+ live_streaming_mode()
1504
+
1505
+ # Sidebar info
1506
+ with st.sidebar:
1507
+ st.markdown("---")
1508
+ st.markdown(f"### 🎯 Current Mode: **{st.session_state.mode.upper()}**")
1509
+
1510
+ if st.session_state.mode == 'live':
1511
+ st.markdown("---")
1512
+ st.markdown("### πŸ“Ή Live Mode Guide")
1513
+ st.markdown("""
1514
+ **Steps:**
1515
+ 1. πŸ“Έ **Capture Floor Frame**
1516
+ - Point camera at floor
1517
+ - Capture clear image
1518
+
1519
+ 2. πŸ”Š **Generate Audio**
1520
+ - AI analyzes floor type
1521
+ - Downloads matching sound
1522
+
1523
+ 3. βœ… **System Ready**
1524
+ - Real-time detection active
1525
+ - Walk and hear footsteps!
1526
+
1527
+ **Tips:**
1528
+ - Good lighting needed
1529
+ - Clear floor view
1530
+ - Stand 2-3 meters away
1531
+ - Walk naturally
1532
+ """)
1533
+
1534
+ st.markdown("---")
1535
+ st.markdown("### πŸ€– Hybrid Pipeline")
1536
+ st.markdown("""
1537
+ **Stage 1: YOLO Detection**
1538
+ - Detects person in frame
1539
+ - Provides bounding box
1540
+ - Tracks across frames
1541
+
1542
+ **Stage 2: MediaPipe Pose**
1543
+ - Estimates pose on detected region
1544
+ - Extracts heel landmarks
1545
+ - Higher accuracy & speed
1546
+
1547
+ **Benefits:**
1548
+ - βœ… More robust detection
1549
+ - βœ… Better occlusion handling
1550
+ - βœ… Faster processing
1551
+ - βœ… Improved accuracy
1552
+ """)
1553
+
1554
+ st.markdown("---")
1555
+ st.markdown("### ℹ️ System Info")
1556
+ st.markdown("""
1557
+ **Detection Engines:**
1558
+ - YOLOv8 (Person Detection)
1559
+ - MediaPipe Pose v2 (Pose Estimation)
1560
+
1561
+ **Features:**
1562
+ - Dual-stage AI pipeline
1563
+ - Person tracking
1564
+ - Frame-accurate timing
1565
+ - Confidence scoring
1566
+ - Real-time live detection
1567
+ - Autonomous audio generation
1568
+ """)
1569
+
1570
+
1571
+ if __name__ == "__main__":
1572
+ main()
reel.py ADDED
@@ -0,0 +1,1573 @@
1
+ '''aud_name = process_video_for_footstep_audio(temp_video)
2
+ aud_dict = main_sound(aud_name)
3
+ aud_path = aud_dict['default'].replace(".%(ext)s", ".mp3")'''
4
+
5
+ import pandas as pd
6
+ import streamlit as st
7
+ import cv2
8
+ import numpy as np
9
+ import mediapipe as mp
10
+ from pathlib import Path
11
+ from scipy.signal import find_peaks, savgol_filter
12
+ import json
13
+ import subprocess
14
+ import os
15
+ import soundfile as sf
16
+ from datetime import datetime
17
+ import tempfile
18
+ from ultralytics import YOLO
19
+ from agent import process_video_for_footstep_audio
20
+ from sound_agent import main_sound
21
+ from qsec import extract_second_audio_librosa
22
+ import threading
23
+ import queue
24
+ import time
25
+ from PIL import Image
26
+ import io
27
+
28
+ # Suppress TensorFlow warnings
29
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
30
+ import absl.logging
31
+
32
+ absl.logging.set_verbosity(absl.logging.ERROR)
33
+
34
+
35
+ def get_ffmpeg_path():
36
+ """Get FFmpeg path with multiple fallback options"""
37
+ possible_paths = [
38
+ "ffmpeg", # Try system ffmpeg first (Docker/Linux)
39
+ r"C:\Users\abhiv\OneDrive\Desktop\agentic ai\SoundFeet\ffmpeg-7.1-essentials_build\bin\ffmpeg.exe", # Local Windows
40
+ "./ffmpeg-7.1-essentials_build/bin/ffmpeg.exe", # Relative path
41
+ ]
42
+
43
+ for path in possible_paths:
44
+ if path == "ffmpeg":
45
+ try:
46
+ result = subprocess.run([path, '-version'], capture_output=True, timeout=5)
47
+ if result.returncode == 0:
48
+ return path
49
+ except Exception:
50
+ continue
51
+ else:
52
+ if os.path.exists(path):
53
+ return path
54
+ return None
55
+
56
+
57
+ FFMPEG_PATH = get_ffmpeg_path()
58
+
59
+ # Streamlit Configuration
60
+ st.set_page_config(
61
+ page_title="Hybrid YOLO-MediaPipe Footstep Detection",
62
+ page_icon="🎬",
63
+ layout="wide",
64
+ initial_sidebar_state="expanded"
65
+ )
66
+
67
+ st.markdown("""
68
+ <style>
69
+ .main-header {
70
+ font-size: 2.5rem;
71
+ font-weight: 700;
72
+ background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
73
+ -webkit-background-clip: text;
74
+ -webkit-text-fill-color: transparent;
75
+ margin-bottom: 2rem;
76
+ }
77
+ .metric-card {
78
+ background: #f0f2f6;
79
+ padding: 1rem;
80
+ border-radius: 0.5rem;
81
+ border-left: 4px solid #667eea;
82
+ }
83
+ .success-box {
84
+ padding: 1rem;
85
+ background: #d4edda;
86
+ border: 1px solid #c3e6cb;
87
+ border-radius: 0.5rem;
88
+ color: #155724;
89
+ }
90
+ .hybrid-badge {
91
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
92
+ color: white;
93
+ padding: 0.5rem 1rem;
94
+ border-radius: 20px;
95
+ display: inline-block;
96
+ font-weight: 600;
97
+ margin: 1rem 0;
98
+ }
99
+ .live-indicator {
100
+ background: #dc3545;
101
+ color: white;
102
+ padding: 0.5rem 1rem;
103
+ border-radius: 20px;
104
+ display: inline-block;
105
+ font-weight: 600;
106
+ animation: pulse 1.5s infinite;
107
+ }
108
+ @keyframes pulse {
109
+ 0%, 100% { opacity: 1; }
110
+ 50% { opacity: 0.5; }
111
+ }
112
+ .ready-badge {
113
+ background: #28a745;
114
+ color: white;
115
+ padding: 0.5rem 1rem;
116
+ border-radius: 20px;
117
+ display: inline-block;
118
+ font-weight: 600;
119
+ }
120
+ </style>
121
+ """, unsafe_allow_html=True)
122
+
123
+
124
+ class LiveFootstepDetector:
125
+ """Real-time footstep detection for live camera feed"""
126
+
127
+ def __init__(self, audio_path, sensitivity='medium', yolo_conf=0.5):
128
+ self.audio_path = audio_path
129
+ self.sensitivity = sensitivity
130
+ self.yolo_conf = yolo_conf
131
+ self.running = False
132
+ self.audio_ready = False
133
+
134
+ # Load footstep audio
135
+ try:
136
+ self.footstep_audio, self.sample_rate = extract_second_audio_librosa(
137
+ file_path=audio_path,
138
+ target_second=5,
139
+ sample_rate=44100
140
+ )
141
+ self.audio_ready = True
142
+ except Exception as e:
143
+ st.error(f"Failed to load audio: {str(e)}")
144
+ self.audio_ready = False
145
+
146
+ # Initialize detection models
147
+ try:
148
+ self.yolo_model = YOLO('yolov8n.pt')
149
+ self.mp_pose = mp.solutions.pose
150
+ self.pose = self.mp_pose.Pose(
151
+ static_image_mode=False,
152
+ model_complexity=1,
153
+ smooth_landmarks=True,
154
+ min_detection_confidence=0.5,
155
+ min_tracking_confidence=0.5
156
+ )
157
+ except Exception as e:
158
+ st.error(f"Failed to initialize models: {str(e)}")
159
+ return
160
+
161
+ # Landmark indices
162
+ self.LEFT_HEEL = 29
163
+ self.RIGHT_HEEL = 30
164
+
165
+ # Detection thresholds
166
+ self.thresholds = {
167
+ 'low': {'prominence': 0.02, 'velocity_threshold': 0.015},
168
+ 'medium': {'prominence': 0.015, 'velocity_threshold': 0.012},
169
+ 'high': {'prominence': 0.01, 'velocity_threshold': 0.010}
170
+ }[sensitivity]
171
+
172
+ # Tracking state
173
+ self.prev_left_y = None
174
+ self.prev_right_y = None
175
+ self.prev_time = None
176
+ self.left_buffer = []
177
+ self.right_buffer = []
178
+ self.buffer_size = 10
179
+
180
+ # Audio playback
181
+ self.audio_queue = queue.Queue()
182
+ self.audio_thread = None
183
+
184
+ def start_audio_playback(self):
185
+ """Start audio playback thread"""
186
+ if not self.audio_ready:
187
+ return
188
+
189
+ def play_audio():
190
+ import pyaudio
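+ # pyaudio is imported lazily and only needed for live playback; note it is not listed in requirements.txt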
191
+ p = pyaudio.PyAudio()
192
+ stream = p.open(
193
+ format=pyaudio.paFloat32,
194
+ channels=1,
195
+ rate=self.sample_rate,
196
+ output=True
197
+ )
198
+
199
+ while self.running:
200
+ try:
201
+ foot = self.audio_queue.get(timeout=0.1)
202
+ # Play footstep sound
203
+ stream.write(self.footstep_audio.astype(np.float32).tobytes())
204
+ except queue.Empty:
205
+ continue
206
+ except Exception as e:
207
+ print(f"Audio playback error: {e}")
208
+
209
+ stream.stop_stream()
210
+ stream.close()
211
+ p.terminate()
212
+
213
+ self.audio_thread = threading.Thread(target=play_audio, daemon=True)
214
+ self.audio_thread.start()
215
+
216
+ def detect_heel_strike(self, current_y, prev_y, foot_buffer):
217
+ """Detect heel strike based on vertical velocity and position"""
218
+ if prev_y is None:
219
+ return False
220
+
221
+ # Calculate vertical velocity (downward is positive)
222
+ velocity = current_y - prev_y
223
+
224
+ # Add to buffer
225
+ foot_buffer.append(current_y)
226
+ if len(foot_buffer) > self.buffer_size:
227
+ foot_buffer.pop(0)
228
+
229
+ if len(foot_buffer) < 5:
230
+ return False
231
+
232
+ # Detect strike: downward movement followed by stabilization
233
+ # Current position is low (heel on ground)
234
+ # Recent movement was downward
235
+ # Velocity is slowing (strike impact)
236
+ recent_velocities = [foot_buffer[i + 1] - foot_buffer[i]
237
+ for i in range(len(foot_buffer) - 1)]
238
+
239
+ avg_velocity = np.mean(recent_velocities[-3:]) if len(recent_velocities) >= 3 else 0
240
+
241
+ is_strike = (
242
+ current_y > 0.7 and # Heel is low in frame
243
+ velocity > self.thresholds['velocity_threshold'] and # Moving down
244
+ avg_velocity < velocity * 0.5 # Velocity decreasing (impact)
245
+ )
246
+
247
+ return is_strike
248
+
249
+ def process_frame(self, frame):
250
+ """Process single frame and detect footsteps"""
251
+ if not self.audio_ready:
252
+ return frame, None
253
+
254
+ detected_foot = None
255
+
256
+ try:
257
+ # YOLO detection
258
+ results = self.yolo_model(frame, conf=self.yolo_conf, classes=[0], verbose=False)
259
+
260
+ person_detected = False
261
+ bbox = None
262
+
263
+ for result in results:
264
+ boxes = result.boxes
265
+ if len(boxes) > 0:
266
+ person_detected = True
267
+ box = boxes[0] # Take first person
268
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
269
+ bbox = (int(x1), int(y1), int(x2), int(y2))
270
+
271
+ # Draw YOLO bbox
272
+ cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
273
+ (255, 255, 0), 2)
274
+ break
275
+
276
+ # MediaPipe pose estimation
277
+ if person_detected and bbox:
278
+ # Crop to person region with padding
279
+ x1, y1, x2, y2 = bbox
280
+ pad = 20
281
+ x1 = max(0, x1 - pad)
282
+ y1 = max(0, y1 - pad)
283
+ x2 = min(frame.shape[1], x2 + pad)
284
+ y2 = min(frame.shape[0], y2 + pad)
285
+
286
+ cropped = frame[y1:y2, x1:x2]
287
+ rgb_frame = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
288
+ pose_results = self.pose.process(rgb_frame)
289
+
290
+ if pose_results.pose_landmarks:
291
+ landmarks = pose_results.pose_landmarks.landmark
292
+
293
+ # Get heel positions (adjusted to full frame)
294
+ left_heel = landmarks[self.LEFT_HEEL]
295
+ right_heel = landmarks[self.RIGHT_HEEL]
296
+
297
+ left_y = (left_heel.y * (y2 - y1) + y1) / frame.shape[0]
298
+ right_y = (right_heel.y * (y2 - y1) + y1) / frame.shape[0]
299
+
300
+ # Detect strikes
301
+ left_strike = self.detect_heel_strike(
302
+ left_y, self.prev_left_y, self.left_buffer
303
+ )
304
+ right_strike = self.detect_heel_strike(
305
+ right_y, self.prev_right_y, self.right_buffer
306
+ )
307
+
308
+ if left_strike:
309
+ detected_foot = 'LEFT'
310
+ self.audio_queue.put('LEFT')
311
+ elif right_strike:
312
+ detected_foot = 'RIGHT'
313
+ self.audio_queue.put('RIGHT')
314
+
315
+ # Update previous positions
316
+ self.prev_left_y = left_y
317
+ self.prev_right_y = right_y
318
+
319
+ # Draw skeleton on full frame
320
+ for landmark in landmarks:
321
+ x = int((landmark.x * (x2 - x1) + x1))
322
+ y = int((landmark.y * (y2 - y1) + y1))
323
+ cv2.circle(frame, (x, y), 3, (0, 255, 0), -1)
324
+
325
+ # Highlight heels
326
+ left_heel_x = int((left_heel.x * (x2 - x1) + x1))
327
+ left_heel_y = int((left_heel.y * (y2 - y1) + y1))
328
+ right_heel_x = int((right_heel.x * (x2 - x1) + x1))
329
+ right_heel_y = int((right_heel.y * (y2 - y1) + y1))
330
+
331
+ cv2.circle(frame, (left_heel_x, left_heel_y), 8, (0, 255, 0), -1)
332
+ cv2.circle(frame, (right_heel_x, right_heel_y), 8, (0, 100, 255), -1)
333
+
334
+ if detected_foot:
335
+ # Show strike indicator
336
+ heel_x = left_heel_x if detected_foot == 'LEFT' else right_heel_x
337
+ heel_y = left_heel_y if detected_foot == 'LEFT' else right_heel_y
338
+ color = (0, 255, 0) if detected_foot == 'LEFT' else (0, 100, 255)
339
+
340
+ cv2.circle(frame, (heel_x, heel_y), 30, color, 3)
341
+ cv2.putText(frame, f"{detected_foot} STRIKE!",
342
+ (heel_x - 50, heel_y - 40),
343
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
344
+
345
+ # Draw status
346
+ status_text = "READY" if self.audio_ready else "NO AUDIO"
347
+ status_color = (0, 255, 0) if self.audio_ready else (0, 0, 255)
348
+ cv2.rectangle(frame, (10, 10), (150, 50), (0, 0, 0), -1)
349
+ cv2.putText(frame, status_text, (20, 35),
350
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, status_color, 2)
351
+
352
+ except Exception as e:
353
+ print(f"Frame processing error: {e}")
354
+
355
+ return frame, detected_foot
356
+
357
+ def start(self):
358
+ """Start the detector"""
359
+ self.running = True
360
+ self.start_audio_playback()
361
+
362
+ def stop(self):
363
+ """Stop the detector"""
364
+ self.running = False
365
+ if self.audio_thread:
366
+ self.audio_thread.join(timeout=2)
367
+
368
+
369
+ class HybridFootstepDetectionPipeline:
370
+ """
371
+ Hybrid Detection Pipeline for video files:
372
+ 1. YOLO detects person bounding boxes
373
+ 2. MediaPipe estimates pose on detected regions
374
+ 3. Track footsteps with improved accuracy
375
+ """
376
+
377
+ def __init__(self, fps=30, sensitivity='medium', yolo_conf=0.5):
378
+ self.fps = fps
379
+ self.sensitivity = sensitivity
380
+ self.yolo_conf = yolo_conf
381
+
382
+ # Initialize YOLO detector
383
+ try:
384
+ self.yolo_model = YOLO('yolov8n.pt')
385
+ st.success("βœ… YOLO detector loaded successfully")
386
+ except Exception as e:
387
+ st.warning(f"⚠️ YOLO loading issue: {str(e)}. Downloading model...")
388
+ try:
389
+ self.yolo_model = YOLO('yolov8n.pt')
390
+ st.success("βœ… YOLO detector loaded successfully")
391
+ except Exception as e2:
392
+ st.error(f"❌ Failed to load YOLO: {str(e2)}")
393
+ self.yolo_model = None
394
+
395
+ # Initialize MediaPipe pose estimator
396
+ try:
397
+ self.mp_pose = mp.solutions.pose
398
+ self.pose = self.mp_pose.Pose(
399
+ static_image_mode=False,
400
+ model_complexity=1,
401
+ smooth_landmarks=True,
402
+ min_detection_confidence=0.5,
403
+ min_tracking_confidence=0.5
404
+ )
405
+ st.success("βœ… MediaPipe pose estimator loaded successfully")
406
+ except Exception as e:
407
+ st.error(f"❌ Failed to initialize MediaPipe: {str(e)}")
408
+ self.pose = None
409
+
410
+ # Landmark indices
411
+ self.LEFT_HEEL = 29
412
+ self.RIGHT_HEEL = 30
413
+ self.LEFT_ANKLE = 27
414
+ self.RIGHT_ANKLE = 28
415
+
416
+ # Detection thresholds
417
+ self.thresholds = {
418
+ 'low': {'prominence': 0.02, 'min_interval': 0.4},
419
+ 'medium': {'prominence': 0.015, 'min_interval': 0.3},
420
+ 'high': {'prominence': 0.01, 'min_interval': 0.25}
421
+ }[sensitivity]
422
+
423
+ # Tracking state
424
+ self.person_tracker = PersonTracker()
425
+
426
+ def detect_person_yolo(self, frame):
427
+ """Detect person using YOLO"""
428
+ if self.yolo_model is None:
429
+ return []
430
+
431
+ try:
432
+ results = self.yolo_model(frame, conf=self.yolo_conf, classes=[0], verbose=False)
433
+
434
+ person_boxes = []
435
+ for result in results:
436
+ boxes = result.boxes
437
+ for box in boxes:
438
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
439
+ conf = box.conf[0].cpu().numpy()
440
+ person_boxes.append((int(x1), int(y1), int(x2), int(y2), float(conf)))
441
+
442
+ return person_boxes
443
+ except Exception as e:
444
+ st.warning(f"YOLO detection failed: {str(e)}")
445
+ return []
446
+
447
+ def estimate_pose_mediapipe(self, frame, bbox=None):
448
+ """Estimate pose using MediaPipe on specified region"""
449
+ if self.pose is None:
450
+ return None
451
+
452
+ try:
453
+ if bbox is not None:
454
+ x1, y1, x2, y2 = bbox
455
+ pad = 20
456
+ x1 = max(0, x1 - pad)
457
+ y1 = max(0, y1 - pad)
458
+ x2 = min(frame.shape[1], x2 + pad)
459
+ y2 = min(frame.shape[0], y2 + pad)
460
+
461
+ cropped = frame[y1:y2, x1:x2]
462
+ if cropped.size == 0:
463
+ return None
464
+
465
+ rgb_frame = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
466
+ results = self.pose.process(rgb_frame)
467
+
468
+ if results.pose_landmarks:
469
+ for landmark in results.pose_landmarks.landmark:
470
+ landmark.x = (landmark.x * (x2 - x1) + x1) / frame.shape[1]
471
+ landmark.y = (landmark.y * (y2 - y1) + y1) / frame.shape[0]
472
+
473
+ return results
474
+ else:
475
+ rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
476
+ return self.pose.process(rgb_frame)
477
+
478
+ except Exception as e:
479
+ return None
480
+
481
+ def process_video(self, video_path, progress_callback=None):
482
+ """Process video with hybrid YOLO-MediaPipe pipeline"""
483
+
484
+ if self.yolo_model is None or self.pose is None:
485
+ st.error("❌ Detection models not available")
486
+ return None
487
+
488
+ cap = cv2.VideoCapture(str(video_path))
489
+ if not cap.isOpened():
490
+ st.error("❌ Could not open video file")
491
+ return None
492
+
493
+ fps = cap.get(cv2.CAP_PROP_FPS)
494
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
495
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
496
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
497
+
498
+ if fps <= 0 or total_frames <= 0:
499
+ st.error("❌ Invalid video properties")
500
+ cap.release()
501
+ return None
502
+
503
+ left_positions = []
504
+ right_positions = []
505
+ detection_confidence = []
506
+ frame_idx = 0
507
+
508
+ yolo_detections = 0
509
+ pose_detections = 0
510
+
511
+ st.info(f"πŸ”„ Processing with Hybrid Pipeline: {total_frames} frames")
512
+
513
+ try:
514
+ while cap.isOpened():
515
+ ret, frame = cap.read()
516
+ if not ret:
517
+ break
518
+
519
+ person_boxes = self.detect_person_yolo(frame)
520
+
521
+ if person_boxes:
522
+ yolo_detections += 1
523
+ best_box = self.person_tracker.select_best_person(person_boxes, frame_idx)
524
+ bbox = best_box[:4]
525
+ results = self.estimate_pose_mediapipe(frame, bbox)
526
+
527
+ if results and results.pose_landmarks:
528
+ pose_detections += 1
529
+ landmarks = results.pose_landmarks.landmark
530
+
531
+ left_y = landmarks[self.LEFT_HEEL].y
532
+ right_y = landmarks[self.RIGHT_HEEL].y
533
+ conf = (landmarks[self.LEFT_HEEL].visibility +
534
+ landmarks[self.RIGHT_HEEL].visibility) / 2
535
+
536
+ left_positions.append(left_y)
537
+ right_positions.append(right_y)
538
+ detection_confidence.append(conf)
539
+ else:
540
+ left_positions.append(np.nan)
541
+ right_positions.append(np.nan)
542
+ detection_confidence.append(0.0)
543
+ else:
544
+ results = self.estimate_pose_mediapipe(frame, bbox=None)
545
+
546
+ if results and results.pose_landmarks:
547
+ pose_detections += 1
548
+ landmarks = results.pose_landmarks.landmark
549
+
550
+ left_positions.append(landmarks[self.LEFT_HEEL].y)
551
+ right_positions.append(landmarks[self.RIGHT_HEEL].y)
552
+ detection_confidence.append(0.5)
553
+ else:
554
+ left_positions.append(np.nan)
555
+ right_positions.append(np.nan)
556
+ detection_confidence.append(0.0)
557
+
558
+ frame_idx += 1
559
+
560
+ if progress_callback and frame_idx % 10 == 0:
561
+ progress = min(frame_idx / total_frames, 1.0)
562
+ progress_callback(progress)
563
+
564
+ except Exception as e:
565
+ st.error(f"❌ Video processing error: {str(e)}")
566
+ cap.release()
567
+ return None
568
+
569
+ cap.release()
570
+
571
+ st.info(
572
+ f"πŸ“Š YOLO detections: {yolo_detections}/{total_frames} frames ({yolo_detections / total_frames * 100:.1f}%)")
573
+ st.info(
574
+ f"πŸ“Š Pose detections: {pose_detections}/{total_frames} frames ({pose_detections / total_frames * 100:.1f}%)")
575
+
576
+ if len(left_positions) == 0:
577
+ st.error("❌ No frames processed successfully")
578
+ return None
579
+
580
+ try:
581
+ left_series = pd.Series(left_positions).interpolate(method='linear')
582
+ left_series = left_series.bfill().ffill()
583
+ left_positions = left_series.values
584
+
585
+ right_series = pd.Series(right_positions).interpolate(method='linear')
586
+ right_series = right_series.bfill().ffill()
587
+ right_positions = right_series.values
588
+
589
+ if len(left_positions) > 5:
590
+ window = min(11, len(left_positions) if len(left_positions) % 2 == 1 else len(left_positions) - 1)
591
+ if window >= 3:
592
+ left_positions = savgol_filter(left_positions, window, 2)
593
+ right_positions = savgol_filter(right_positions, window, 2)
594
+
595
+ left_strikes = self._detect_strikes(left_positions, fps)
596
+ right_strikes = self._detect_strikes(right_positions, fps)
597
+
598
+ events = []
599
+
600
+ for frame in left_strikes:
601
+ events.append({
602
+ 'frame': int(frame),
603
+ 'timecode': self._frames_to_smpte(frame, fps),
604
+ 'foot': 'LEFT',
605
+ 'event': 'HEEL_STRIKE',
606
+ 'time_seconds': frame / fps,
607
+ 'confidence': detection_confidence[int(frame)] if int(frame) < len(detection_confidence) else 0.5
608
+ })
609
+
610
+ for frame in right_strikes:
611
+ events.append({
612
+ 'frame': int(frame),
613
+ 'timecode': self._frames_to_smpte(frame, fps),
614
+ 'foot': 'RIGHT',
615
+ 'event': 'HEEL_STRIKE',
616
+ 'time_seconds': frame / fps,
617
+ 'confidence': detection_confidence[int(frame)] if int(frame) < len(detection_confidence) else 0.5
618
+ })
619
+
620
+ events = sorted(events, key=lambda x: x['frame'])
621
+
622
+ return {
623
+ 'events': events,
624
+ 'fps': fps,
625
+ 'total_frames': total_frames,
626
+ 'width': width,
627
+ 'height': height,
628
+ 'left_positions': left_positions.tolist() if hasattr(left_positions, 'tolist') else left_positions,
629
+ 'right_positions': right_positions.tolist() if hasattr(right_positions, 'tolist') else right_positions,
630
+ 'detection_stats': {
631
+ 'yolo_detections': yolo_detections,
632
+ 'pose_detections': pose_detections,
633
+ 'total_frames': total_frames
634
+ }
635
+ }
636
+
637
+ except Exception as e:
638
+ st.error(f"❌ Data processing error: {str(e)}")
639
+ return None
640
+
641
+ def _detect_strikes(self, positions, fps):
642
+ """Detect heel strikes from position data"""
643
+ try:
644
+ peaks, _ = find_peaks(
645
+ positions,
646
+ prominence=self.thresholds['prominence'],
647
+ distance=int(fps * self.thresholds['min_interval']),
648
+ height=0.7
649
+ )
650
+ return peaks
651
+ except Exception as e:
652
+ st.warning(f"Peak detection failed: {str(e)}")
653
+ return np.array([])
654
+
655
+ def _frames_to_smpte(self, frame, fps):
656
+ """Convert frame number to SMPTE timecode"""
657
+ total_seconds = frame / fps
658
+ hours = int(total_seconds // 3600)
659
+ minutes = int((total_seconds % 3600) // 60)
660
+ seconds = int(total_seconds % 60)
661
+ frames = int((total_seconds * fps) % fps)
662
+ return f"{hours:02d}:{minutes:02d}:{seconds:02d}:{frames:02d}"
663
+
664
+
665
+ class PersonTracker:
666
+ """Track person across frames for consistency"""
667
+
668
+ def __init__(self, iou_threshold=0.3):
669
+ self.tracked_box = None
670
+ self.last_frame = -1
671
+ self.iou_threshold = iou_threshold
672
+
673
+ def calculate_iou(self, box1, box2):
674
+ """Calculate IoU between two bounding boxes"""
675
+ x1_1, y1_1, x2_1, y2_1 = box1[:4]
676
+ x1_2, y1_2, x2_2, y2_2 = box2[:4]
677
+
678
+ xi1 = max(x1_1, x1_2)
679
+ yi1 = max(y1_1, y1_2)
680
+ xi2 = min(x2_1, x2_2)
681
+ yi2 = min(y2_1, y2_2)
682
+
683
+ inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
684
+
685
+ box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
686
+ box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
687
+
688
+ union_area = box1_area + box2_area - inter_area
689
+
690
+ return inter_area / union_area if union_area > 0 else 0
691
+
692
+ def select_best_person(self, person_boxes, frame_idx):
693
+ """Select best person box for tracking consistency"""
694
+ if not person_boxes:
695
+ return None
696
+
697
+ if self.tracked_box is not None and frame_idx - self.last_frame < 10:
698
+ max_iou = 0
699
+ best_box = None
700
+
701
+ for box in person_boxes:
702
+ iou = self.calculate_iou(self.tracked_box, box)
703
+ if iou > max_iou:
704
+ max_iou = iou
705
+ best_box = box
706
+
707
+ if max_iou > self.iou_threshold:
708
+ self.tracked_box = best_box
709
+ self.last_frame = frame_idx
710
+ return best_box
711
+
712
+ best_box = max(person_boxes, key=lambda x: (x[2] - x[0]) * (x[3] - x[1]) * x[4])
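+ # (fallback when no tracked match: pick the detection with the largest area x confidence)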
713
+ self.tracked_box = best_box
714
+ self.last_frame = frame_idx
715
+ return best_box
716
+
717
+
718
+ class AudioGenerator:
719
+ """Generate footstep audio"""
720
+
721
+ def __init__(self, sample_rate=44100):
722
+ self.sample_rate = sample_rate
723
+
724
+ def generate_footstep(self, aud_path):
725
+ arr, rate = extract_second_audio_librosa(
726
+ file_path=aud_path,
727
+ target_second=5,
728
+ sample_rate=self.sample_rate
729
+ )
730
+ return arr
731
+
732
+ def create_audio_track(self, events, aud_path, duration=0.3):
733
+ total_samples = int(duration * self.sample_rate)
734
+ audio_track = np.zeros(total_samples, dtype=np.float32)
735
+
736
+ for i, event in enumerate(events):
737
+ step_sound = self.generate_footstep(aud_path)
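+ # Vary each step's playback rate by up to about +/-6% so repeated footsteps don't sound identical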
738
+ pitch_shift = 1.0 + (i % 5 - 2) * 0.03
739
+ indices = np.arange(len(step_sound)) * pitch_shift
740
+ indices = np.clip(indices, 0, len(step_sound) - 1).astype(int)
741
+ step_sound = step_sound[indices]
742
+
743
+ start_sample = int(event['time_seconds'] * self.sample_rate)
744
+ end_sample = min(start_sample + len(step_sound), total_samples)
745
+ sound_len = end_sample - start_sample
746
+
747
+ if sound_len > 0:
748
+ audio_track[start_sample:end_sample] += step_sound[:sound_len]
749
+
750
+ max_val = np.max(np.abs(audio_track))
751
+ if max_val > 0:
752
+ audio_track = audio_track / max_val * 0.8
753
+
754
+ return audio_track
755
+
756
+
757
+ def create_annotated_video(input_path, events, output_path, use_hybrid=True, progress_callback=None):
758
+ """Create annotated video with hybrid detection visualization"""
759
+
760
+ try:
761
+ cap = cv2.VideoCapture(str(input_path))
762
+ if not cap.isOpened():
763
+ st.error("❌ Could not open input video file")
764
+ return False
765
+
766
+ fps = cap.get(cv2.CAP_PROP_FPS)
767
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
768
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
769
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
770
+
771
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
772
+ out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))
773
+
774
+ if not out.isOpened():
775
+ st.error("❌ Could not create output video file")
776
+ cap.release()
777
+ return False
778
+
779
+ event_frames = {e['frame']: e for e in events}
780
+
781
+ if use_hybrid:
782
+ yolo_model = YOLO('yolov8n.pt')
783
+ mp_pose = mp.solutions.pose
784
+ pose = mp_pose.Pose(
785
+ static_image_mode=False,
786
+ model_complexity=1,
787
+ smooth_landmarks=True,
788
+ min_detection_confidence=0.5,
789
+ min_tracking_confidence=0.5
790
+ )
791
+ else:
792
+ yolo_model = None
793
+ mp_pose = mp.solutions.pose
794
+ pose = mp_pose.Pose(
795
+ static_image_mode=False,
796
+ model_complexity=1,
797
+ smooth_landmarks=True,
798
+ min_detection_confidence=0.5,
799
+ min_tracking_confidence=0.5
800
+ )
801
+
802
+ frame_idx = 0
803
+
804
+ while cap.isOpened():
805
+ ret, frame = cap.read()
806
+ if not ret:
807
+ break
808
+
809
+ try:
810
+ if use_hybrid and yolo_model:
811
+ results = yolo_model(frame, conf=0.5, classes=[0], verbose=False)
812
+ for result in results:
813
+ boxes = result.boxes
814
+ for box in boxes:
815
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
816
+ conf = box.conf[0].cpu().numpy()
817
+
818
+ cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)),
819
+ (255, 255, 0), 2)
820
+ cv2.putText(frame, f'YOLO: {conf:.2f}',
821
+ (int(x1), int(y1) - 10),
822
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2)
823
+
824
+ results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
825
+
826
+ if results.pose_landmarks:
827
+ mp.solutions.drawing_utils.draw_landmarks(
828
+ frame,
829
+ results.pose_landmarks,
830
+ mp_pose.POSE_CONNECTIONS,
831
+ landmark_drawing_spec=mp.solutions.drawing_utils.DrawingSpec(
832
+ color=(0, 255, 0), thickness=2, circle_radius=2
833
+ ),
834
+ connection_drawing_spec=mp.solutions.drawing_utils.DrawingSpec(
835
+ color=(255, 255, 255), thickness=2
836
+ )
837
+ )
838
+
839
+ if frame_idx in event_frames:
840
+ event = event_frames[frame_idx]
841
+
842
+ banner_height = 100
843
+ cv2.rectangle(frame, (0, 0), (width, banner_height), (0, 0, 0), -1)
844
+
845
+ text = f"{event['foot']} HEEL STRIKE"
846
+ color = (0, 255, 0) if event['foot'] == 'LEFT' else (0, 100, 255)
847
+
848
+ cv2.putText(frame, text, (50, 50),
849
+ cv2.FONT_HERSHEY_SIMPLEX, 1.5, color, 3)
850
+
851
+ if 'confidence' in event:
852
+ conf_text = f"Conf: {event['confidence']:.2f}"
853
+ cv2.putText(frame, conf_text, (50, 85),
854
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
855
+
856
+ circle_x = 50 if event['foot'] == 'LEFT' else width - 50
857
+ cv2.circle(frame, (circle_x, height - 100), 40, color, -1)
858
+
859
+ if use_hybrid:
860
+ cv2.rectangle(frame, (width - 250, 10), (width - 10, 50), (102, 126, 234), -1)
861
+ cv2.putText(frame, "HYBRID MODE", (width - 240, 35),
862
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
863
+
864
+ time_seconds = frame_idx / fps
865
+ hours = int(time_seconds // 3600)
866
+ minutes = int((time_seconds % 3600) // 60)
867
+ seconds = int(time_seconds % 60)
868
+ frame_num = int((time_seconds * fps) % fps)
869
+ timecode = f"TC: {hours:02d}:{minutes:02d}:{seconds:02d}:{frame_num:02d}"
870
+
871
+ cv2.rectangle(frame, (0, height - 80), (400, height), (0, 0, 0), -1)
872
+ cv2.putText(frame, timecode, (10, height - 30),
873
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
874
+ cv2.putText(frame, f"Frame: {frame_idx}/{total_frames}", (10, height - 55),
875
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
876
+
877
+ out.write(frame)
878
+ frame_idx += 1
879
+
880
+ if progress_callback and frame_idx % 5 == 0:
881
+ progress = min(frame_idx / total_frames, 1.0)
882
+ progress_callback(progress)
883
+
884
+ except Exception as e:
885
+ st.warning(f"⚠️ Error processing frame {frame_idx}: {str(e)}")
886
+ frame_idx += 1
887
+ continue
888
+
889
+ cap.release()
890
+ out.release()
891
+ pose.close()
892
+
893
+ return True
894
+
895
+ except Exception as e:
896
+ st.error(f"❌ Video annotation failed: {str(e)}")
897
+ try:
898
+ cap.release()
899
+ out.release()
900
+ pose.close()
901
+ except Exception:
902
+ pass
903
+ return False
904
+
905
+
906
+ def merge_audio_with_video(video_path, audio_track, sample_rate, output_path):
907
+ """Merge audio with video using FFmpeg"""
908
+
909
+ temp_audio = tempfile.mktemp(suffix='.wav')
910
+ sf.write(temp_audio, audio_track, sample_rate)
911
+
912
+ ffmpeg_cmd = FFMPEG_PATH if FFMPEG_PATH else "ffmpeg"
913
+
914
+ cmd = [
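+ # Take video from input 0 and the generated track from input 1, re-encode, and stop at the shorter stream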
915
+ ffmpeg_cmd, '-y',
916
+ '-i', str(video_path),
917
+ '-i', temp_audio,
918
+ '-map', '0:v', '-map', '1:a',
919
+ '-c:v', 'libx264', '-preset', 'medium',
920
+ '-c:a', 'aac', '-b:a', '192k',
921
+ '-shortest',
922
+ str(output_path)
923
+ ]
924
+
925
+ try:
926
+ if FFMPEG_PATH is None:
927
+ st.warning("FFmpeg not found. Using fallback method.")
928
+ return None
929
+
930
+ result = subprocess.run(cmd, check=True, capture_output=True, text=True, timeout=300)  # re-encoding can easily exceed 30s
931
+ return True
932
+
933
+ except subprocess.CalledProcessError as e:
934
+ st.error(f"FFmpeg error: {e.stderr}")
935
+ return False
936
+ except subprocess.TimeoutExpired:
937
+ st.error("FFmpeg timed out")
938
+ return False
939
+ finally:
940
+ if os.path.exists(temp_audio):
941
+ os.remove(temp_audio)
942
+
943
+
944
+ def live_streaming_mode():
945
+ """Live streaming mode with frame capture and real-time detection"""
946
+
947
+ st.markdown('<h2>πŸ“Ή Live Streaming Mode</h2>', unsafe_allow_html=True)
948
+ st.info("πŸŽ₯ This mode allows real-time footstep detection with your device camera")
949
+
950
+ # Initialize session state
951
+ if 'floor_frame_captured' not in st.session_state:
952
+ st.session_state.floor_frame_captured = False
953
+ if 'audio_downloaded' not in st.session_state:
954
+ st.session_state.audio_downloaded = False
955
+ if 'live_audio_path' not in st.session_state:
956
+ st.session_state.live_audio_path = None
957
+ if 'live_detector' not in st.session_state:
958
+ st.session_state.live_detector = None
959
+ if 'camera_active' not in st.session_state:
960
+ st.session_state.camera_active = False
961
+
962
+ # Step 1: Capture floor frame
963
+ st.markdown("### Step 1: Capture Floor Frame πŸ“Έ")
964
+ st.write("Capture a single frame showing the floor surface for audio analysis")
965
+
966
+ col1, col2 = st.columns([2, 1])
967
+
968
+ with col1:
969
+ # Camera input for frame capture
970
+ camera_image = st.camera_input("Capture floor image", key="floor_capture")
971
+
972
+ if camera_image is not None and not st.session_state.floor_frame_captured:
973
+ # Save captured frame
974
+ image = Image.open(camera_image)
975
+ temp_frame_path = tempfile.mktemp(suffix='.jpg')
976
+ image.save(temp_frame_path)
977
+ st.session_state.floor_frame_path = temp_frame_path
978
+
979
+ # Display captured frame
980
+ st.image(image, caption="Captured Floor Frame", use_container_width=True)
981
+
982
+ if st.button("βœ… Confirm Floor Capture", type="primary", use_container_width=True):
983
+ st.session_state.floor_frame_captured = True
984
+ st.success("βœ… Floor frame captured successfully!")
985
+ st.rerun()
986
+
987
+ with col2:
988
+ if st.session_state.floor_frame_captured:
989
+ st.markdown('<div class="success-box">βœ… Floor Captured</div>', unsafe_allow_html=True)
990
+ else:
991
+ st.info("πŸ“Έ Capture floor frame to proceed")
992
+
993
+ # Step 2: Analyze and download audio
994
+ if st.session_state.floor_frame_captured and not st.session_state.audio_downloaded:
995
+ st.markdown("---")
996
+ st.markdown("### Step 2: Analyze Floor & Download Audio πŸ”Š")
997
+
998
+ col1, col2 = st.columns([2, 1])
999
+
1000
+ with col1:
1001
+ if st.button("πŸ” Analyze Floor & Generate Audio", type="primary", use_container_width=True):
1002
+ with st.spinner("πŸ”„ Analyzing floor surface and generating audio..."):
1003
+ try:
1004
+ # Create temporary video from frame for processing
1005
+ temp_video = tempfile.mktemp(suffix='.mp4')
1006
+
1007
+ # Create 1-second video from the captured frame
1008
+ img = cv2.imread(st.session_state.floor_frame_path)
1009
+ height, width = img.shape[:2]
1010
+
1011
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
1012
+ out = cv2.VideoWriter(temp_video, fourcc, 30, (width, height))
1013
+
1014
+ # Write 30 frames (1 second at 30fps)
1015
+ for _ in range(30):
1016
+ out.write(img)
1017
+ out.release()
1018
+
1019
+ # Process video for footstep audio
1020
+ st.info("🎡 Generating footstep audio based on floor analysis...")
1021
+
1022
+ aud_path="audio/Footsteps on Gravel Path Outdoor.mp3"
1023
+
1024
+ st.session_state.live_audio_path = aud_path
1025
+ st.session_state.audio_downloaded = True
1026
+
1027
+ # Clean up temp video
1028
+ if os.path.exists(temp_video):
1029
+ os.remove(temp_video)
1030
+
1031
+ st.success("βœ… Audio generated successfully!")
1032
+ st.balloons()
1033
+ st.rerun()
1034
+
1035
+ except Exception as e:
1036
+ st.error(f"❌ Error generating audio: {str(e)}")
1037
+
1038
+ with col2:
1039
+ st.info("🎡 Audio will be generated based on floor type")
1040
+
1041
+ # Step 3: Initialize live detector
1042
+ if st.session_state.audio_downloaded and st.session_state.live_detector is None:
1043
+ st.markdown("---")
1044
+ st.markdown("### Step 3: Initialize Live Detection πŸš€")
1045
+
1046
+ col1, col2 = st.columns([2, 1])
1047
+
1048
+ with col1:
1049
+ sensitivity = st.select_slider(
1050
+ "Detection Sensitivity",
1051
+ options=['low', 'medium', 'high'],
1052
+ value='medium'
1053
+ )
1054
+
1055
+ yolo_conf = st.slider(
1056
+ "YOLO Confidence",
1057
+ min_value=0.1,
1058
+ max_value=0.9,
1059
+ value=0.5,
1060
+ step=0.05
1061
+ )
1062
+
1063
+ if st.button("🎬 Initialize Live Detector", type="primary", use_container_width=True):
1064
+ with st.spinner("βš™οΈ Initializing detector..."):
1065
+ try:
1066
+ detector = LiveFootstepDetector(
1067
+ audio_path=st.session_state.live_audio_path,
1068
+ sensitivity=sensitivity,
1069
+ yolo_conf=yolo_conf
1070
+ )
1071
+ st.session_state.live_detector = detector
1072
+ st.success("βœ… Live detector initialized!")
1073
+ st.rerun()
1074
+ except Exception as e:
1075
+ st.error(f"❌ Failed to initialize detector: {str(e)}")
1076
+
1077
+ with col2:
1078
+ st.info("πŸ€– Configure detection parameters")
1079
+
1080
+ # Step 4: Start live detection
1081
+ if st.session_state.live_detector is not None:
1082
+ st.markdown("---")
1083
+ st.markdown('<div class="ready-badge">βœ… SYSTEM READY</div>', unsafe_allow_html=True)
1084
+ st.markdown("### Step 4: Live Detection 🎯")
1085
+
1086
+ col1, col2 = st.columns([3, 1])
1087
+
1088
+ with col1:
1089
+ st.write("πŸ“Ή **Camera is ready for live footstep detection**")
1090
+ st.write("🚢 Walk in front of the camera and hear footsteps in real-time!")
1091
+
1092
+ # Start/Stop controls
1093
+ col_a, col_b = st.columns(2)
1094
+
1095
+ with col_a:
1096
+ if not st.session_state.camera_active:
1097
+ if st.button("▢️ Start Live Detection", type="primary", use_container_width=True):
1098
+ st.session_state.camera_active = True
1099
+ st.session_state.live_detector.start()
1100
+ st.rerun()
1101
+
1102
+ with col_b:
1103
+ if st.session_state.camera_active:
1104
+ if st.button("⏹️ Stop Detection", type="secondary", use_container_width=True):
1105
+ st.session_state.camera_active = False
1106
+ st.session_state.live_detector.stop()
1107
+ st.rerun()
1108
+
1109
+ with col2:
1110
+ if st.session_state.camera_active:
1111
+ st.markdown('<div class="live-indicator">πŸ”΄ LIVE</div>', unsafe_allow_html=True)
1112
+ else:
1113
+ st.info("⏸️ Paused")
1114
+
1115
+ # Live video feed
1116
+ if st.session_state.camera_active:
1117
+ st.markdown("---")
1118
+
1119
+ FRAME_WINDOW = st.image([])
1120
+
1121
+ cap = cv2.VideoCapture(0)
1122
+
1123
+ if not cap.isOpened():
1124
+ st.error("❌ Cannot access camera. Please check permissions.")
1125
+ st.session_state.camera_active = False
1126
+ else:
1127
+ st.info("πŸ“Ή Live feed active - Walk to generate footsteps!")
1128
+
1129
+ # Statistics
1130
+ step_counter = st.empty()
1131
+ left_steps = 0
1132
+ right_steps = 0
1133
+
1134
+ try:
1135
+ while st.session_state.camera_active:
1136
+ ret, frame = cap.read()
1137
+
1138
+ if not ret:
1139
+ st.error("❌ Failed to read from camera")
1140
+ break
1141
+
1142
+ # Process frame
1143
+ processed_frame, detected_foot = st.session_state.live_detector.process_frame(frame)
1144
+
1145
+ # Update counters
1146
+ if detected_foot == 'LEFT':
1147
+ left_steps += 1
1148
+ elif detected_foot == 'RIGHT':
1149
+ right_steps += 1
1150
+
1151
+ # Display frame
1152
+ FRAME_WINDOW.image(cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB))
1153
+
1154
+ # Update statistics
1155
+ step_counter.metric("Total Steps Detected", left_steps + right_steps,
1156
+ f"L: {left_steps} | R: {right_steps}")
1157
+
1158
+ # Check if user stopped
1159
+ if not st.session_state.camera_active:
1160
+ break
1161
+
1162
+ time.sleep(0.033) # ~30 FPS
1163
+
1164
+ except Exception as e:
1165
+ st.error(f"❌ Error during live detection: {str(e)}")
1166
+
1167
+ finally:
1168
+ cap.release()
1169
+ st.session_state.live_detector.stop()
1170
+
1171
+ # Reset button
1172
+ st.markdown("---")
1173
+ if st.button("πŸ”„ Reset All", use_container_width=True):
1174
+ st.session_state.floor_frame_captured = False
1175
+ st.session_state.audio_downloaded = False
1176
+ st.session_state.live_audio_path = None
1177
+ st.session_state.live_detector = None
1178
+ st.session_state.camera_active = False
1179
+ st.rerun()
1180
+
1181
+
1182
+ def video_upload_mode():
1183
+ """Original video upload mode"""
1184
+
1185
+ st.markdown('<h2>πŸ“€ Video Upload Mode</h2>', unsafe_allow_html=True)
1186
+
1187
+ # Sidebar configuration
1188
+ sensitivity = st.sidebar.select_slider(
1189
+ "Footstep Sensitivity",
1190
+ options=['low', 'medium', 'high'],
1191
+ value='medium',
1192
+ help="Higher sensitivity detects more subtle footsteps"
1193
+ )
1194
+
1195
+ yolo_conf = st.sidebar.slider(
1196
+ "YOLO Confidence",
1197
+ min_value=0.1,
1198
+ max_value=0.9,
1199
+ value=0.5,
1200
+ step=0.05,
1201
+ help="Confidence threshold for YOLO person detection"
1202
+ )
1203
+
1204
+ surface_type = st.sidebar.selectbox(
1205
+ "Surface Type",
1206
+ ['concrete', 'wood', 'grass', 'gravel', 'metal'],
1207
+ help="Select surface for audio generation"
1208
+ )
1209
+
1210
+ use_hybrid = st.sidebar.checkbox(
1211
+ "Enable Hybrid Mode",
1212
+ value=True,
1213
+ help="Use YOLO for person detection + MediaPipe for pose estimation"
1214
+ )
1215
+
1216
+ create_annotated = st.sidebar.checkbox("Create Annotated Video", value=True)
1217
+ add_audio = st.sidebar.checkbox("Add Footstep Audio", value=True)
1218
+
1219
+ # File uploader
1220
+ uploaded_file = st.file_uploader(
1221
+ "πŸ“€ Upload Video File",
1222
+ type=['mp4', 'avi', 'mov', 'mkv'],
1223
+ help="Upload a video file to detect footsteps"
1224
+ )
1225
+
1226
+ if uploaded_file:
1227
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp_file:
1228
+ tmp_file.write(uploaded_file.read())
1229
+ video_path = tmp_file.name
1230
+
1231
+ col1, col2 = st.columns([2, 1])
1232
+
1233
+ with col1:
1234
+ st.subheader("πŸ“Ή Input Video")
1235
+ st.video(video_path)
1236
+
1237
+ with col2:
1238
+ st.subheader("ℹ️ Video Info")
1239
+ cap = cv2.VideoCapture(video_path)
1240
+ video_info = {
1241
+ "Duration": f"{cap.get(cv2.CAP_PROP_FRAME_COUNT) / cap.get(cv2.CAP_PROP_FPS):.2f}s",
1242
+ "FPS": f"{cap.get(cv2.CAP_PROP_FPS):.2f}",
1243
+ "Resolution": f"{int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))}x{int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))}",
1244
+ "Frames": int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
1245
+ }
1246
+ cap.release()
1247
+
1248
+ for key, value in video_info.items():
1249
+ st.metric(key, value)
1250
+
1251
+ if use_hybrid:
1252
+ st.success("πŸ€– Hybrid Mode Active")
1253
+ else:
1254
+ st.info("πŸ“Š MediaPipe Only")
1255
+
1256
+ st.markdown("---")
1257
+
1258
+ if st.button("πŸš€ Process Video", type="primary", use_container_width=True):
1259
+
1260
+ if use_hybrid:
1261
+ st.info("πŸ”„ Running Hybrid YOLO-MediaPipe Pipeline...")
1262
+ pipeline = HybridFootstepDetectionPipeline(
1263
+ fps=float(video_info["FPS"]),
1264
+ sensitivity=sensitivity,
1265
+ yolo_conf=yolo_conf
1266
+ )
1267
+ else:
1268
+ st.info("πŸ”„ Running MediaPipe-Only Pipeline...")
1269
+ pipeline = HybridFootstepDetectionPipeline(
1270
+ fps=float(video_info["FPS"]),
1271
+ sensitivity=sensitivity,
1272
+ yolo_conf=yolo_conf
1273
+ )
1274
+
1275
+ with st.spinner("πŸ” Detecting footsteps..."):
1276
+ progress_bar = st.progress(0)
1277
+ status_text = st.empty()
1278
+
1279
+ def update_progress(val):
1280
+ progress_bar.progress(val)
1281
+ status_text.text(f"Processing: {int(val * 100)}%")
1282
+
1283
+ results = pipeline.process_video(video_path, update_progress)
1284
+ st.session_state['results'] = results
1285
+ st.session_state['video_path'] = video_path
1286
+ st.session_state['use_hybrid'] = use_hybrid
1287
+
1288
+ progress_bar.empty()
1289
+ status_text.empty()
1290
+
1291
+ if results:
1292
+ st.markdown('<div class="success-box">βœ… Footstep detection complete!</div>',
1293
+ unsafe_allow_html=True)
1294
+ st.success(f"Detected **{len(results['events'])}** footstep events")
1295
+
1296
+ if 'detection_stats' in results:
1297
+ stats = results['detection_stats']
1298
+ col1, col2, col3 = st.columns(3)
1299
+ col1.metric("YOLO Detections",
1300
+ f"{stats['yolo_detections']}/{stats['total_frames']}")
1301
+ col2.metric("Pose Detections",
1302
+ f"{stats['pose_detections']}/{stats['total_frames']}")
1303
+ col3.metric("Success Rate",
1304
+ f"{stats['pose_detections'] / stats['total_frames'] * 100:.1f}%")
1305
+
1306
+ # Display results (existing code continues...)
1307
+ if 'results' in st.session_state:
1308
+ results = st.session_state['results']
1309
+
1310
+ st.markdown("---")
1311
+ st.subheader("πŸ“Š Detection Results")
1312
+
1313
+ col1, col2, col3, col4 = st.columns(4)
1314
+
1315
+ left_count = len([e for e in results['events'] if e['foot'] == 'LEFT'])
1316
+ right_count = len([e for e in results['events'] if e['foot'] == 'RIGHT'])
1317
+ avg_cadence = len(results['events']) / (results['total_frames'] / results['fps']) * 60
1318
+ avg_conf = np.mean([e.get('confidence', 0.5) for e in results['events']])
1319
+
1320
+ col1.metric("Total Events", len(results['events']))
1321
+ col2.metric("Left Foot", left_count)
1322
+ col3.metric("Right Foot", right_count)
1323
+ col4.metric("Avg Confidence", f"{avg_conf:.2f}")
1324
+
1325
+ st.metric("Average Cadence", f"{avg_cadence:.1f} steps/min")
1326
+
1327
+ st.subheader("πŸ“‹ Detected Events")
1328
+ events_df = pd.DataFrame(results['events'])
1329
+
1330
+ if not events_df.empty:
1331
+ st.dataframe(
1332
+ events_df.style.apply(
1333
+ lambda x: ['background-color: #e8f5e9' if x.foot == 'LEFT'
1334
+ else 'background-color: #fff3e0' for _ in x],
1335
+ axis=1
1336
+ ),
1337
+ use_container_width=True,
1338
+ height=300
1339
+ )
1340
+
1341
+ st.subheader("πŸ’Ύ Export Options")
1342
+
1343
+ col1, col2, col3 = st.columns(3)
1344
+
1345
+ with col1:
1346
+ csv = events_df.to_csv(index=False)
1347
+ st.download_button(
1348
+ "πŸ“„ Download CSV",
1349
+ csv,
1350
+ f"footsteps_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
1351
+ "text/csv",
1352
+ use_container_width=True
1353
+ )
1354
+
1355
+ with col2:
1356
+ json_data = json.dumps(results['events'], indent=2)
1357
+ st.download_button(
1358
+ "πŸ“‹ Download JSON",
1359
+ json_data,
1360
+ f"footsteps_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
1361
+ "application/json",
1362
+ use_container_width=True
1363
+ )
1364
+
1365
+ with col3:
1366
+ timecode_text = "\n".join([
1367
+ f"{e['timecode']}\t{e['foot']}\t{e['event']}\t{e.get('confidence', 0.5):.2f}"
1368
+ for e in results['events']
1369
+ ])
1370
+ st.download_button(
1371
+ "⏱️ Download Timecode",
1372
+ timecode_text,
1373
+ f"timecode_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
1374
+ "text/plain",
1375
+ use_container_width=True
1376
+ )
1377
+
1378
+ st.markdown("---")
1379
+ st.subheader("πŸŽ₯ Generate Output Video")
1380
+
1381
+ col1, col2 = st.columns(2)
1382
+
1383
+ with col1:
1384
+ if create_annotated and st.button("Create Annotated Video", use_container_width=True):
1385
+ with st.spinner("Creating annotated video..."):
1386
+ annotated_path = tempfile.mktemp(suffix='_annotated.mp4')
1387
+ progress_bar = st.progress(0)
1388
+
1389
+ success = create_annotated_video(
1390
+ st.session_state['video_path'],
1391
+ results['events'],
1392
+ annotated_path,
1393
+ use_hybrid=st.session_state.get('use_hybrid', False),
1394
+ progress_callback=lambda v: progress_bar.progress(v)
1395
+ )
1396
+
1397
+ if success:
1398
+ st.session_state['annotated_video'] = annotated_path
1399
+ progress_bar.empty()
1400
+ st.success("βœ… Annotated video ready!")
1401
+ else:
1402
+ st.error("❌ Failed to create annotated video")
1403
+
1404
+ with col2:
1405
+ if add_audio and st.button("Generate with Audio", use_container_width=True):
1406
+ with st.spinner("Generating audio and merging..."):
1407
+ audio_gen = AudioGenerator()
1408
+ aud_path="audio/Footsteps on Gravel Path Outdoor.mp3"
1409
+ duration = results['total_frames'] / results['fps']
1410
+ audio_track = audio_gen.create_audio_track(
1411
+ results['events'],
1412
+ aud_path,
1413
+ duration
1414
+ )
1415
+
1416
+ temp_video = tempfile.mktemp(suffix='_temp.mp4')
1417
+ progress_bar = st.progress(0)
1418
+
1419
+ create_annotated_video(
1420
+ st.session_state['video_path'],
1421
+ results['events'],
1422
+ temp_video,
1423
+ use_hybrid=st.session_state.get('use_hybrid', False),
1424
+ progress_callback=lambda v: progress_bar.progress(v * 0.7)
1425
+ )
1426
+
1427
+ final_output = tempfile.mktemp(suffix='_final.mp4')
1428
+ success = merge_audio_with_video(
1429
+ temp_video,
1430
+ audio_track,
1431
+ 44100,
1432
+ final_output
1433
+ )
1434
+
1435
+ progress_bar.progress(1.0)
1436
+ progress_bar.empty()
1437
+
1438
+ if success:
1439
+ st.session_state['final_video'] = final_output
1440
+ st.success("βœ… Video with audio ready!")
1441
+ else:
1442
+ st.error("❌ Failed to merge audio")
1443
+
1444
+ if 'annotated_video' in st.session_state:
1445
+ st.markdown("---")
1446
+ st.subheader("πŸ“Ί Annotated Video")
1447
+ st.video(st.session_state['annotated_video'])
1448
+
1449
+ with open(st.session_state['annotated_video'], 'rb') as f:
1450
+ st.download_button(
1451
+ "πŸ“₯ Download Annotated Video",
1452
+ f,
1453
+ f"annotated_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4",
1454
+ "video/mp4",
1455
+ use_container_width=True
1456
+ )
1457
+
1458
+ if 'final_video' in st.session_state:
1459
+ st.markdown("---")
1460
+ st.subheader("πŸ”Š Final Video with Audio")
1461
+ st.video(st.session_state['final_video'])
1462
+
1463
+ with open(st.session_state['final_video'], 'rb') as f:
1464
+ st.download_button(
1465
+ "πŸ“₯ Download Final Video",
1466
+ f,
1467
+ f"final_with_audio_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4",
1468
+ "video/mp4",
1469
+ use_container_width=True
1470
+ )
1471
+
1472
+
1473
+ def main():
1474
+ st.markdown('<h1 class="main-header">🎬 Hybrid YOLO-MediaPipe Footstep Detection</h1>',
1475
+ unsafe_allow_html=True)
1476
+ st.markdown('<div class="hybrid-badge">πŸš€ YOLO Person Detection + MediaPipe Pose Estimation</div>',
1477
+ unsafe_allow_html=True)
1478
+ st.markdown("### Advanced AI-Powered Foley Tool with Dual-Stage Detection Pipeline")
1479
+
1480
+ # Mode selection
1481
+ st.markdown("---")
1482
+ st.markdown("## 🎯 Select Mode")
1483
+
1484
+ col1, col2 = st.columns(2)
1485
+
1486
+ with col1:
1487
+ if st.button("πŸ“€ Video Upload Mode", use_container_width=True, type="primary"):
1488
+ st.session_state.mode = 'upload'
1489
+
1490
+ with col2:
1491
+ if st.button("πŸ“Ή Live Streaming Mode", use_container_width=True, type="primary"):
1492
+ st.session_state.mode = 'live'
1493
+
1494
+ # Initialize mode
1495
+ if 'mode' not in st.session_state:
1496
+ st.session_state.mode = 'upload'
1497
+
1498
+ st.markdown("---")
1499
+
1500
+ # Display selected mode
1501
+ if st.session_state.mode == 'upload':
1502
+ video_upload_mode()
1503
+ else:
1504
+ live_streaming_mode()
1505
+
1506
+ # Sidebar info
1507
+ with st.sidebar:
1508
+ st.markdown("---")
1509
+ st.markdown(f"### 🎯 Current Mode: **{st.session_state.mode.upper()}**")
1510
+
1511
+ if st.session_state.mode == 'live':
1512
+ st.markdown("---")
1513
+ st.markdown("### πŸ“Ή Live Mode Guide")
1514
+ st.markdown("""
1515
+ **Steps:**
1516
+ 1. πŸ“Έ **Capture Floor Frame**
1517
+ - Point camera at floor
1518
+ - Capture clear image
1519
+
1520
+ 2. πŸ”Š **Generate Audio**
1521
+ - AI analyzes floor type
1522
+ - Downloads matching sound
1523
+
1524
+ 3. βœ… **System Ready**
1525
+ - Real-time detection active
1526
+ - Walk and hear footsteps!
1527
+
1528
+ **Tips:**
1529
+ - Good lighting needed
1530
+ - Clear floor view
1531
+ - Stand 2-3 meters away
1532
+ - Walk naturally
1533
+ """)
1534
+
1535
+ st.markdown("---")
1536
+ st.markdown("### πŸ€– Hybrid Pipeline")
1537
+ st.markdown("""
1538
+ **Stage 1: YOLO Detection**
1539
+ - Detects person in frame
1540
+ - Provides bounding box
1541
+ - Tracks across frames
1542
+
1543
+ **Stage 2: MediaPipe Pose**
1544
+ - Estimates pose on detected region
1545
+ - Extracts heel landmarks
1546
+ - Higher accuracy & speed
1547
+
1548
+ **Benefits:**
1549
+ - βœ… More robust detection
1550
+ - βœ… Better occlusion handling
1551
+ - βœ… Faster processing
1552
+ - βœ… Improved accuracy
1553
+ """)
1554
+
1555
+ st.markdown("---")
1556
+ st.markdown("### ℹ️ System Info")
1557
+ st.markdown("""
1558
+ **Detection Engines:**
1559
+ - YOLOv8 (Person Detection)
1560
+ - MediaPipe Pose v2 (Pose Estimation)
1561
+
1562
+ **Features:**
1563
+ - Dual-stage AI pipeline
1564
+ - Person tracking
1565
+ - Frame-accurate timing
1566
+ - Confidence scoring
1567
+ - Real-time live detection
1568
+ - Autonomous audio generation
1569
+ """)
1570
+
1571
+
1572
+ if __name__ == "__main__":
1573
+ main()
requirements.txt ADDED
@@ -0,0 +1,33 @@
1
+ # Core dependencies
2
+ streamlit==1.31.1
3
+ fastapi==0.109.0
4
+ uvicorn[standard]==0.27.0
5
+ python-multipart==0.0.9
6
+
7
+ # Computer Vision & AI
8
+ opencv-python-headless==4.9.0.80
9
+ mediapipe==0.10.9
10
+ ultralytics==8.1.0
11
+ Pillow==10.2.0
12
+
13
+ # Data Processing
14
+ numpy==1.24.3
15
+ pandas==2.2.0
16
+ scipy==1.12.0
17
+
18
+ # Audio Processing
19
+ soundfile==0.12.1
20
+ librosa==0.10.1
21
+
22
+ # LangChain & AI
23
+ langchain-core==0.1.23
24
+ pydantic==2.6.0
25
+
26
+ # API & Utilities
27
+ requests==2.31.0
28
+ python-dotenv==1.0.1
29
+ beautifulsoup4==4.12.3
30
+ yt-dlp==2024.3.10
31
+
32
+ # Logging (absl is used to silence MediaPipe/TensorFlow warnings)
33
+ absl-py==2.1.0
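+
+ # Note: reel.py's live mode also imports pyaudio for real-time playback;
+ # it is not pinned here and must be installed separately if live mode is used.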
sound_agent.py ADDED
@@ -0,0 +1,198 @@
1
+ import os
2
+ import yt_dlp
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ import re
6
+ import subprocess
7
+
8
+ # Set the path to your FFmpeg executable - prioritize system ffmpeg for Docker
9
+ def get_ffmpeg_path():
10
+ """Get FFmpeg path with fallback options"""
11
+ possible_paths = [
12
+ "ffmpeg", # System ffmpeg (Docker/Linux)
13
+ r"C:\Users\abhiv\OneDrive\Desktop\agentic ai\SoundFeet\ffmpeg-7.1-essentials_build\bin\ffmpeg.exe",
14
+ "./ffmpeg-7.1-essentials_build/bin/ffmpeg.exe",
15
+ ]
16
+ for path in possible_paths:
17
+ try:
18
+ if path == "ffmpeg" or not path.endswith('.exe'):
19
+ result = subprocess.run([path, '-version'], capture_output=True, timeout=5)
20
+ if result.returncode == 0:
21
+ return path
22
+ elif os.path.exists(path):
23
+ return path
24
+ except Exception:
25
+ continue
26
+ return "ffmpeg"
27
+
28
+ FFMPEG_PATH = get_ffmpeg_path()
29
+
30
+ def create_audio_folder():
31
+ """Create audio folder if it doesn't exist"""
32
+ if not os.path.exists("audio"):
33
+ os.makedirs("audio")
34
+ return "audio"
35
+
36
+
37
+ def check_ffmpeg():
38
+ """Check if FFmpeg is available at the specified path"""
39
+ if not os.path.exists(FFMPEG_PATH):
40
+ print(f"❌ FFmpeg not found at: {FFMPEG_PATH}")
41
+ print("Please check the path and make sure FFmpeg is installed.")
42
+ return False
43
+ print(f"βœ… FFmpeg found at: {FFMPEG_PATH}")
44
+ return True
45
+
46
+
47
+ def search_and_download_audio(audio_name):
48
+ """Search and download audio using yt-dlp's built-in search"""
49
+ audio_folder = create_audio_folder()
50
+ sanitized_name = sanitize_filename(audio_name)
51
+
52
+ # Configure yt-dlp with FFmpeg path
53
+ ydl_opts = {
54
+ 'format': 'bestaudio/best',
55
+ 'outtmpl': f'{audio_folder}/{sanitized_name}.%(ext)s',
56
+ 'postprocessors': [{
57
+ 'key': 'FFmpegExtractAudio',
58
+ 'preferredcodec': 'mp3',
59
+ 'preferredquality': '192',
60
+ }],
61
+ 'ffmpeg_location': os.path.dirname(FFMPEG_PATH) or None, # empty dirname (system "ffmpeg") -> let yt-dlp search PATH
62
+ 'default_search': 'ytsearch', # Use YouTube search
63
+ 'noplaylist': True, # Download only single video, not playlist
64
+ }
65
+
66
+ try:
67
+ print(f"πŸ” Searching for '{audio_name}' on YouTube...")
68
+
69
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
70
+ # Search and download the first result
71
+ search_query = f"{audio_name} audio"
72
+ ydl.download([search_query])
73
+
74
+ # Check if file was created
75
+ mp3_file = os.path.join(audio_folder, f"{sanitized_name}.mp3")
76
+ if os.path.exists(mp3_file):
77
+ file_size = os.path.getsize(mp3_file) / (1024 * 1024) # Size in MB
78
+ print(f"βœ… Audio '{sanitized_name}' downloaded successfully! ({file_size:.2f} MB)")
79
+ return ydl_opts['outtmpl']
80
+ else:
81
+ print("❌ Downloaded file not found.")
82
+ return False
83
+
84
+ except yt_dlp.utils.DownloadError as e:
85
+ print(f"❌ Download error: {e}")
86
+ return False
87
+ except Exception as e:
88
+ print(f"❌ Unexpected error: {e}")
89
+ return False
+
+
+ def search_youtube_improved(audio_name):
+     """Alternative search method with better headers"""
+     search_query = f"{audio_name} audio"
+     url = f"https://www.youtube.com/results?search_query={search_query.replace(' ', '+')}"
+
+     try:
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
+             'Accept-Encoding': 'gzip, deflate',
+             'Connection': 'keep-alive',
+         }
+
+         response = requests.get(url, headers=headers, timeout=10)
+         response.raise_for_status()
+
+         # Extract video IDs using regex from the page source
+         video_ids = re.findall(r'watch\?v=([a-zA-Z0-9_-]{11})', response.text)
+
+         # Remove duplicates and create full URLs
+         video_links = []
+         for video_id in video_ids:
+             video_url = f"https://www.youtube.com/watch?v={video_id}"
+             if video_url not in video_links:
+                 video_links.append(video_url)
+
+         return video_links[:5]  # Return top 5 results
+
+     except Exception as e:
+         print(f"❌ Error searching YouTube: {e}")
+         return []
+
+
+ def sanitize_filename(name):
+     """Remove invalid characters from filename"""
+     invalid_chars = '<>:"/\\|?*'
+     for char in invalid_chars:
+         name = name.replace(char, '')
+     return name.strip()
+
+
+ def main_sound(audio_name):
+     print("🎵 Audio Downloader")
+     print("=" * 40)
+
+     # Check FFmpeg availability first
+     if not check_ffmpeg():
+         return None
+     if not audio_name:
+         print("❌ Please enter a valid audio name.")
+         return None
+
+     # Try the direct download method first (more reliable)
+     print("\n🔄 Trying direct download method...")
+     file_path = search_and_download_audio(audio_name)
+     if file_path:
+         print(f"🎉 Success! Audio saved as '{sanitize_filename(audio_name)}.mp3'")
+         return file_path
+     else:
+         print("\n🔄 Direct method failed, trying alternative search...")
+
+     # Try alternative search method
+     video_urls = search_youtube_improved(audio_name)
+
+     if not video_urls:
+         print("❌ No audio found. Please try a different name.")
+         print("💡 Try more specific terms like: 'city street sounds', 'footsteps on pavement', 'urban ambient noise'")
+         return None
+
+     print(f"📥 Found {len(video_urls)} results. Downloading the first one...")
+
+     # Download using the traditional method
+     file_path = download_audio_direct(audio_name, video_urls[0])
+     if file_path:
+         print("🎉 Audio saved in 'audio' folder!")
+         return file_path
+     else:
+         print("❌ All download methods failed.")
+         return None
+
+
+ def download_audio_direct(audio_name, url):
+     """Direct download method for a specific URL"""
+     audio_folder = create_audio_folder()
+     sanitized_name = sanitize_filename(audio_name)
+
+     ydl_opts = {
+         'format': 'bestaudio/best',
+         'outtmpl': f'{audio_folder}/{sanitized_name}.%(ext)s',
+         'postprocessors': [{
+             'key': 'FFmpegExtractAudio',
+             'preferredcodec': 'mp3',
+             'preferredquality': '192',
+         }],
+     }
+     if os.path.dirname(FFMPEG_PATH):
+         # Only needed for the bundled Windows build; system ffmpeg is found on PATH
+         ydl_opts['ffmpeg_location'] = os.path.dirname(FFMPEG_PATH)
+
+     try:
+         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+             ydl.download([url])
+         # Return the actual converted file path rather than the output template
+         mp3_file = os.path.join(audio_folder, f"{sanitized_name}.mp3")
+         return mp3_file if os.path.exists(mp3_file) else False
+     except Exception as e:
+         print(f"❌ Error: {e}")
+         return False
+
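
For reference, a minimal usage sketch of `main_sound` from a hypothetical caller (the module itself defines no `__main__` entry point):

# Hypothetical caller, e.g. the agent/FastAPI side; not part of this commit.
from sound_agent import main_sound

audio_path = main_sound("footsteps on wet pavement")
if audio_path:
    print(f"Ready to mix: {audio_path}")
else:
    print("Download failed; try a more specific sound description.")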