abhi02072005 committed on
Commit
80aa632
·
1 Parent(s): adff71c

Add all backend files with Docker support and ffmpeg configuration

Files changed (11)
  1. .gitignore +48 -0
  2. Dockerfile +32 -0
  3. agent.py +143 -0
  4. custom_wrapper.py +55 -0
  5. link.py +669 -0
  6. link2.py +828 -0
  7. qsec.py +31 -0
  8. real.py +1572 -0
  9. reel.py +1573 -0
  10. requirements.txt +33 -0
  11. sound_agent.py +198 -0
.gitignore ADDED
@@ -0,0 +1,48 @@
+ # FFmpeg binaries (will be installed via Docker)
+ ffmpeg-7.1-essentials_build/
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ venv/
+ ENV/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # Environment variables
+ .env
+ .env.local
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Temporary files
+ *.tmp
+ *.log
+ temp/
+ tmp/
Dockerfile ADDED
@@ -0,0 +1,32 @@
+ FROM python:3.10-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies including ffmpeg
+ RUN apt-get update && apt-get install -y \
+     ffmpeg \
+     libsm6 \
+     libxext6 \
+     libxrender-dev \
+     libgomp1 \
+     libglib2.0-0 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy requirements first for better caching
+ COPY requirements.txt .
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application files
+ COPY . .
+
+ # Create necessary directories
+ RUN mkdir -p audio temp
+
+ # Expose port for FastAPI
+ EXPOSE 8000
+
+ # Run the FastAPI application
+ CMD ["uvicorn", "link2:app", "--host", "0.0.0.0", "--port", "8000"]
agent.py ADDED
@@ -0,0 +1,143 @@
+ from langchain_core.runnables import RunnablePassthrough
+ from langchain_core.output_parsers import PydanticOutputParser
+ from langchain_core.prompts import ChatPromptTemplate
+ from dotenv import load_dotenv
+ from custom_wrapper import OpenRouterChat
+ from pydantic import BaseModel, Field
+ from typing import List
+ import os
+ import json
+ import cv2
+ import base64
+ from PIL import Image
+ import io
+
+ load_dotenv()
+ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+
+
+ class AudioSuggestionOutput(BaseModel):
+     audio_suggestions: List[str] = Field(default_factory=list, description="Suggested audio names for footsteps")
+     environment_description: str = Field(description="Description of the environment and ground surface")
+     reasoning: str = Field(description="Explanation for the audio suggestions")
+
+
+ llm = OpenRouterChat(
+     api_key=OPENROUTER_API_KEY,
+     model="meta-llama/llama-3.2-90b-vision-instruct",
+     temperature=0.7,
+     max_tokens=1024
+ )
+
+ parser = PydanticOutputParser(pydantic_object=AudioSuggestionOutput)
+
+
+ def extract_first_frame(video_path):
+     """Extract the first frame from a video file"""
+     try:
+         cap = cv2.VideoCapture(video_path)
+         if not cap.isOpened():
+             raise ValueError(f"Cannot open video file: {video_path}")
+
+         success, frame = cap.read()
+         cap.release()
+
+         if not success:
+             raise ValueError("Cannot read the first frame from video")
+
+         return frame
+     except Exception as e:
+         print(f"Error extracting first frame: {e}")
+         return None
+
+
+ def image_to_base64(image):
+     """Convert OpenCV image to base64 string"""
+     try:
+         # Convert BGR to RGB
+         image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+         # Convert to PIL Image
+         pil_image = Image.fromarray(image_rgb)
+
+         # Convert to base64
+         buffered = io.BytesIO()
+         pil_image.save(buffered, format="JPEG", quality=85)
+         img_str = base64.b64encode(buffered.getvalue()).decode()
+
+         return img_str
+     except Exception as e:
+         print(f"Error converting image to base64: {e}")
+         return None
+
+
+ prompt = ChatPromptTemplate.from_template("""
+ You are an expert sound designer and environmental analyst.
+ Analyze the provided image and suggest appropriate audio names for footsteps based on the environment, ground surface, and surroundings.
+
+ Image Data: {image_data}
+
+ Please analyze:
+ 1. The type of ground/surface (concrete, grass, wood, carpet, gravel, etc.)
+ 2. The environment (indoor, outdoor, urban, natural, etc.)
+ 3. Weather conditions if visible (wet, dry, snowy, etc.)
+ 4. Any other relevant factors that would affect footstep sounds
+ 5. Audio suggestion's name must be friendly for a youtube search
+ 6. Name without extensions
+
+ Provide 3-5 specific, descriptive audio file name suggestions for footsteps in this environment.
+ The names should be clear, concise, and follow standard audio naming conventions.
+
+ {format_instructions}
+ """)
+
+ chain = (
+     {"image_data": RunnablePassthrough(), "format_instructions": lambda x: parser.get_format_instructions()}
+     | prompt
+     | llm
+     | parser
+ )
+
+
+ def analyze_image_and_suggest_audio(image_base64):
+     """Analyze the image and suggest audio names for footsteps"""
+     try:
+         result = chain.invoke(image_base64)
+         return result.dict()
+     except Exception as e:
+         print("Error during image analysis:", e)
+         return None
+
+
+ def process_video_for_footstep_audio(video_path):
+     # Extract first frame from video
+     print("Extracting first frame from video...")
+     first_frame = extract_first_frame(video_path)
+
+     if first_frame is None:
+         return {"error": "Failed to extract first frame from video"}
+
+     # Convert image to base64
+     print("Converting image to base64...")
+     image_base64 = image_to_base64(first_frame)
+
+     if image_base64 is None:
+         return {"error": "Failed to convert image to base64"}
+
+     # Analyze image and get audio suggestions
+     print("Analyzing image and generating audio suggestions...")
+     result = analyze_image_and_suggest_audio(image_base64)
+
+     # Save results
+     if result:
+         output_file = "found_img1/gemini2.json"
+         os.makedirs(os.path.dirname(output_file), exist_ok=True)
+
+         with open(output_file, "w") as f:
+             json.dump(result, f, indent=2)
+
+         print(f"Results saved to {output_file}")
+
+     return result['audio_suggestions'][0]
+
+
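Usage note: a minimal sketch of driving agent.py end to end, assuming a valid OPENROUTER_API_KEY in .env; the clip name is a placeholder, not a file from this repo.

# Hypothetical driver for agent.py; the video path is illustrative only.
from agent import process_video_for_footstep_audio

suggestion = process_video_for_footstep_audio("sample.mp4")  # placeholder clip
print("Suggested footstep audio:", suggestion)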
custom_wrapper.py ADDED
@@ -0,0 +1,55 @@
+ import requests
+ from typing import List, Optional
+ from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
+ from langchain_core.outputs import ChatResult, ChatGeneration
+ from langchain_core.language_models import BaseChatModel
+ from pydantic import BaseModel, Field
+
+
+ class OpenRouterChat(BaseChatModel):
+     api_key: str = Field(...)
+     model: str = "mistralai/mistral-7b-instruct:free"
+     temperature: float = 0.7
+
+     @property
+     def _llm_type(self) -> str:
+         return "openrouter-chat"
+
+     def _format_message(self, message: BaseMessage) -> dict:
+         role = "user"
+         if isinstance(message, HumanMessage):
+             role = "user"
+         elif isinstance(message, AIMessage):
+             role = "assistant"
+         else:
+             raise ValueError(f"Unsupported message type: {type(message)}")
+         return {"role": role, "content": message.content}
+
+     def _generate(self, messages: List[BaseMessage], stop: Optional[List[str]] = None) -> ChatResult:
+         headers = {
+             "Authorization": f"Bearer {self.api_key}",
+             "Content-Type": "application/json",
+             "HTTP-Referer": "https://yourdomain.com",
+             "X-Title": "LangChainOpenRouterWrapper"
+         }
+
+         payload = {
+             "model": self.model,
+             "messages": [self._format_message(m) for m in messages],
+             "temperature": self.temperature
+         }
+
+         response = requests.post(
+             "https://openrouter.ai/api/v1/chat/completions",
+             headers=headers,
+             json=payload,
+         )
+
+         if response.status_code != 200:
+             raise Exception(f"OpenRouter API error {response.status_code}: {response.text}")
+
+         content = response.json()["choices"][0]["message"]["content"]
+
+         return ChatResult(
+             generations=[ChatGeneration(message=AIMessage(content=content))]
+         )
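Usage note: a minimal sketch of calling the wrapper directly through the standard LangChain invoke interface; the API key and prompt below are placeholders.

# Hypothetical direct call to OpenRouterChat; key and prompt are placeholders.
from langchain_core.messages import HumanMessage
from custom_wrapper import OpenRouterChat

llm = OpenRouterChat(api_key="sk-or-...", temperature=0.3)  # placeholder key
reply = llm.invoke([HumanMessage(content="Name a footstep sound for a gravel path.")])
print(reply.content)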
link.py ADDED
@@ -0,0 +1,669 @@
+ from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
+ from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+ from typing import Optional, List, Dict, Any
+ import cv2
+ import numpy as np
+ import mediapipe as mp
+ from pathlib import Path
+ import json
+ import subprocess
+ import os
+ import soundfile as sf
+ from datetime import datetime
+ import tempfile
+ import pandas as pd
+ import shutil
+ import asyncio
+ from concurrent.futures import ThreadPoolExecutor
+ import base64
+ from io import BytesIO
+
+ # Suppress warnings
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+ import absl.logging
+
+ absl.logging.set_verbosity(absl.logging.ERROR)
+
+ # Mock streamlit before importing real.py
+ import sys
+
+
+ class MockStreamlit:
+     def __getattr__(self, name):
+         def mock_func(*args, **kwargs):
+             pass
+
+         return mock_func
+
+
+ sys.modules['streamlit'] = MockStreamlit()
+
+ # Import working classes and functions from real.py
+ from reel import (
+     HybridFootstepDetectionPipeline,
+     PersonTracker,
+     AudioGenerator,
+     create_annotated_video,
+     merge_audio_with_video
+ )
+
+ # Import your custom modules
+ from agent import process_video_for_footstep_audio
+ from sound_agent import main_sound
+ from qsec import extract_second_audio_librosa
+
+ app = FastAPI(title="Footstep Detection API", version="1.0.0")
+
+ # CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Thread pool for CPU-intensive tasks
+ executor = ThreadPoolExecutor(max_workers=4)
+
+
+ # ==================== Pydantic Models ====================
+
+ class ProcessingConfig(BaseModel):
+     sensitivity: str = "medium"
+     yolo_conf: float = 0.5
+     use_hybrid: bool = True
+     create_annotated: bool = True
+     add_audio: bool = True
+     surface_type: str = "concrete"
+
+
+ class FootstepEvent(BaseModel):
+     frame: int
+     timecode: str
+     foot: str
+     event: str
+     time_seconds: float
+     confidence: float
+
+
+ class ProcessingResult(BaseModel):
+     task_id: str
+     status: str
+     progress: float
+     events: Optional[List[FootstepEvent]] = None
+     total_frames: Optional[int] = None
+     fps: Optional[float] = None
+     detection_stats: Optional[Dict[str, Any]] = None
+     error: Optional[str] = None
+
+
+ class LiveDetectionConfig(BaseModel):
+     sensitivity: str = "medium"
+     yolo_conf: float = 0.5
+
+
+ # ==================== Storage ====================
+
+ # In-memory storage for tasks
+ tasks_storage = {}
+ video_storage = {}
+
+
+ def get_ffmpeg_path():
+     """Get FFmpeg path"""
+     possible_paths = [
+         "ffmpeg",  # Try system ffmpeg first (Docker/Linux)
+         r"C:\Users\abhiv\OneDrive\Desktop\agentic ai\SoundFeet\ffmpeg-7.1-essentials_build\bin\ffmpeg.exe",  # Local Windows
+         "./ffmpeg-7.1-essentials_build/bin/ffmpeg.exe",  # Relative path
+     ]
+
+     for path in possible_paths:
+         if path == "ffmpeg":
+             try:
+                 result = subprocess.run([path, '-version'], capture_output=True, timeout=5)
+                 if result.returncode == 0:
+                     return path
+             except:
+                 continue
+         else:
+             if os.path.exists(path):
+                 return path
+     return None
+
+
+ FFMPEG_PATH = get_ffmpeg_path()
+
+
+ # ==================== API Endpoints ====================
+
+ @app.get("/")
+ async def root():
+     return {"message": "Footstep Detection API", "version": "1.0.0"}
+
+
+ @app.post("/api/upload-video")
+ async def upload_video(
+     file: UploadFile = File(...),
+     config: Optional[str] = None
+ ):
+     """Upload video and create task"""
+     if not file.content_type.startswith('video/'):
+         raise HTTPException(status_code=400, detail="File must be a video")
+
+     # Generate task ID
+     task_id = f"task_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{os.urandom(4).hex()}"
+
+     # Save video to temp file
+     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
+     content = await file.read()
+     temp_file.write(content)
+     temp_file.close()
+
+     # Parse config
+     if config:
+         try:
+             config_dict = json.loads(config)
+         except:
+             config_dict = {}
+     else:
+         config_dict = {}
+
+     processing_config = ProcessingConfig(**config_dict)
+
+     # Create task
+     tasks_storage[task_id] = {
+         'task_id': task_id,
+         'status': 'uploaded',
+         'progress': 0.0,
+         'video_path': temp_file.name,
+         'config': processing_config.dict(),
+         'created_at': datetime.now().isoformat()
+     }
+
+     return {
+         "task_id": task_id,
+         "status": "uploaded",
+         "message": "Video uploaded successfully"
+     }
+
+
+ @app.post("/api/process/{task_id}")
+ async def process_video(task_id: str, background_tasks: BackgroundTasks):
+     """Start processing video"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     if task['status'] == 'processing':
+         return {"message": "Task is already being processed"}
+
+     task['status'] = 'processing'
+     task['progress'] = 0.0
+
+     background_tasks.add_task(process_video_task, task_id)
+
+     return {
+         "task_id": task_id,
+         "status": "processing",
+         "message": "Video processing started"
+     }
+
+
+ def process_video_task(task_id: str):
+     """Background task for video processing"""
+     try:
+         task = tasks_storage[task_id]
+         config = task['config']
+         video_path = task['video_path']
+
+         # Get video info first
+         cap = cv2.VideoCapture(video_path)
+         fps = cap.get(cv2.CAP_PROP_FPS)
+         cap.release()
+
+         # Initialize pipeline using real.py's class
+         pipeline = HybridFootstepDetectionPipeline(
+             fps=fps,
+             sensitivity=config['sensitivity'],
+             yolo_conf=config['yolo_conf']
+         )
+
+         # Process video using real.py's method
+         def progress_callback(progress):
+             task['progress'] = progress
+
+         results = pipeline.process_video(video_path, progress_callback)
+
+         # Update task
+         task['status'] = 'completed'
+         task['progress'] = 1.0
+         task['results'] = results
+         task['completed_at'] = datetime.now().isoformat()
+
+     except Exception as e:
+         task['status'] = 'failed'
+         task['error'] = str(e)
+         task['failed_at'] = datetime.now().isoformat()
+
+
+ @app.get("/api/status/{task_id}")
+ async def get_task_status(task_id: str):
+     """Get task status and progress"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     response = {
+         "task_id": task_id,
+         "status": task['status'],
+         "progress": task['progress']
+     }
+
+     if task['status'] == 'completed' and 'results' in task:
+         response['results'] = task['results']
+     elif task['status'] == 'failed':
+         response['error'] = task.get('error')
+
+     return response
+
+
+ @app.post("/api/generate-video/{task_id}")
+ async def generate_video(task_id: str, background_tasks: BackgroundTasks):
+     """Generate annotated video"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     if task['status'] != 'completed':
+         raise HTTPException(status_code=400, detail="Processing not completed")
+
+     if not task.get('results'):
+         raise HTTPException(status_code=400, detail="No results available")
+
+     background_tasks.add_task(generate_video_task, task_id)
+
+     return {
+         "task_id": task_id,
+         "message": "Video generation started"
+     }
+
+
+ def generate_video_task(task_id: str):
+     """Background task for video generation using real.py's create_annotated_video"""
+     try:
+         print(f"[DEBUG] Starting video generation for {task_id}")
+         task = tasks_storage[task_id]
+         results = task['results']
+         video_path = task['video_path']
+         config = task['config']
+
+         task['video_generating'] = True
+         task['video_ready'] = False
+
+         print(f"[DEBUG] Creating annotated video for {task_id}")
+
+         # Generate output path
+         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='_annotated.mp4')
+         annotated_path = temp_file.name
+         temp_file.close()
+
+         print(f"[DEBUG] Output video path: {annotated_path}")
+         print(f"[DEBUG] Input video path: {video_path}")
+
+         # Use real.py's create_annotated_video function
+         def progress_callback(progress):
+             task['video_progress'] = progress
+             if int(progress * 100) % 10 == 0:
+                 print(f"[DEBUG] Video generation progress: {progress * 100:.1f}%")
+
+         success = create_annotated_video(
+             input_path=video_path,
+             events=results['events'],
+             output_path=annotated_path,
+             use_hybrid=config.get('use_hybrid', True),
+             progress_callback=progress_callback
+         )
+
+         if not success:
+             raise Exception("Video annotation failed")
+
+         # Verify the file was created
+         if not os.path.exists(annotated_path):
+             raise Exception(f"Annotated video file was not created at {annotated_path}")
+
+         file_size = os.path.getsize(annotated_path)
+         print(f"[DEBUG] Annotated video file size: {file_size} bytes")
+
+         if file_size == 0:
+             raise Exception("Annotated video file is empty")
+
+         # Update task
+         task['annotated_video'] = annotated_path
+         task['video_ready'] = True
+         task['video_generating'] = False
+         task['video_progress'] = 1.0
+
+         print(f"[DEBUG] Video generation completed for {task_id}")
+         print(f"[DEBUG] Video file exists: {os.path.exists(annotated_path)}")
+
+     except Exception as e:
+         print(f"[ERROR] Video generation failed for {task_id}: {str(e)}")
+         import traceback
+         traceback.print_exc()
+         task['video_error'] = str(e)
+         task['video_ready'] = False
+         task['video_generating'] = False
+
+
+ @app.get("/api/video-status/{task_id}")
+ async def get_video_status(task_id: str):
+     """Check if video is ready for download"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     return {
+         "task_id": task_id,
+         "video_ready": task.get('video_ready', False),
+         "video_generating": task.get('video_generating', False),
+         "video_progress": task.get('video_progress', 0.0),
+         "video_error": task.get('video_error', None)
+     }
+
+
+ @app.get("/api/download-video/{task_id}")
+ async def download_video(task_id: str):
+     """Download annotated video"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     print(f"[DEBUG] Download request for {task_id}")
+     print(f"[DEBUG] Video ready: {task.get('video_ready')}")
+     print(f"[DEBUG] Annotated video path: {task.get('annotated_video')}")
+
+     if not task.get('video_ready'):
+         raise HTTPException(status_code=400, detail="Video not ready")
+
+     video_path = task.get('annotated_video')
+
+     if not video_path:
+         raise HTTPException(status_code=404, detail="Video path not set")
+
+     if not os.path.exists(video_path):
+         raise HTTPException(status_code=404, detail=f"Video file not found at {video_path}")
+
+     return FileResponse(
+         video_path,
+         media_type="video/mp4",
+         filename=f"annotated_{task_id}.mp4"
+     )
+
+
+ @app.get("/api/export-csv/{task_id}")
+ async def export_csv(task_id: str):
+     """Export results as CSV"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     if task['status'] != 'completed' or 'results' not in task:
+         raise HTTPException(status_code=400, detail="No results available")
+
+     events = task['results']['events']
+     df = pd.DataFrame(events)
+
+     csv_buffer = BytesIO()
+     df.to_csv(csv_buffer, index=False)
+     csv_buffer.seek(0)
+
+     return StreamingResponse(
+         csv_buffer,
+         media_type="text/csv",
+         headers={"Content-Disposition": f"attachment; filename=footsteps_{task_id}.csv"}
+     )
+
+
+ @app.get("/api/export-json/{task_id}")
+ async def export_json(task_id: str):
+     """Export results as JSON"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     if task['status'] != 'completed' or 'results' not in task:
+         raise HTTPException(status_code=400, detail="No results available")
+
+     return JSONResponse(content=task['results'])
+
+
+ @app.post("/api/generate-audio-video/{task_id}")
+ async def generate_audio_video(task_id: str, background_tasks: BackgroundTasks):
+     """Generate annotated video with footstep audio"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     if task['status'] != 'completed':
+         raise HTTPException(status_code=400, detail="Processing not completed")
+
+     if not task.get('results'):
+         raise HTTPException(status_code=400, detail="No results available")
+
+     background_tasks.add_task(generate_audio_video_task, task_id)
+
+     return {
+         "task_id": task_id,
+         "message": "Audio video generation started"
+     }
+
+
+ def generate_audio_video_task(task_id: str):
+     """Background task for generating video with audio using real.py's functions"""
+     try:
+         print(f"[DEBUG] Starting audio video generation for {task_id}")
+         task = tasks_storage[task_id]
+         results = task['results']
+         video_path = task['video_path']
+         config = task['config']
+
+         task['audio_video_generating'] = True
+         task['audio_video_ready'] = False
+
+         # Step 1: Generate audio track
+         print(f"[DEBUG] Generating audio track...")
+         audio_gen = AudioGenerator()
+
+         # Get audio file for surface type
+         '''surface_type = config.get('surface_type', 'concrete')
+         aud_name = process_video_for_footstep_audio(str(video_path))
+         aud_path = main_sound(aud_name)
+         aud_path = aud_path['default'].replace(".%(ext)s", ".mp3")'''
+
+         aud_path = "audio/Footsteps on Gravel Path Outdoor.mp3"
+
+         duration = results['total_frames'] / results['fps']
+         audio_track = audio_gen.create_audio_track(
+             results['events'],
+             aud_path,
+             duration
+         )
+
+         task['audio_video_progress'] = 0.3
+
+         # Step 2: Create annotated video
+         print(f"[DEBUG] Creating annotated video...")
+         temp_video = tempfile.NamedTemporaryFile(delete=False, suffix='_temp.mp4')
+         temp_video_path = temp_video.name
+         temp_video.close()
+
+         def video_progress(progress):
+             task['audio_video_progress'] = 0.3 + (progress * 0.4)  # 30-70%
+
+         success = create_annotated_video(
+             input_path=video_path,
+             events=results['events'],
+             output_path=temp_video_path,
+             use_hybrid=config.get('use_hybrid', True),
+             progress_callback=video_progress
+         )
+
+         if not success:
+             raise Exception("Video annotation failed")
+
+         task['audio_video_progress'] = 0.7
+
+         # Step 3: Merge audio with video
+         print(f"[DEBUG] Merging audio with video...")
+         final_output = tempfile.NamedTemporaryFile(delete=False, suffix='_final.mp4')
+         final_output_path = final_output.name
+         final_output.close()
+
+         merge_success = merge_audio_with_video(
+             temp_video_path,
+             audio_track,
+             44100,
+             final_output_path
+         )
+
+         if not merge_success:
+             raise Exception("Audio merge failed")
+
+         # Cleanup temp video
+         if os.path.exists(temp_video_path):
+             os.remove(temp_video_path)
+
+         # Verify final file
+         if not os.path.exists(final_output_path):
+             raise Exception(f"Final video file was not created at {final_output_path}")
+
+         file_size = os.path.getsize(final_output_path)
+         print(f"[DEBUG] Final video file size: {file_size} bytes")
+
+         if file_size == 0:
+             raise Exception("Final video file is empty")
+
+         # Update task
+         task['audio_video_path'] = final_output_path
+         task['audio_video_ready'] = True
+         task['audio_video_generating'] = False
+         task['audio_video_progress'] = 1.0
+
+         print(f"[DEBUG] Audio video generation completed for {task_id}")
+
+     except Exception as e:
+         print(f"[ERROR] Audio video generation failed for {task_id}: {str(e)}")
+         import traceback
+         traceback.print_exc()
+         task['audio_video_error'] = str(e)
+         task['audio_video_ready'] = False
+         task['audio_video_generating'] = False
+
+
+ @app.get("/api/audio-video-status/{task_id}")
+ async def get_audio_video_status(task_id: str):
+     """Check if audio video is ready for download"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     return {
+         "task_id": task_id,
+         "audio_video_ready": task.get('audio_video_ready', False),
+         "audio_video_generating": task.get('audio_video_generating', False),
+         "audio_video_progress": task.get('audio_video_progress', 0.0),
+         "audio_video_error": task.get('audio_video_error', None)
+     }
+
+
+ @app.get("/api/download-audio-video/{task_id}")
+ async def download_audio_video(task_id: str):
+     """Download video with audio"""
+     if task_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Task not found")
+
+     task = tasks_storage[task_id]
+
+     if not task.get('audio_video_ready'):
+         raise HTTPException(status_code=400, detail="Audio video not ready")
+
+     video_path = task.get('audio_video_path')
+
+     if not video_path:
+         raise HTTPException(status_code=404, detail="Video path not set")
+
+     if not os.path.exists(video_path):
+         raise HTTPException(status_code=404, detail=f"Video file not found at {video_path}")
+
+     return FileResponse(
+         video_path,
+         media_type="video/mp4",
+         filename=f"footsteps_with_audio_{task_id}.mp4"
+     )
+
+
+ @app.post("/api/live/capture-floor")
+ async def capture_floor_frame(file: UploadFile = File(...)):
+     """Capture floor frame for live mode"""
+     if not file.content_type.startswith('image/'):
+         raise HTTPException(status_code=400, detail="File must be an image")
+
+     session_id = f"live_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{os.urandom(4).hex()}"
+
+     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
+     content = await file.read()
+     temp_file.write(content)
+     temp_file.close()
+
+     tasks_storage[session_id] = {
+         'type': 'live',
+         'floor_frame': temp_file.name,
+         'created_at': datetime.now().isoformat()
+     }
+
+     return {
+         "session_id": session_id,
+         "message": "Floor frame captured"
+     }
+
+
+ @app.post("/api/live/detect-frame/{session_id}")
+ async def detect_frame(session_id: str, file: UploadFile = File(...)):
+     """Detect footsteps in a single frame"""
+     if session_id not in tasks_storage:
+         raise HTTPException(status_code=404, detail="Session not found")
+
+     if not file.content_type.startswith('image/'):
+         raise HTTPException(status_code=400, detail="File must be an image")
+
+     # Read frame
+     content = await file.read()
+     nparr = np.frombuffer(content, np.uint8)
+     frame = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+
+     # TODO: Implement real-time detection
+     # This would use the LiveFootstepDetector class from real.py
+
+     return {
+         "session_id": session_id,
+         "detected": False,
+         "message": "Frame processed"
+     }
+
+ '''
+ if __name__ == "__main__":
+     import uvicorn
+
+     uvicorn.run(app, host="0.0.0.0", port=8000)'''
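Usage note: a minimal client sketch against link.py's upload/process/status endpoints, assuming the API is served locally on port 8000; the base URL and clip name are placeholders.

# Hypothetical client for the link.py API; base URL and file name are placeholders.
import time
import requests

BASE = "http://localhost:8000"
with open("sample.mp4", "rb") as f:  # placeholder input clip
    upload = requests.post(f"{BASE}/api/upload-video",
                           files={"file": ("sample.mp4", f, "video/mp4")}).json()
task_id = upload["task_id"]
requests.post(f"{BASE}/api/process/{task_id}")
while True:
    status = requests.get(f"{BASE}/api/status/{task_id}").json()
    if status["status"] in ("completed", "failed"):
        break
    time.sleep(2)
print(status["status"])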
link2.py ADDED
@@ -0,0 +1,828 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
2
+ from fastapi.responses import FileResponse, JSONResponse, StreamingResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from pydantic import BaseModel
5
+ from typing import Optional, List, Dict, Any
6
+ import cv2
7
+ import numpy as np
8
+ import mediapipe as mp
9
+ from pathlib import Path
10
+ import json
11
+ import subprocess
12
+ import os
13
+ import soundfile as sf
14
+ from datetime import datetime
15
+ import tempfile
16
+ import pandas as pd
17
+ import shutil
18
+ import asyncio
19
+ from concurrent.futures import ThreadPoolExecutor
20
+ import base64
21
+ from io import BytesIO
22
+
23
+ # Suppress warnings
24
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
25
+ import absl.logging
26
+
27
+ absl.logging.set_verbosity(absl.logging.ERROR)
28
+
29
+ # Mock streamlit before importing real.py
30
+ import sys
31
+
32
+
33
+ class MockStreamlit:
34
+ def __getattr__(self, name):
35
+ def mock_func(*args, **kwargs):
36
+ pass
37
+
38
+ return mock_func
39
+
40
+
41
+ sys.modules['streamlit'] = MockStreamlit()
42
+
43
+ # Import working classes and functions from real.py
44
+ from real import (
45
+ HybridFootstepDetectionPipeline,
46
+ PersonTracker,
47
+ AudioGenerator,
48
+ LiveFootstepDetector,
49
+ create_annotated_video,
50
+ merge_audio_with_video
51
+ )
52
+
53
+ # Import your custom modules
54
+ from agent import process_video_for_footstep_audio
55
+ from sound_agent import main_sound
56
+ from qsec import extract_second_audio_librosa
57
+
58
+ app = FastAPI(title="Footstep Detection API", version="1.0.0")
59
+
60
+ # CORS middleware
61
+ app.add_middleware(
62
+ CORSMiddleware,
63
+ allow_origins=["*"],
64
+ allow_credentials=True,
65
+ allow_methods=["*"],
66
+ allow_headers=["*"],
67
+ )
68
+
69
+ # Thread pool for CPU-intensive tasks
70
+ executor = ThreadPoolExecutor(max_workers=4)
71
+
72
+
73
+ # ==================== Pydantic Models ====================
74
+
75
+ class ProcessingConfig(BaseModel):
76
+ sensitivity: str = "medium"
77
+ yolo_conf: float = 0.5
78
+ use_hybrid: bool = True
79
+ create_annotated: bool = True
80
+ add_audio: bool = True
81
+ surface_type: str = "concrete"
82
+
83
+
84
+ class FootstepEvent(BaseModel):
85
+ frame: int
86
+ timecode: str
87
+ foot: str
88
+ event: str
89
+ time_seconds: float
90
+ confidence: float
91
+
92
+
93
+ class ProcessingResult(BaseModel):
94
+ task_id: str
95
+ status: str
96
+ progress: float
97
+ events: Optional[List[FootstepEvent]] = None
98
+ total_frames: Optional[int] = None
99
+ fps: Optional[float] = None
100
+ detection_stats: Optional[Dict[str, Any]] = None
101
+ error: Optional[str] = None
102
+
103
+
104
+ class LiveDetectionConfig(BaseModel):
105
+ sensitivity: str = "medium"
106
+ yolo_conf: float = 0.5
107
+
108
+
109
+ # ==================== Storage ====================
110
+
111
+ # In-memory storage for tasks
112
+ tasks_storage = {}
113
+ video_storage = {}
114
+
115
+
116
+ def get_ffmpeg_path():
117
+ """Get FFmpeg path"""
118
+ possible_paths = [
119
+ "ffmpeg", # Try system ffmpeg first (Docker/Linux)
120
+ r"C:\Users\abhiv\OneDrive\Desktop\agentic ai\SoundFeet\ffmpeg-7.1-essentials_build\bin\ffmpeg.exe", # Local Windows
121
+ "./ffmpeg-7.1-essentials_build/bin/ffmpeg.exe", # Relative path
122
+ ]
123
+
124
+ for path in possible_paths:
125
+ if path == "ffmpeg":
126
+ try:
127
+ result = subprocess.run([path, '-version'], capture_output=True, timeout=5)
128
+ if result.returncode == 0:
129
+ return path
130
+ except:
131
+ continue
132
+ else:
133
+ if os.path.exists(path):
134
+ return path
135
+ return None
136
+
137
+
138
+ FFMPEG_PATH = get_ffmpeg_path()
139
+
140
+
141
+ # ==================== API Endpoints ====================
142
+
143
+ @app.get("/")
144
+ async def root():
145
+ return {"message": "Footstep Detection API", "version": "1.0.0"}
146
+
147
+
148
+ @app.post("/api/upload-video")
149
+ async def upload_video(
150
+ file: UploadFile = File(...),
151
+ config: Optional[str] = None
152
+ ):
153
+ """Upload video and create task"""
154
+ if not file.content_type.startswith('video/'):
155
+ raise HTTPException(status_code=400, detail="File must be a video")
156
+
157
+ # Generate task ID
158
+ task_id = f"task_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{os.urandom(4).hex()}"
159
+
160
+ # Save video to temp file
161
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
162
+ content = await file.read()
163
+ temp_file.write(content)
164
+ temp_file.close()
165
+
166
+ # Parse config
167
+ if config:
168
+ try:
169
+ config_dict = json.loads(config)
170
+ except:
171
+ config_dict = {}
172
+ else:
173
+ config_dict = {}
174
+
175
+ processing_config = ProcessingConfig(**config_dict)
176
+
177
+ # Create task
178
+ tasks_storage[task_id] = {
179
+ 'task_id': task_id,
180
+ 'status': 'uploaded',
181
+ 'progress': 0.0,
182
+ 'video_path': temp_file.name,
183
+ 'config': processing_config.dict(),
184
+ 'created_at': datetime.now().isoformat()
185
+ }
186
+
187
+ return {
188
+ "task_id": task_id,
189
+ "status": "uploaded",
190
+ "message": "Video uploaded successfully"
191
+ }
192
+
193
+
194
+ @app.post("/api/process/{task_id}")
195
+ async def process_video(task_id: str, background_tasks: BackgroundTasks):
196
+ """Start processing video"""
197
+ if task_id not in tasks_storage:
198
+ raise HTTPException(status_code=404, detail="Task not found")
199
+
200
+ task = tasks_storage[task_id]
201
+
202
+ if task['status'] == 'processing':
203
+ return {"message": "Task is already being processed"}
204
+
205
+ task['status'] = 'processing'
206
+ task['progress'] = 0.0
207
+
208
+ background_tasks.add_task(process_video_task, task_id)
209
+
210
+ return {
211
+ "task_id": task_id,
212
+ "status": "processing",
213
+ "message": "Video processing started"
214
+ }
215
+
216
+
217
+ def process_video_task(task_id: str):
218
+ """Background task for video processing"""
219
+ try:
220
+ task = tasks_storage[task_id]
221
+ config = task['config']
222
+ video_path = task['video_path']
223
+
224
+ # Get video info first
225
+ cap = cv2.VideoCapture(video_path)
226
+ fps = cap.get(cv2.CAP_PROP_FPS)
227
+ cap.release()
228
+
229
+ # Initialize pipeline using real.py's class
230
+ pipeline = HybridFootstepDetectionPipeline(
231
+ fps=fps,
232
+ sensitivity=config['sensitivity'],
233
+ yolo_conf=config['yolo_conf']
234
+ )
235
+
236
+ # Process video using real.py's method
237
+ def progress_callback(progress):
238
+ task['progress'] = progress
239
+
240
+ results = pipeline.process_video(video_path, progress_callback)
241
+
242
+ # Update task
243
+ task['status'] = 'completed'
244
+ task['progress'] = 1.0
245
+ task['results'] = results
246
+ task['completed_at'] = datetime.now().isoformat()
247
+
248
+ except Exception as e:
249
+ task['status'] = 'failed'
250
+ task['error'] = str(e)
251
+ task['failed_at'] = datetime.now().isoformat()
252
+
253
+
254
+ @app.get("/api/status/{task_id}")
255
+ async def get_task_status(task_id: str):
256
+ """Get task status and progress"""
257
+ if task_id not in tasks_storage:
258
+ raise HTTPException(status_code=404, detail="Task not found")
259
+
260
+ task = tasks_storage[task_id]
261
+
262
+ response = {
263
+ "task_id": task_id,
264
+ "status": task['status'],
265
+ "progress": task['progress']
266
+ }
267
+
268
+ if task['status'] == 'completed' and 'results' in task:
269
+ response['results'] = task['results']
270
+ elif task['status'] == 'failed':
271
+ response['error'] = task.get('error')
272
+
273
+ return response
274
+
275
+
276
+ @app.post("/api/generate-video/{task_id}")
277
+ async def generate_video(task_id: str, background_tasks: BackgroundTasks):
278
+ """Generate annotated video"""
279
+ if task_id not in tasks_storage:
280
+ raise HTTPException(status_code=404, detail="Task not found")
281
+
282
+ task = tasks_storage[task_id]
283
+
284
+ if task['status'] != 'completed':
285
+ raise HTTPException(status_code=400, detail="Processing not completed")
286
+
287
+ if not task.get('results'):
288
+ raise HTTPException(status_code=400, detail="No results available")
289
+
290
+ background_tasks.add_task(generate_video_task, task_id)
291
+
292
+ return {
293
+ "task_id": task_id,
294
+ "message": "Video generation started"
295
+ }
296
+
297
+
298
+ def generate_video_task(task_id: str):
299
+ """Background task for video generation using real.py's create_annotated_video"""
300
+ try:
301
+ print(f"[DEBUG] Starting video generation for {task_id}")
302
+ task = tasks_storage[task_id]
303
+ results = task['results']
304
+ video_path = task['video_path']
305
+ config = task['config']
306
+
307
+ task['video_generating'] = True
308
+ task['video_ready'] = False
309
+
310
+ print(f"[DEBUG] Creating annotated video for {task_id}")
311
+
312
+ # Generate output path
313
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='_annotated.mp4')
314
+ annotated_path = temp_file.name
315
+ temp_file.close()
316
+
317
+ print(f"[DEBUG] Output video path: {annotated_path}")
318
+ print(f"[DEBUG] Input video path: {video_path}")
319
+
320
+ # Use real.py's create_annotated_video function
321
+ def progress_callback(progress):
322
+ task['video_progress'] = progress
323
+ if int(progress * 100) % 10 == 0:
324
+ print(f"[DEBUG] Video generation progress: {progress * 100:.1f}%")
325
+
326
+ success = create_annotated_video(
327
+ input_path=video_path,
328
+ events=results['events'],
329
+ output_path=annotated_path,
330
+ use_hybrid=config.get('use_hybrid', True),
331
+ progress_callback=progress_callback
332
+ )
333
+
334
+ if not success:
335
+ raise Exception("Video annotation failed")
336
+
337
+ # Verify the file was created
338
+ if not os.path.exists(annotated_path):
339
+ raise Exception(f"Annotated video file was not created at {annotated_path}")
340
+
341
+ file_size = os.path.getsize(annotated_path)
342
+ print(f"[DEBUG] Annotated video file size: {file_size} bytes")
343
+
344
+ if file_size == 0:
345
+ raise Exception("Annotated video file is empty")
346
+
347
+ # Update task
348
+ task['annotated_video'] = annotated_path
349
+ task['video_ready'] = True
350
+ task['video_generating'] = False
351
+ task['video_progress'] = 1.0
352
+
353
+ print(f"[DEBUG] Video generation completed for {task_id}")
354
+ print(f"[DEBUG] Video file exists: {os.path.exists(annotated_path)}")
355
+
356
+ except Exception as e:
357
+ print(f"[ERROR] Video generation failed for {task_id}: {str(e)}")
358
+ import traceback
359
+ traceback.print_exc()
360
+ task['video_error'] = str(e)
361
+ task['video_ready'] = False
362
+ task['video_generating'] = False
363
+
364
+
365
+ @app.get("/api/video-status/{task_id}")
366
+ async def get_video_status(task_id: str):
367
+ """Check if video is ready for download"""
368
+ if task_id not in tasks_storage:
369
+ raise HTTPException(status_code=404, detail="Task not found")
370
+
371
+ task = tasks_storage[task_id]
372
+
373
+ return {
374
+ "task_id": task_id,
375
+ "video_ready": task.get('video_ready', False),
376
+ "video_generating": task.get('video_generating', False),
377
+ "video_progress": task.get('video_progress', 0.0),
378
+ "video_error": task.get('video_error', None)
379
+ }
380
+
381
+
382
+ @app.get("/api/download-video/{task_id}")
383
+ async def download_video(task_id: str):
384
+ """Download annotated video"""
385
+ if task_id not in tasks_storage:
386
+ raise HTTPException(status_code=404, detail="Task not found")
387
+
388
+ task = tasks_storage[task_id]
389
+
390
+ print(f"[DEBUG] Download request for {task_id}")
391
+ print(f"[DEBUG] Video ready: {task.get('video_ready')}")
392
+ print(f"[DEBUG] Annotated video path: {task.get('annotated_video')}")
393
+
394
+ if not task.get('video_ready'):
395
+ raise HTTPException(status_code=400, detail="Video not ready")
396
+
397
+ video_path = task.get('annotated_video')
398
+
399
+ if not video_path:
400
+ raise HTTPException(status_code=404, detail="Video path not set")
401
+
402
+ if not os.path.exists(video_path):
403
+ raise HTTPException(status_code=404, detail=f"Video file not found at {video_path}")
404
+
405
+ return FileResponse(
406
+ video_path,
407
+ media_type="video/mp4",
408
+ filename=f"annotated_{task_id}.mp4"
409
+ )
410
+
411
+
412
+ @app.get("/api/export-csv/{task_id}")
413
+ async def export_csv(task_id: str):
414
+ """Export results as CSV"""
415
+ if task_id not in tasks_storage:
416
+ raise HTTPException(status_code=404, detail="Task not found")
417
+
418
+ task = tasks_storage[task_id]
419
+
420
+ if task['status'] != 'completed' or 'results' not in task:
421
+ raise HTTPException(status_code=400, detail="No results available")
422
+
423
+ events = task['results']['events']
424
+ df = pd.DataFrame(events)
425
+
426
+ csv_buffer = BytesIO()
427
+ df.to_csv(csv_buffer, index=False)
428
+ csv_buffer.seek(0)
429
+
430
+ return StreamingResponse(
431
+ csv_buffer,
432
+ media_type="text/csv",
433
+ headers={"Content-Disposition": f"attachment; filename=footsteps_{task_id}.csv"}
434
+ )
435
+
436
+
437
+ @app.get("/api/export-json/{task_id}")
438
+ async def export_json(task_id: str):
439
+ """Export results as JSON"""
440
+ if task_id not in tasks_storage:
441
+ raise HTTPException(status_code=404, detail="Task not found")
442
+
443
+ task = tasks_storage[task_id]
444
+
445
+ if task['status'] != 'completed' or 'results' not in task:
446
+ raise HTTPException(status_code=400, detail="No results available")
447
+
448
+ return JSONResponse(content=task['results'])
449
+
450
+
451
+ @app.post("/api/generate-audio-video/{task_id}")
452
+ async def generate_audio_video(task_id: str, background_tasks: BackgroundTasks):
453
+ """Generate annotated video with footstep audio"""
454
+ if task_id not in tasks_storage:
455
+ raise HTTPException(status_code=404, detail="Task not found")
456
+
457
+ task = tasks_storage[task_id]
458
+
459
+ if task['status'] != 'completed':
460
+ raise HTTPException(status_code=400, detail="Processing not completed")
461
+
462
+ if not task.get('results'):
463
+ raise HTTPException(status_code=400, detail="No results available")
464
+
465
+ background_tasks.add_task(generate_audio_video_task, task_id)
466
+
467
+ return {
468
+ "task_id": task_id,
469
+ "message": "Audio video generation started"
470
+ }
471
+
472
+
473
+ def generate_audio_video_task(task_id: str):
474
+ """Background task for generating video with audio using real.py's functions"""
475
+ try:
476
+ print(f"[DEBUG] Starting audio video generation for {task_id}")
477
+ task = tasks_storage[task_id]
478
+ results = task['results']
479
+ video_path = task['video_path']
480
+ config = task['config']
481
+
482
+ task['audio_video_generating'] = True
483
+ task['audio_video_ready'] = False
484
+
485
+ # Step 1: Generate audio track
486
+ print(f"[DEBUG] Generating audio track...")
487
+ audio_gen = AudioGenerator()
488
+
489
+ # Get audio file for surface type
490
+ surface_type = config.get('surface_type', 'concrete')
491
+ '''aud_name = process_video_for_footstep_audio(str(video_path))
492
+ aud_path = main_sound(aud_name)
493
+ aud_path = aud_path['default'].replace(".%(ext)s", ".mp3")'''
494
+
495
+ aud_path = "audio/Footsteps on Gravel Path Outdoor.mp3"
496
+
497
+ duration = results['total_frames'] / results['fps']
498
+ audio_track = audio_gen.create_audio_track(
499
+ results['events'],
500
+ aud_path,
501
+ duration
502
+ )
503
+
504
+ task['audio_video_progress'] = 0.3
505
+
506
+ # Step 2: Create annotated video
507
+ print(f"[DEBUG] Creating annotated video...")
508
+ temp_video = tempfile.NamedTemporaryFile(delete=False, suffix='_temp.mp4')
509
+ temp_video_path = temp_video.name
510
+ temp_video.close()
511
+
512
+ def video_progress(progress):
513
+ task['audio_video_progress'] = 0.3 + (progress * 0.4) # 30-70%
514
+
515
+ success = create_annotated_video(
516
+ input_path=video_path,
517
+ events=results['events'],
518
+ output_path=temp_video_path,
519
+ use_hybrid=config.get('use_hybrid', True),
520
+ progress_callback=video_progress
521
+ )
522
+
523
+ if not success:
524
+ raise Exception("Video annotation failed")
525
+
526
+ task['audio_video_progress'] = 0.7
527
+
528
+ # Step 3: Merge audio with video
529
+ print(f"[DEBUG] Merging audio with video...")
530
+ final_output = tempfile.NamedTemporaryFile(delete=False, suffix='_final.mp4')
531
+ final_output_path = final_output.name
532
+ final_output.close()
533
+
534
+ merge_success = merge_audio_with_video(
535
+ temp_video_path,
536
+ audio_track,
537
+ 44100,
538
+ final_output_path
539
+ )
540
+
541
+ if not merge_success:
542
+ raise Exception("Audio merge failed")
543
+
544
+ # Cleanup temp video
545
+ if os.path.exists(temp_video_path):
546
+ os.remove(temp_video_path)
547
+
548
+ # Verify final file
549
+ if not os.path.exists(final_output_path):
550
+ raise Exception(f"Final video file was not created at {final_output_path}")
551
+
552
+ file_size = os.path.getsize(final_output_path)
553
+ print(f"[DEBUG] Final video file size: {file_size} bytes")
554
+
555
+ if file_size == 0:
556
+ raise Exception("Final video file is empty")
557
+
558
+ # Update task
559
+ task['audio_video_path'] = final_output_path
560
+ task['audio_video_ready'] = True
561
+ task['audio_video_generating'] = False
562
+ task['audio_video_progress'] = 1.0
563
+
564
+ print(f"[DEBUG] Audio video generation completed for {task_id}")
565
+
566
+ except Exception as e:
567
+ print(f"[ERROR] Audio video generation failed for {task_id}: {str(e)}")
568
+ import traceback
569
+ traceback.print_exc()
570
+ task['audio_video_error'] = str(e)
571
+ task['audio_video_ready'] = False
572
+ task['audio_video_generating'] = False
573
+
574
+
575
+ @app.get("/api/audio-video-status/{task_id}")
576
+ async def get_audio_video_status(task_id: str):
577
+ """Check if audio video is ready for download"""
578
+ if task_id not in tasks_storage:
579
+ raise HTTPException(status_code=404, detail="Task not found")
580
+
581
+ task = tasks_storage[task_id]
582
+
583
+ return {
584
+ "task_id": task_id,
585
+ "audio_video_ready": task.get('audio_video_ready', False),
586
+ "audio_video_generating": task.get('audio_video_generating', False),
587
+ "audio_video_progress": task.get('audio_video_progress', 0.0),
588
+ "audio_video_error": task.get('audio_video_error', None)
589
+ }
590
+
591
+
592
+ @app.get("/api/download-audio-video/{task_id}")
593
+ async def download_audio_video(task_id: str):
594
+ """Download video with audio"""
595
+ if task_id not in tasks_storage:
596
+ raise HTTPException(status_code=404, detail="Task not found")
597
+
598
+ task = tasks_storage[task_id]
599
+
600
+ if not task.get('audio_video_ready'):
601
+ raise HTTPException(status_code=400, detail="Audio video not ready")
602
+
603
+ video_path = task.get('audio_video_path')
604
+
605
+ if not video_path:
606
+ raise HTTPException(status_code=404, detail="Video path not set")
607
+
608
+ if not os.path.exists(video_path):
609
+ raise HTTPException(status_code=404, detail=f"Video file not found at {video_path}")
610
+
611
+ return FileResponse(
612
+ video_path,
613
+ media_type="video/mp4",
614
+ filename=f"footsteps_with_audio_{task_id}.mp4"
615
+ )
616
+
617
+
618
+ @app.post("/api/live/capture-floor")
619
+ async def capture_floor_frame(file: UploadFile = File(...)):
620
+ """Capture floor frame for live mode"""
621
+ if not file.content_type.startswith('image/'):
622
+ raise HTTPException(status_code=400, detail="File must be an image")
623
+
624
+ session_id = f"live_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{os.urandom(4).hex()}"
625
+
626
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.jpg')
627
+ content = await file.read()
628
+ temp_file.write(content)
629
+ temp_file.close()
630
+
631
+ tasks_storage[session_id] = {
632
+ 'type': 'live',
633
+ 'floor_frame': temp_file.name,
634
+ 'created_at': datetime.now().isoformat()
635
+ }
636
+
637
+ return {
638
+ "session_id": session_id,
639
+ "message": "Floor frame captured"
640
+ }
641
+
642
+
643
+ @app.post("/api/live/detect-frame/{session_id}")
644
+ async def detect_frame(session_id: str, file: UploadFile = File(...)):
645
+ """Detect footsteps in a single frame using LiveFootstepDetector"""
646
+ if session_id not in tasks_storage:
647
+ raise HTTPException(status_code=404, detail="Session not found")
648
+
649
+ if not file.content_type.startswith('image/'):
650
+ raise HTTPException(status_code=400, detail="File must be an image")
651
+
652
+ session = tasks_storage[session_id]
653
+
654
+ # Read frame
655
+ content = await file.read()
656
+ nparr = np.frombuffer(content, np.uint8)
657
+ frame = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
658
+
659
+ if frame is None:
660
+ raise HTTPException(status_code=400, detail="Failed to decode frame")
661
+
662
+ # Initialize detector if not already done
663
+ if 'detector' not in session:
664
+ try:
665
+ # Get audio path from session or use default
666
+ audio_path = session.get('audio_path', 'backend/audio/UrbanFootstepsConcrete.mp3')
667
+ sensitivity = session.get('sensitivity', 'medium')
668
+ yolo_conf = session.get('yolo_conf', 0.5)
669
+
670
+ # Check if audio file exists
671
+ if not os.path.exists(audio_path):
672
+ # Try alternative paths
673
+ alt_paths = [
674
+ 'audio/UrbanFootstepsConcrete.mp3',
675
+ 'backend/audio/Footsteps on Gravel Path Outdoor.mp3',
676
+ 'audio/Footsteps on Gravel Path Outdoor.mp3'
677
+ ]
678
+ audio_found = False
679
+ for alt_path in alt_paths:
680
+ if os.path.exists(alt_path):
681
+ audio_path = alt_path
682
+ audio_found = True
683
+ break
684
+ if not audio_found:
685
+ raise HTTPException(status_code=404,
686
+ detail=f"Audio file not found. Searched paths: {audio_path}, {alt_paths}")
687
+
688
+ # Create detector instance
689
+ detector = LiveFootstepDetector(
690
+ audio_path=audio_path,
691
+ sensitivity=sensitivity,
692
+ yolo_conf=yolo_conf
693
+ )
694
+ detector.start() # Start audio playback thread
695
+
696
+ session['detector'] = detector
697
+ session['detector_started'] = datetime.now().isoformat()
698
+
699
+ except Exception as e:
700
+ raise HTTPException(status_code=500, detail=f"Failed to initialize detector: {str(e)}")
701
+
702
+ detector = session['detector']
703
+
704
+ # Process frame with detector
705
+ try:
706
+ processed_frame, detected_foot = detector.process_frame(frame)
707
+
708
+ # Encode processed frame back to JPEG
709
+ _, buffer = cv2.imencode('.jpg', processed_frame)
710
+ frame_base64 = base64.b64encode(buffer).decode('utf-8')
711
+
712
+ response = {
713
+ "session_id": session_id,
714
+ "detected": detected_foot is not None,
715
+ "foot": detected_foot, # 'LEFT', 'RIGHT', or None
716
+ "frame": frame_base64, # Processed frame with annotations
717
+ "message": f"{detected_foot} STRIKE!" if detected_foot else "Frame processed"
718
+ }
719
+
720
+ # Update session stats
721
+ if 'detection_count' not in session:
722
+ session['detection_count'] = 0
723
+ if detected_foot:
724
+ session['detection_count'] += 1
725
+ session['last_detection'] = {
726
+ 'foot': detected_foot,
727
+ 'timestamp': datetime.now().isoformat()
728
+ }
729
+
730
+ return response
731
+
732
+ except Exception as e:
733
+ raise HTTPException(status_code=500, detail=f"Frame processing error: {str(e)}")
734
+
735
+
736
+ @app.post("/api/live/generate-audio/{session_id}")
737
+ async def generate_audio(session_id: str):
738
+ """Generate audio for live detection based on floor analysis"""
739
+ if session_id not in tasks_storage:
740
+ raise HTTPException(status_code=404, detail="Session not found")
741
+
742
+ session = tasks_storage[session_id]
743
+
744
+ if 'floor_frame' not in session:
745
+ raise HTTPException(status_code=400, detail="No floor frame found")
746
+
747
+ # For now, we'll use a default audio path based on common floor types
748
+ # In a real implementation, this could use LLM vision to analyze the floor
749
+ # and select the appropriate audio file
750
+
751
+ # Default audio paths to try
752
+ audio_paths = [
753
+ 'audio/Footsteps on Gravel Path Outdoor.mp3'
754
+ ]
755
+
756
+ audio_path = None
757
+ for path in audio_paths:
758
+ if os.path.exists(path):
759
+ audio_path = path
760
+ break
761
+
762
+ if not audio_path:
763
+ raise HTTPException(
764
+ status_code=404,
765
+ detail=f"No audio file found. Please ensure audio files exist in backend/audio/ directory. Searched: {audio_paths}"
766
+ )
767
+
768
+ # Store audio path in session for later use
769
+ session['audio_path'] = audio_path
770
+ session['audio_ready'] = True
771
+ session['surface_type'] = 'gravel' # Matches the default gravel clip selected above; could be refined with LLM floor analysis
772
+
773
+ return {
774
+ "session_id": session_id,
775
+ "message": "Audio generated successfully",
776
+ "surface_type": session['surface_type'],
777
+ "audio_ready": True
778
+ }
779
+
780
+
781
+ @app.post("/api/live/stop-session/{session_id}")
782
+ async def stop_live_session(session_id: str):
783
+ """Stop live detection session and cleanup resources"""
784
+ if session_id not in tasks_storage:
785
+ raise HTTPException(status_code=404, detail="Session not found")
786
+
787
+ session = tasks_storage[session_id]
788
+
789
+ # Stop detector if exists
790
+ if 'detector' in session:
791
+ try:
792
+ detector = session['detector']
793
+ detector.stop()
794
+ del session['detector']
795
+ except Exception as e:
796
+ print(f"Error stopping detector: {e}")
797
+
798
+ # Cleanup floor frame
799
+ if 'floor_frame' in session:
800
+ try:
801
+ if os.path.exists(session['floor_frame']):
802
+ os.remove(session['floor_frame'])
803
+ except Exception as e:
804
+ print(f"Error removing floor frame: {e}")
805
+
806
+ # Get stats before deletion
807
+ detection_count = session.get('detection_count', 0)
808
+ last_detection = session.get('last_detection', None)
809
+
810
+ # Remove session
811
+ del tasks_storage[session_id]
812
+
813
+ return {
814
+ "session_id": session_id,
815
+ "message": "Session stopped",
816
+ "stats": {
817
+ "detection_count": detection_count,
818
+ "last_detection": last_detection
819
+ }
820
+ }
821
+
822
+
823
+ '''if __name__ == "__main__":
824
+ import uvicorn
825
+
826
+ uvicorn.run(app, host="0.0.0.0", port=8000)
827
+ '''
828
+
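For quick reference, a minimal client-side sketch (Python, using requests) of how the live-detection endpoints above might be exercised. Assumptions: the API is reachable at http://localhost:8000, SESSION_ID comes from the session-creation endpoint defined earlier in link2.py, a floor frame has already been uploaded for that session, and the detect-frame route follows the same /api/live/... pattern as the decorators shown above (its own decorator sits just outside this hunk). The file names and session id are placeholders.

import requests

BASE = "http://localhost:8000"          # assumed host/port
SESSION_ID = "replace-with-session-id"  # placeholder, returned by the session-creation endpoint

# 1) Ask the server to select/prepare a footstep clip for this session
resp = requests.post(f"{BASE}/api/live/generate-audio/{SESSION_ID}")
print(resp.json())  # expects audio_ready=True and a surface_type

# 2) Send one camera frame for footstep detection
with open("frame.jpg", "rb") as f:  # placeholder image
    resp = requests.post(
        f"{BASE}/api/live/detect-frame/{SESSION_ID}",  # assumed route for detect_frame()
        files={"file": ("frame.jpg", f, "image/jpeg")},
    )
data = resp.json()
print(data["detected"], data.get("foot"))  # e.g. True, "LEFT"

# 3) Stop the session and release the detector
resp = requests.post(f"{BASE}/api/live/stop-session/{SESSION_ID}")
print(resp.json()["stats"])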
qsec.py ADDED
@@ -0,0 +1,31 @@
1
+ import numpy as np
2
+ import librosa
3
+
4
+ def extract_second_audio_librosa(file_path, target_second=0, sample_rate=22050):
5
+ try:
6
+ # Load audio file
7
+ audio_data, sr = librosa.load(file_path, sr=sample_rate)
8
+
9
+ # Calculate start and end samples for the target second
10
+ start_sample = target_second * sr
11
+ end_sample = (target_second + 1) * sr
12
+
13
+ # Ensure we don't go beyond the audio length
14
+ if start_sample >= len(audio_data):
15
+ raise ValueError(f"Target second {target_second} is beyond audio length")
16
+
17
+ end_sample = min(end_sample, len(audio_data))
18
+
19
+ # Extract the second
20
+ second_audio = audio_data[start_sample:end_sample]
21
+
22
+ # If the audio is shorter than 1 second, pad with zeros
23
+ if len(second_audio) < sr:
24
+ second_audio = np.pad(second_audio, (0, sr - len(second_audio)))
25
+
26
+ return second_audio, sr
27
+
28
+ except Exception as e:
29
+ print(f"Error processing audio: {e}")
30
+ return None, None
31
+
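A small usage sketch for qsec.extract_second_audio_librosa: it returns exactly one second of mono audio starting at target_second (zero-padded if the file ends early), or (None, None) on failure. The clip and output paths below are placeholders.

import soundfile as sf

from qsec import extract_second_audio_librosa

chunk, sr = extract_second_audio_librosa("footsteps.mp3", target_second=5, sample_rate=44100)
if chunk is not None:
    print(f"{len(chunk)} samples at {sr} Hz")    # len(chunk) == sr, i.e. one second
    sf.write("footstep_second5.wav", chunk, sr)  # save the extracted second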
real.py ADDED
@@ -0,0 +1,1572 @@
1
+ import pandas as pd
2
+ import streamlit as st
3
+ import cv2
4
+ import numpy as np
5
+ import mediapipe as mp
6
+ from pathlib import Path
7
+ from scipy.signal import find_peaks, savgol_filter
8
+ import json
9
+ import subprocess
10
+ import os
11
+ import soundfile as sf
12
+ from datetime import datetime
13
+ import tempfile
14
+ from ultralytics import YOLO
15
+ from agent import process_video_for_footstep_audio
16
+ from sound_agent import main_sound
17
+ from qsec import extract_second_audio_librosa
18
+ import threading
19
+ import queue
20
+ import time
21
+ from PIL import Image
22
+ import io
23
+
24
+ # Suppress TensorFlow warnings
25
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
26
+ import absl.logging
27
+
28
+ absl.logging.set_verbosity(absl.logging.ERROR)
29
+
30
+
31
+ def get_ffmpeg_path():
32
+ """Get FFmpeg path with multiple fallback options"""
33
+ possible_paths = [
34
+ "ffmpeg", # Try system ffmpeg first (Docker/Linux)
35
+ r"C:\Users\abhiv\OneDrive\Desktop\agentic ai\SoundFeet\ffmpeg-7.1-essentials_build\bin\ffmpeg.exe", # Local Windows path
36
+ "./ffmpeg-7.1-essentials_build/bin/ffmpeg.exe", # Relative path
37
+ ]
38
+
39
+ for path in possible_paths:
40
+ if path == "ffmpeg":
41
+ try:
42
+ result = subprocess.run([path, '-version'], capture_output=True, timeout=5)
43
+ if result.returncode == 0:
44
+ return path
45
+ except Exception:
46
+ continue
47
+ else:
48
+ if os.path.exists(path):
49
+ return path
50
+ return "ffmpeg" # Default to system ffmpeg
51
+
52
+
53
+ FFMPEG_PATH = get_ffmpeg_path()
54
+
55
+ # Streamlit Configuration
56
+ st.set_page_config(
57
+ page_title="Hybrid YOLO-MediaPipe Footstep Detection",
58
+ page_icon="🎬",
59
+ layout="wide",
60
+ initial_sidebar_state="expanded"
61
+ )
62
+
63
+ st.markdown("""
64
+ <style>
65
+ .main-header {
66
+ font-size: 2.5rem;
67
+ font-weight: 700;
68
+ background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
69
+ -webkit-background-clip: text;
70
+ -webkit-text-fill-color: transparent;
71
+ margin-bottom: 2rem;
72
+ }
73
+ .metric-card {
74
+ background: #f0f2f6;
75
+ padding: 1rem;
76
+ border-radius: 0.5rem;
77
+ border-left: 4px solid #667eea;
78
+ }
79
+ .success-box {
80
+ padding: 1rem;
81
+ background: #d4edda;
82
+ border: 1px solid #c3e6cb;
83
+ border-radius: 0.5rem;
84
+ color: #155724;
85
+ }
86
+ .hybrid-badge {
87
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
88
+ color: white;
89
+ padding: 0.5rem 1rem;
90
+ border-radius: 20px;
91
+ display: inline-block;
92
+ font-weight: 600;
93
+ margin: 1rem 0;
94
+ }
95
+ .live-indicator {
96
+ background: #dc3545;
97
+ color: white;
98
+ padding: 0.5rem 1rem;
99
+ border-radius: 20px;
100
+ display: inline-block;
101
+ font-weight: 600;
102
+ animation: pulse 1.5s infinite;
103
+ }
104
+ @keyframes pulse {
105
+ 0%, 100% { opacity: 1; }
106
+ 50% { opacity: 0.5; }
107
+ }
108
+ .ready-badge {
109
+ background: #28a745;
110
+ color: white;
111
+ padding: 0.5rem 1rem;
112
+ border-radius: 20px;
113
+ display: inline-block;
114
+ font-weight: 600;
115
+ }
116
+ </style>
117
+ """, unsafe_allow_html=True)
118
+
119
+
120
+ class LiveFootstepDetector:
121
+ """Real-time footstep detection for live camera feed"""
122
+
123
+ def __init__(self, audio_path, sensitivity='medium', yolo_conf=0.5):
124
+ self.audio_path = audio_path
125
+ self.sensitivity = sensitivity
126
+ self.yolo_conf = yolo_conf
127
+ self.running = False
128
+ self.audio_ready = False
129
+
130
+ # Load footstep audio
131
+ try:
132
+ self.footstep_audio, self.sample_rate = extract_second_audio_librosa(
133
+ file_path=audio_path,
134
+ target_second=5,
135
+ sample_rate=44100
136
+ )
137
+ self.audio_ready = True
138
+ except Exception as e:
139
+ st.error(f"Failed to load audio: {str(e)}")
140
+ self.audio_ready = False
141
+
142
+ # Initialize detection models
143
+ try:
144
+ self.yolo_model = YOLO('yolov8n.pt')
145
+ self.mp_pose = mp.solutions.pose
146
+ self.pose = self.mp_pose.Pose(
147
+ static_image_mode=False,
148
+ model_complexity=1,
149
+ smooth_landmarks=True,
150
+ min_detection_confidence=0.5,
151
+ min_tracking_confidence=0.5
152
+ )
153
+ except Exception as e:
154
+ st.error(f"Failed to initialize models: {str(e)}")
155
+ return
156
+
157
+ # Landmark indices
158
+ self.LEFT_HEEL = 29
159
+ self.RIGHT_HEEL = 30
160
+
161
+ # Detection thresholds
162
+ self.thresholds = {
163
+ 'low': {'prominence': 0.02, 'velocity_threshold': 0.015},
164
+ 'medium': {'prominence': 0.015, 'velocity_threshold': 0.012},
165
+ 'high': {'prominence': 0.01, 'velocity_threshold': 0.010}
166
+ }[sensitivity]
167
+
168
+ # Tracking state
169
+ self.prev_left_y = None
170
+ self.prev_right_y = None
171
+ self.prev_time = None
172
+ self.left_buffer = []
173
+ self.right_buffer = []
174
+ self.buffer_size = 10
175
+
176
+ # Audio playback
177
+ self.audio_queue = queue.Queue()
178
+ self.audio_thread = None
179
+
180
+ def start_audio_playback(self):
181
+ """Start audio playback thread"""
182
+ if not self.audio_ready:
183
+ return
184
+
185
+ def play_audio():
186
+ import pyaudio
187
+ p = pyaudio.PyAudio()
188
+ stream = p.open(
189
+ format=pyaudio.paFloat32,
190
+ channels=1,
191
+ rate=self.sample_rate,
192
+ output=True
193
+ )
194
+
195
+ while self.running:
196
+ try:
197
+ foot = self.audio_queue.get(timeout=0.1)
198
+ # Play footstep sound
199
+ stream.write(self.footstep_audio.astype(np.float32).tobytes())
200
+ except queue.Empty:
201
+ continue
202
+ except Exception as e:
203
+ print(f"Audio playback error: {e}")
204
+
205
+ stream.stop_stream()
206
+ stream.close()
207
+ p.terminate()
208
+
209
+ self.audio_thread = threading.Thread(target=play_audio, daemon=True)
210
+ self.audio_thread.start()
211
+
212
+ def detect_heel_strike(self, current_y, prev_y, foot_buffer):
213
+ """Detect heel strike based on vertical velocity and position"""
214
+ if prev_y is None:
215
+ return False
216
+
217
+ # Calculate vertical velocity (downward is positive)
218
+ velocity = current_y - prev_y
219
+
220
+ # Add to buffer
221
+ foot_buffer.append(current_y)
222
+ if len(foot_buffer) > self.buffer_size:
223
+ foot_buffer.pop(0)
224
+
225
+ if len(foot_buffer) < 5:
226
+ return False
227
+
228
+ # Detect strike: downward movement followed by stabilization
229
+ # Current position is low (heel on ground)
230
+ # Recent movement was downward
231
+ # Velocity is slowing (strike impact)
232
+ recent_velocities = [foot_buffer[i + 1] - foot_buffer[i]
233
+ for i in range(len(foot_buffer) - 1)]
234
+
235
+ avg_velocity = np.mean(recent_velocities[-3:]) if len(recent_velocities) >= 3 else 0
236
+
237
+ is_strike = (
238
+ current_y > 0.7 and # Heel is low in frame
239
+ velocity > self.thresholds['velocity_threshold'] and # Moving down
240
+ avg_velocity < velocity * 0.5 # Velocity decreasing (impact)
241
+ )
242
+
243
+ return is_strike
244
+
245
+ def process_frame(self, frame):
246
+ """Process single frame and detect footsteps"""
247
+ if not self.audio_ready:
248
+ return frame, None
249
+
250
+ detected_foot = None
251
+
252
+ try:
253
+ # YOLO detection
254
+ results = self.yolo_model(frame, conf=self.yolo_conf, classes=[0], verbose=False)
255
+
256
+ person_detected = False
257
+ bbox = None
258
+
259
+ for result in results:
260
+ boxes = result.boxes
261
+ if len(boxes) > 0:
262
+ person_detected = True
263
+ box = boxes[0] # Take first person
264
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
265
+ bbox = (int(x1), int(y1), int(x2), int(y2))
266
+
267
+ # Draw YOLO bbox
268
+ cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
269
+ (255, 255, 0), 2)
270
+ break
271
+
272
+ # MediaPipe pose estimation
273
+ if person_detected and bbox:
274
+ # Crop to person region with padding
275
+ x1, y1, x2, y2 = bbox
276
+ pad = 20
277
+ x1 = max(0, x1 - pad)
278
+ y1 = max(0, y1 - pad)
279
+ x2 = min(frame.shape[1], x2 + pad)
280
+ y2 = min(frame.shape[0], y2 + pad)
281
+
282
+ cropped = frame[y1:y2, x1:x2]
283
+ rgb_frame = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
284
+ pose_results = self.pose.process(rgb_frame)
285
+
286
+ if pose_results.pose_landmarks:
287
+ landmarks = pose_results.pose_landmarks.landmark
288
+
289
+ # Get heel positions (adjusted to full frame)
290
+ left_heel = landmarks[self.LEFT_HEEL]
291
+ right_heel = landmarks[self.RIGHT_HEEL]
292
+
293
+ left_y = (left_heel.y * (y2 - y1) + y1) / frame.shape[0]
294
+ right_y = (right_heel.y * (y2 - y1) + y1) / frame.shape[0]
295
+
296
+ # Detect strikes
297
+ left_strike = self.detect_heel_strike(
298
+ left_y, self.prev_left_y, self.left_buffer
299
+ )
300
+ right_strike = self.detect_heel_strike(
301
+ right_y, self.prev_right_y, self.right_buffer
302
+ )
303
+
304
+ if left_strike:
305
+ detected_foot = 'LEFT'
306
+ self.audio_queue.put('LEFT')
307
+ elif right_strike:
308
+ detected_foot = 'RIGHT'
309
+ self.audio_queue.put('RIGHT')
310
+
311
+ # Update previous positions
312
+ self.prev_left_y = left_y
313
+ self.prev_right_y = right_y
314
+
315
+ # Draw skeleton on full frame
316
+ for landmark in landmarks:
317
+ x = int((landmark.x * (x2 - x1) + x1))
318
+ y = int((landmark.y * (y2 - y1) + y1))
319
+ cv2.circle(frame, (x, y), 3, (0, 255, 0), -1)
320
+
321
+ # Highlight heels
322
+ left_heel_x = int((left_heel.x * (x2 - x1) + x1))
323
+ left_heel_y = int((left_heel.y * (y2 - y1) + y1))
324
+ right_heel_x = int((right_heel.x * (x2 - x1) + x1))
325
+ right_heel_y = int((right_heel.y * (y2 - y1) + y1))
326
+
327
+ cv2.circle(frame, (left_heel_x, left_heel_y), 8, (0, 255, 0), -1)
328
+ cv2.circle(frame, (right_heel_x, right_heel_y), 8, (0, 100, 255), -1)
329
+
330
+ if detected_foot:
331
+ # Show strike indicator
332
+ heel_x = left_heel_x if detected_foot == 'LEFT' else right_heel_x
333
+ heel_y = left_heel_y if detected_foot == 'LEFT' else right_heel_y
334
+ color = (0, 255, 0) if detected_foot == 'LEFT' else (0, 100, 255)
335
+
336
+ cv2.circle(frame, (heel_x, heel_y), 30, color, 3)
337
+ cv2.putText(frame, f"{detected_foot} STRIKE!",
338
+ (heel_x - 50, heel_y - 40),
339
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
340
+
341
+ # Draw status
342
+ status_text = "READY" if self.audio_ready else "NO AUDIO"
343
+ status_color = (0, 255, 0) if self.audio_ready else (0, 0, 255)
344
+ cv2.rectangle(frame, (10, 10), (150, 50), (0, 0, 0), -1)
345
+ cv2.putText(frame, status_text, (20, 35),
346
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, status_color, 2)
347
+
348
+ except Exception as e:
349
+ print(f"Frame processing error: {e}")
350
+
351
+ return frame, detected_foot
352
+
353
+ def start(self):
354
+ """Start the detector"""
355
+ self.running = True
356
+ self.start_audio_playback()
357
+
358
+ def stop(self):
359
+ """Stop the detector"""
360
+ self.running = False
361
+ if self.audio_thread:
362
+ self.audio_thread.join(timeout=2)
363
+
364
+
365
+ class HybridFootstepDetectionPipeline:
366
+ """
367
+ Hybrid Detection Pipeline for video files:
368
+ 1. YOLO detects person bounding boxes
369
+ 2. MediaPipe estimates pose on detected regions
370
+ 3. Track footsteps with improved accuracy
371
+ """
372
+
373
+ def __init__(self, fps=30, sensitivity='medium', yolo_conf=0.5):
374
+ self.fps = fps
375
+ self.sensitivity = sensitivity
376
+ self.yolo_conf = yolo_conf
377
+
378
+ # Initialize YOLO detector
379
+ try:
380
+ self.yolo_model = YOLO('yolov8n.pt')
381
+ st.success("βœ… YOLO detector loaded successfully")
382
+ except Exception as e:
383
+ st.warning(f"⚠️ YOLO loading issue: {str(e)}. Downloading model...")
384
+ try:
385
+ self.yolo_model = YOLO('yolov8n.pt')
386
+ st.success("βœ… YOLO detector loaded successfully")
387
+ except Exception as e2:
388
+ st.error(f"❌ Failed to load YOLO: {str(e2)}")
389
+ self.yolo_model = None
390
+
391
+ # Initialize MediaPipe pose estimator
392
+ try:
393
+ self.mp_pose = mp.solutions.pose
394
+ self.pose = self.mp_pose.Pose(
395
+ static_image_mode=False,
396
+ model_complexity=1,
397
+ smooth_landmarks=True,
398
+ min_detection_confidence=0.5,
399
+ min_tracking_confidence=0.5
400
+ )
401
+ st.success("βœ… MediaPipe pose estimator loaded successfully")
402
+ except Exception as e:
403
+ st.error(f"❌ Failed to initialize MediaPipe: {str(e)}")
404
+ self.pose = None
405
+
406
+ # Landmark indices
407
+ self.LEFT_HEEL = 29
408
+ self.RIGHT_HEEL = 30
409
+ self.LEFT_ANKLE = 27
410
+ self.RIGHT_ANKLE = 28
411
+
412
+ # Detection thresholds
413
+ self.thresholds = {
414
+ 'low': {'prominence': 0.02, 'min_interval': 0.4},
415
+ 'medium': {'prominence': 0.015, 'min_interval': 0.3},
416
+ 'high': {'prominence': 0.01, 'min_interval': 0.25}
417
+ }[sensitivity]
418
+
419
+ # Tracking state
420
+ self.person_tracker = PersonTracker()
421
+
422
+ def detect_person_yolo(self, frame):
423
+ """Detect person using YOLO"""
424
+ if self.yolo_model is None:
425
+ return []
426
+
427
+ try:
428
+ results = self.yolo_model(frame, conf=self.yolo_conf, classes=[0], verbose=False)
429
+
430
+ person_boxes = []
431
+ for result in results:
432
+ boxes = result.boxes
433
+ for box in boxes:
434
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
435
+ conf = box.conf[0].cpu().numpy()
436
+ person_boxes.append((int(x1), int(y1), int(x2), int(y2), float(conf)))
437
+
438
+ return person_boxes
439
+ except Exception as e:
440
+ st.warning(f"YOLO detection failed: {str(e)}")
441
+ return []
442
+
443
+ def estimate_pose_mediapipe(self, frame, bbox=None):
444
+ """Estimate pose using MediaPipe on specified region"""
445
+ if self.pose is None:
446
+ return None
447
+
448
+ try:
449
+ if bbox is not None:
450
+ x1, y1, x2, y2 = bbox
451
+ pad = 20
452
+ x1 = max(0, x1 - pad)
453
+ y1 = max(0, y1 - pad)
454
+ x2 = min(frame.shape[1], x2 + pad)
455
+ y2 = min(frame.shape[0], y2 + pad)
456
+
457
+ cropped = frame[y1:y2, x1:x2]
458
+ if cropped.size == 0:
459
+ return None
460
+
461
+ rgb_frame = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
462
+ results = self.pose.process(rgb_frame)
463
+
464
+ if results.pose_landmarks:
465
+ for landmark in results.pose_landmarks.landmark:
466
+ landmark.x = (landmark.x * (x2 - x1) + x1) / frame.shape[1]
467
+ landmark.y = (landmark.y * (y2 - y1) + y1) / frame.shape[0]
468
+
469
+ return results
470
+ else:
471
+ rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
472
+ return self.pose.process(rgb_frame)
473
+
474
+ except Exception as e:
475
+ return None
476
+
477
+ def process_video(self, video_path, progress_callback=None):
478
+ """Process video with hybrid YOLO-MediaPipe pipeline"""
479
+
480
+ if self.yolo_model is None or self.pose is None:
481
+ st.error("❌ Detection models not available")
482
+ return None
483
+
484
+ cap = cv2.VideoCapture(str(video_path))
485
+ if not cap.isOpened():
486
+ st.error("❌ Could not open video file")
487
+ return None
488
+
489
+ fps = cap.get(cv2.CAP_PROP_FPS)
490
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
491
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
492
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
493
+
494
+ if fps <= 0 or total_frames <= 0:
495
+ st.error("❌ Invalid video properties")
496
+ cap.release()
497
+ return None
498
+
499
+ left_positions = []
500
+ right_positions = []
501
+ detection_confidence = []
502
+ frame_idx = 0
503
+
504
+ yolo_detections = 0
505
+ pose_detections = 0
506
+
507
+ st.info(f"πŸ”„ Processing with Hybrid Pipeline: {total_frames} frames")
508
+
509
+ try:
510
+ while cap.isOpened():
511
+ ret, frame = cap.read()
512
+ if not ret:
513
+ break
514
+
515
+ person_boxes = self.detect_person_yolo(frame)
516
+
517
+ if person_boxes:
518
+ yolo_detections += 1
519
+ best_box = self.person_tracker.select_best_person(person_boxes, frame_idx)
520
+ bbox = best_box[:4]
521
+ results = self.estimate_pose_mediapipe(frame, bbox)
522
+
523
+ if results and results.pose_landmarks:
524
+ pose_detections += 1
525
+ landmarks = results.pose_landmarks.landmark
526
+
527
+ left_y = landmarks[self.LEFT_HEEL].y
528
+ right_y = landmarks[self.RIGHT_HEEL].y
529
+ conf = (landmarks[self.LEFT_HEEL].visibility +
530
+ landmarks[self.RIGHT_HEEL].visibility) / 2
531
+
532
+ left_positions.append(left_y)
533
+ right_positions.append(right_y)
534
+ detection_confidence.append(conf)
535
+ else:
536
+ left_positions.append(np.nan)
537
+ right_positions.append(np.nan)
538
+ detection_confidence.append(0.0)
539
+ else:
540
+ results = self.estimate_pose_mediapipe(frame, bbox=None)
541
+
542
+ if results and results.pose_landmarks:
543
+ pose_detections += 1
544
+ landmarks = results.pose_landmarks.landmark
545
+
546
+ left_positions.append(landmarks[self.LEFT_HEEL].y)
547
+ right_positions.append(landmarks[self.RIGHT_HEEL].y)
548
+ detection_confidence.append(0.5)
549
+ else:
550
+ left_positions.append(np.nan)
551
+ right_positions.append(np.nan)
552
+ detection_confidence.append(0.0)
553
+
554
+ frame_idx += 1
555
+
556
+ if progress_callback and frame_idx % 10 == 0:
557
+ progress = min(frame_idx / total_frames, 1.0)
558
+ progress_callback(progress)
559
+
560
+ except Exception as e:
561
+ st.error(f"❌ Video processing error: {str(e)}")
562
+ cap.release()
563
+ return None
564
+
565
+ cap.release()
566
+
567
+ st.info(
568
+ f"πŸ“Š YOLO detections: {yolo_detections}/{total_frames} frames ({yolo_detections / total_frames * 100:.1f}%)")
569
+ st.info(
570
+ f"πŸ“Š Pose detections: {pose_detections}/{total_frames} frames ({pose_detections / total_frames * 100:.1f}%)")
571
+
572
+ if len(left_positions) == 0:
573
+ st.error("❌ No frames processed successfully")
574
+ return None
575
+
576
+ try:
577
+ left_series = pd.Series(left_positions).interpolate(method='linear')
578
+ left_series = left_series.bfill().ffill()
579
+ left_positions = left_series.values
580
+
581
+ right_series = pd.Series(right_positions).interpolate(method='linear')
582
+ right_series = right_series.bfill().ffill()
583
+ right_positions = right_series.values
584
+
585
+ if len(left_positions) > 5:
586
+ window = min(11, len(left_positions) if len(left_positions) % 2 == 1 else len(left_positions) - 1)
587
+ if window >= 3:
588
+ left_positions = savgol_filter(left_positions, window, 2)
589
+ right_positions = savgol_filter(right_positions, window, 2)
590
+
591
+ left_strikes = self._detect_strikes(left_positions, fps)
592
+ right_strikes = self._detect_strikes(right_positions, fps)
593
+
594
+ events = []
595
+
596
+ for frame in left_strikes:
597
+ events.append({
598
+ 'frame': int(frame),
599
+ 'timecode': self._frames_to_smpte(frame, fps),
600
+ 'foot': 'LEFT',
601
+ 'event': 'HEEL_STRIKE',
602
+ 'time_seconds': frame / fps,
603
+ 'confidence': detection_confidence[int(frame)] if int(frame) < len(detection_confidence) else 0.5
604
+ })
605
+
606
+ for frame in right_strikes:
607
+ events.append({
608
+ 'frame': int(frame),
609
+ 'timecode': self._frames_to_smpte(frame, fps),
610
+ 'foot': 'RIGHT',
611
+ 'event': 'HEEL_STRIKE',
612
+ 'time_seconds': frame / fps,
613
+ 'confidence': detection_confidence[int(frame)] if int(frame) < len(detection_confidence) else 0.5
614
+ })
615
+
616
+ events = sorted(events, key=lambda x: x['frame'])
617
+
618
+ return {
619
+ 'events': events,
620
+ 'fps': fps,
621
+ 'total_frames': total_frames,
622
+ 'width': width,
623
+ 'height': height,
624
+ 'left_positions': left_positions.tolist() if hasattr(left_positions, 'tolist') else left_positions,
625
+ 'right_positions': right_positions.tolist() if hasattr(right_positions, 'tolist') else right_positions,
626
+ 'detection_stats': {
627
+ 'yolo_detections': yolo_detections,
628
+ 'pose_detections': pose_detections,
629
+ 'total_frames': total_frames
630
+ }
631
+ }
632
+
633
+ except Exception as e:
634
+ st.error(f"❌ Data processing error: {str(e)}")
635
+ return None
636
+
637
+ def _detect_strikes(self, positions, fps):
638
+ """Detect heel strikes from position data"""
639
+ try:
640
+ peaks, _ = find_peaks(
641
+ positions,
642
+ prominence=self.thresholds['prominence'],
643
+ distance=int(fps * self.thresholds['min_interval']),
644
+ height=0.7
645
+ )
646
+ return peaks
647
+ except Exception as e:
648
+ st.warning(f"Peak detection failed: {str(e)}")
649
+ return np.array([])
650
+
651
+ def _frames_to_smpte(self, frame, fps):
652
+ """Convert frame number to SMPTE timecode"""
653
+ total_seconds = frame / fps
654
+ hours = int(total_seconds // 3600)
655
+ minutes = int((total_seconds % 3600) // 60)
656
+ seconds = int(total_seconds % 60)
657
+ frames = int((total_seconds * fps) % fps)
658
+ return f"{hours:02d}:{minutes:02d}:{seconds:02d}:{frames:02d}"
659
+
660
+
661
+ class PersonTracker:
662
+ """Track person across frames for consistency"""
663
+
664
+ def __init__(self, iou_threshold=0.3):
665
+ self.tracked_box = None
666
+ self.last_frame = -1
667
+ self.iou_threshold = iou_threshold
668
+
669
+ def calculate_iou(self, box1, box2):
670
+ """Calculate IoU between two bounding boxes"""
671
+ x1_1, y1_1, x2_1, y2_1 = box1[:4]
672
+ x1_2, y1_2, x2_2, y2_2 = box2[:4]
673
+
674
+ xi1 = max(x1_1, x1_2)
675
+ yi1 = max(y1_1, y1_2)
676
+ xi2 = min(x2_1, x2_2)
677
+ yi2 = min(y2_1, y2_2)
678
+
679
+ inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
680
+
681
+ box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
682
+ box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
683
+
684
+ union_area = box1_area + box2_area - inter_area
685
+
686
+ return inter_area / union_area if union_area > 0 else 0
687
+
688
+ def select_best_person(self, person_boxes, frame_idx):
689
+ """Select best person box for tracking consistency"""
690
+ if not person_boxes:
691
+ return None
692
+
693
+ if self.tracked_box is not None and frame_idx - self.last_frame < 10:
694
+ max_iou = 0
695
+ best_box = None
696
+
697
+ for box in person_boxes:
698
+ iou = self.calculate_iou(self.tracked_box, box)
699
+ if iou > max_iou:
700
+ max_iou = iou
701
+ best_box = box
702
+
703
+ if max_iou > self.iou_threshold:
704
+ self.tracked_box = best_box
705
+ self.last_frame = frame_idx
706
+ return best_box
707
+
708
+ best_box = max(person_boxes, key=lambda x: (x[2] - x[0]) * (x[3] - x[1]) * x[4])
709
+ self.tracked_box = best_box
710
+ self.last_frame = frame_idx
711
+ return best_box
712
+
713
+
714
+ class AudioGenerator:
715
+ """Generate footstep audio"""
716
+
717
+ def __init__(self, sample_rate=44100):
718
+ self.sample_rate = sample_rate
719
+
720
+ def generate_footstep(self, aud_path):
721
+ arr, rate = extract_second_audio_librosa(
722
+ file_path=aud_path,
723
+ target_second=5,
724
+ sample_rate=self.sample_rate
725
+ )
726
+ return arr
727
+
728
+ def create_audio_track(self, events, aud_path, duration=0.3):
729
+ total_samples = int(duration * self.sample_rate)
730
+ audio_track = np.zeros(total_samples, dtype=np.float32)
731
+
732
+ for i, event in enumerate(events):
733
+ step_sound = self.generate_footstep(aud_path)
734
+ pitch_shift = 1.0 + (i % 5 - 2) * 0.03
735
+ indices = np.arange(len(step_sound)) * pitch_shift
736
+ indices = np.clip(indices, 0, len(step_sound) - 1).astype(int)
737
+ step_sound = step_sound[indices]
738
+
739
+ start_sample = int(event['time_seconds'] * self.sample_rate)
740
+ end_sample = min(start_sample + len(step_sound), total_samples)
741
+ sound_len = end_sample - start_sample
742
+
743
+ if sound_len > 0:
744
+ audio_track[start_sample:end_sample] += step_sound[:sound_len]
745
+
746
+ max_val = np.max(np.abs(audio_track))
747
+ if max_val > 0:
748
+ audio_track = audio_track / max_val * 0.8
749
+
750
+ return audio_track
751
+
752
+
753
+ def create_annotated_video(input_path, events, output_path, use_hybrid=True, progress_callback=None):
754
+ """Create annotated video with hybrid detection visualization"""
755
+
756
+ try:
757
+ cap = cv2.VideoCapture(str(input_path))
758
+ if not cap.isOpened():
759
+ st.error("❌ Could not open input video file")
760
+ return False
761
+
762
+ fps = cap.get(cv2.CAP_PROP_FPS)
763
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
764
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
765
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
766
+
767
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
768
+ out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))
769
+
770
+ if not out.isOpened():
771
+ st.error("❌ Could not create output video file")
772
+ cap.release()
773
+ return False
774
+
775
+ event_frames = {e['frame']: e for e in events}
776
+
777
+ if use_hybrid:
778
+ yolo_model = YOLO('yolov8n.pt')
779
+ mp_pose = mp.solutions.pose
780
+ pose = mp_pose.Pose(
781
+ static_image_mode=False,
782
+ model_complexity=1,
783
+ smooth_landmarks=True,
784
+ min_detection_confidence=0.5,
785
+ min_tracking_confidence=0.5
786
+ )
787
+ else:
788
+ yolo_model = None
789
+ mp_pose = mp.solutions.pose
790
+ pose = mp_pose.Pose(
791
+ static_image_mode=False,
792
+ model_complexity=1,
793
+ smooth_landmarks=True,
794
+ min_detection_confidence=0.5,
795
+ min_tracking_confidence=0.5
796
+ )
797
+
798
+ frame_idx = 0
799
+
800
+ while cap.isOpened():
801
+ ret, frame = cap.read()
802
+ if not ret:
803
+ break
804
+
805
+ try:
806
+ if use_hybrid and yolo_model:
807
+ results = yolo_model(frame, conf=0.5, classes=[0], verbose=False)
808
+ for result in results:
809
+ boxes = result.boxes
810
+ for box in boxes:
811
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
812
+ conf = box.conf[0].cpu().numpy()
813
+
814
+ cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)),
815
+ (255, 255, 0), 2)
816
+ cv2.putText(frame, f'YOLO: {conf:.2f}',
817
+ (int(x1), int(y1) - 10),
818
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2)
819
+
820
+ results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
821
+
822
+ if results.pose_landmarks:
823
+ mp.solutions.drawing_utils.draw_landmarks(
824
+ frame,
825
+ results.pose_landmarks,
826
+ mp_pose.POSE_CONNECTIONS,
827
+ landmark_drawing_spec=mp.solutions.drawing_utils.DrawingSpec(
828
+ color=(0, 255, 0), thickness=2, circle_radius=2
829
+ ),
830
+ connection_drawing_spec=mp.solutions.drawing_utils.DrawingSpec(
831
+ color=(255, 255, 255), thickness=2
832
+ )
833
+ )
834
+
835
+ if frame_idx in event_frames:
836
+ event = event_frames[frame_idx]
837
+
838
+ banner_height = 100
839
+ cv2.rectangle(frame, (0, 0), (width, banner_height), (0, 0, 0), -1)
840
+
841
+ text = f"{event['foot']} HEEL STRIKE"
842
+ color = (0, 255, 0) if event['foot'] == 'LEFT' else (0, 100, 255)
843
+
844
+ cv2.putText(frame, text, (50, 50),
845
+ cv2.FONT_HERSHEY_SIMPLEX, 1.5, color, 3)
846
+
847
+ if 'confidence' in event:
848
+ conf_text = f"Conf: {event['confidence']:.2f}"
849
+ cv2.putText(frame, conf_text, (50, 85),
850
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
851
+
852
+ circle_x = 50 if event['foot'] == 'LEFT' else width - 50
853
+ cv2.circle(frame, (circle_x, height - 100), 40, color, -1)
854
+
855
+ if use_hybrid:
856
+ cv2.rectangle(frame, (width - 250, 10), (width - 10, 50), (102, 126, 234), -1)
857
+ cv2.putText(frame, "HYBRID MODE", (width - 240, 35),
858
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
859
+
860
+ time_seconds = frame_idx / fps
861
+ hours = int(time_seconds // 3600)
862
+ minutes = int((time_seconds % 3600) // 60)
863
+ seconds = int(time_seconds % 60)
864
+ frame_num = int((time_seconds * fps) % fps)
865
+ timecode = f"TC: {hours:02d}:{minutes:02d}:{seconds:02d}:{frame_num:02d}"
866
+
867
+ cv2.rectangle(frame, (0, height - 80), (400, height), (0, 0, 0), -1)
868
+ cv2.putText(frame, timecode, (10, height - 30),
869
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
870
+ cv2.putText(frame, f"Frame: {frame_idx}/{total_frames}", (10, height - 55),
871
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
872
+
873
+ out.write(frame)
874
+ frame_idx += 1
875
+
876
+ if progress_callback and frame_idx % 5 == 0:
877
+ progress = min(frame_idx / total_frames, 1.0)
878
+ progress_callback(progress)
879
+
880
+ except Exception as e:
881
+ st.warning(f"⚠️ Error processing frame {frame_idx}: {str(e)}")
882
+ frame_idx += 1
883
+ continue
884
+
885
+ cap.release()
886
+ out.release()
887
+ pose.close()
888
+
889
+ return True
890
+
891
+ except Exception as e:
892
+ st.error(f"❌ Video annotation failed: {str(e)}")
893
+ try:
894
+ cap.release()
895
+ out.release()
896
+ pose.close()
897
+ except:
898
+ pass
899
+ return False
900
+
901
+
902
+ def merge_audio_with_video(video_path, audio_track, sample_rate, output_path):
903
+ """Merge audio with video using FFmpeg"""
904
+
905
+ temp_audio = tempfile.mktemp(suffix='.wav')
906
+ sf.write(temp_audio, audio_track, sample_rate)
907
+
908
+ ffmpeg_cmd = FFMPEG_PATH if FFMPEG_PATH else "ffmpeg"
909
+
910
+ cmd = [
911
+ ffmpeg_cmd, '-y',
912
+ '-i', str(video_path),
913
+ '-i', temp_audio,
914
+ '-map', '0:v', '-map', '1:a',
915
+ '-c:v', 'libx264', '-preset', 'medium',
916
+ '-c:a', 'aac', '-b:a', '192k',
917
+ '-shortest',
918
+ str(output_path)
919
+ ]
920
+
921
+ try:
922
+ if FFMPEG_PATH is None:
923
+ st.warning("FFmpeg not found. Using fallback method.")
924
+ return None
925
+
926
+ result = subprocess.run(cmd, check=True, capture_output=True, text=True, timeout=300)  # libx264 re-encoding can take well over 30 s
927
+ return True
928
+
929
+ except subprocess.CalledProcessError as e:
930
+ st.error(f"FFmpeg error: {e.stderr}")
931
+ return False
932
+ except subprocess.TimeoutExpired:
933
+ st.error("FFmpeg timed out")
934
+ return False
935
+ finally:
936
+ if os.path.exists(temp_audio):
937
+ os.remove(temp_audio)
938
+
939
+
940
+ def live_streaming_mode():
941
+ """Live streaming mode with frame capture and real-time detection"""
942
+
943
+ st.markdown('<h2>πŸ“Ή Live Streaming Mode</h2>', unsafe_allow_html=True)
944
+ st.info("πŸŽ₯ This mode allows real-time footstep detection with your device camera")
945
+
946
+ # Initialize session state
947
+ if 'floor_frame_captured' not in st.session_state:
948
+ st.session_state.floor_frame_captured = False
949
+ if 'audio_downloaded' not in st.session_state:
950
+ st.session_state.audio_downloaded = False
951
+ if 'live_audio_path' not in st.session_state:
952
+ st.session_state.live_audio_path = None
953
+ if 'live_detector' not in st.session_state:
954
+ st.session_state.live_detector = None
955
+ if 'camera_active' not in st.session_state:
956
+ st.session_state.camera_active = False
957
+
958
+ # Step 1: Capture floor frame
959
+ st.markdown("### Step 1: Capture Floor Frame πŸ“Έ")
960
+ st.write("Capture a single frame showing the floor surface for audio analysis")
961
+
962
+ col1, col2 = st.columns([2, 1])
963
+
964
+ with col1:
965
+ # Camera input for frame capture
966
+ camera_image = st.camera_input("Capture floor image", key="floor_capture")
967
+
968
+ if camera_image is not None and not st.session_state.floor_frame_captured:
969
+ # Save captured frame
970
+ image = Image.open(camera_image)
971
+ temp_frame_path = tempfile.mktemp(suffix='.jpg')
972
+ image.save(temp_frame_path)
973
+ st.session_state.floor_frame_path = temp_frame_path
974
+
975
+ # Display captured frame
976
+ st.image(image, caption="Captured Floor Frame", use_container_width=True)
977
+
978
+ if st.button("βœ… Confirm Floor Capture", type="primary", use_container_width=True):
979
+ st.session_state.floor_frame_captured = True
980
+ st.success("βœ… Floor frame captured successfully!")
981
+ st.rerun()
982
+
983
+ with col2:
984
+ if st.session_state.floor_frame_captured:
985
+ st.markdown('<div class="success-box">βœ… Floor Captured</div>', unsafe_allow_html=True)
986
+ else:
987
+ st.info("πŸ“Έ Capture floor frame to proceed")
988
+
989
+ # Step 2: Analyze and download audio
990
+ if st.session_state.floor_frame_captured and not st.session_state.audio_downloaded:
991
+ st.markdown("---")
992
+ st.markdown("### Step 2: Analyze Floor & Download Audio πŸ”Š")
993
+
994
+ col1, col2 = st.columns([2, 1])
995
+
996
+ with col1:
997
+ if st.button("πŸ” Analyze Floor & Generate Audio", type="primary", use_container_width=True):
998
+ with st.spinner("πŸ”„ Analyzing floor surface and generating audio..."):
999
+ try:
1000
+ # Create temporary video from frame for processing
1001
+ temp_video = tempfile.mktemp(suffix='.mp4')
1002
+
1003
+ # Create 1-second video from the captured frame
1004
+ img = cv2.imread(st.session_state.floor_frame_path)
1005
+ height, width = img.shape[:2]
1006
+
1007
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
1008
+ out = cv2.VideoWriter(temp_video, fourcc, 30, (width, height))
1009
+
1010
+ # Write 30 frames (1 second at 30fps)
1011
+ for _ in range(30):
1012
+ out.write(img)
1013
+ out.release()
1014
+
1015
+ # Process video for footstep audio
1016
+ st.info("🎡 Generating footstep audio based on floor analysis...")
1017
+ aud_name = process_video_for_footstep_audio(temp_video)
1018
+ aud_dict = main_sound(aud_name)
1019
+ aud_path = aud_dict['default'].replace(".%(ext)s", ".mp3")
1020
+
1021
+ st.session_state.live_audio_path = aud_path
1022
+ st.session_state.audio_downloaded = True
1023
+
1024
+ # Clean up temp video
1025
+ if os.path.exists(temp_video):
1026
+ os.remove(temp_video)
1027
+
1028
+ st.success("βœ… Audio generated successfully!")
1029
+ st.balloons()
1030
+ st.rerun()
1031
+
1032
+ except Exception as e:
1033
+ st.error(f"❌ Error generating audio: {str(e)}")
1034
+
1035
+ with col2:
1036
+ st.info("🎡 Audio will be generated based on floor type")
1037
+
1038
+ # Step 3: Initialize live detector
1039
+ if st.session_state.audio_downloaded and st.session_state.live_detector is None:
1040
+ st.markdown("---")
1041
+ st.markdown("### Step 3: Initialize Live Detection πŸš€")
1042
+
1043
+ col1, col2 = st.columns([2, 1])
1044
+
1045
+ with col1:
1046
+ sensitivity = st.select_slider(
1047
+ "Detection Sensitivity",
1048
+ options=['low', 'medium', 'high'],
1049
+ value='medium'
1050
+ )
1051
+
1052
+ yolo_conf = st.slider(
1053
+ "YOLO Confidence",
1054
+ min_value=0.1,
1055
+ max_value=0.9,
1056
+ value=0.5,
1057
+ step=0.05
1058
+ )
1059
+
1060
+ if st.button("🎬 Initialize Live Detector", type="primary", use_container_width=True):
1061
+ with st.spinner("βš™οΈ Initializing detector..."):
1062
+ try:
1063
+ detector = LiveFootstepDetector(
1064
+ audio_path=st.session_state.live_audio_path,
1065
+ sensitivity=sensitivity,
1066
+ yolo_conf=yolo_conf
1067
+ )
1068
+ st.session_state.live_detector = detector
1069
+ st.success("βœ… Live detector initialized!")
1070
+ st.rerun()
1071
+ except Exception as e:
1072
+ st.error(f"❌ Failed to initialize detector: {str(e)}")
1073
+
1074
+ with col2:
1075
+ st.info("πŸ€– Configure detection parameters")
1076
+
1077
+ # Step 4: Start live detection
1078
+ if st.session_state.live_detector is not None:
1079
+ st.markdown("---")
1080
+ st.markdown('<div class="ready-badge">βœ… SYSTEM READY</div>', unsafe_allow_html=True)
1081
+ st.markdown("### Step 4: Live Detection 🎯")
1082
+
1083
+ col1, col2 = st.columns([3, 1])
1084
+
1085
+ with col1:
1086
+ st.write("πŸ“Ή **Camera is ready for live footstep detection**")
1087
+ st.write("🚢 Walk in front of the camera and hear footsteps in real-time!")
1088
+
1089
+ # Start/Stop controls
1090
+ col_a, col_b = st.columns(2)
1091
+
1092
+ with col_a:
1093
+ if not st.session_state.camera_active:
1094
+ if st.button("▢️ Start Live Detection", type="primary", use_container_width=True):
1095
+ st.session_state.camera_active = True
1096
+ st.session_state.live_detector.start()
1097
+ st.rerun()
1098
+
1099
+ with col_b:
1100
+ if st.session_state.camera_active:
1101
+ if st.button("⏹️ Stop Detection", type="secondary", use_container_width=True):
1102
+ st.session_state.camera_active = False
1103
+ st.session_state.live_detector.stop()
1104
+ st.rerun()
1105
+
1106
+ with col2:
1107
+ if st.session_state.camera_active:
1108
+ st.markdown('<div class="live-indicator">πŸ”΄ LIVE</div>', unsafe_allow_html=True)
1109
+ else:
1110
+ st.info("⏸️ Paused")
1111
+
1112
+ # Live video feed
1113
+ if st.session_state.camera_active:
1114
+ st.markdown("---")
1115
+
1116
+ FRAME_WINDOW = st.image([])
1117
+
1118
+ cap = cv2.VideoCapture(0)
1119
+
1120
+ if not cap.isOpened():
1121
+ st.error("❌ Cannot access camera. Please check permissions.")
1122
+ st.session_state.camera_active = False
1123
+ else:
1124
+ st.info("πŸ“Ή Live feed active - Walk to generate footsteps!")
1125
+
1126
+ # Statistics
1127
+ step_counter = st.empty()
1128
+ left_steps = 0
1129
+ right_steps = 0
1130
+
1131
+ try:
1132
+ while st.session_state.camera_active:
1133
+ ret, frame = cap.read()
1134
+
1135
+ if not ret:
1136
+ st.error("❌ Failed to read from camera")
1137
+ break
1138
+
1139
+ # Process frame
1140
+ processed_frame, detected_foot = st.session_state.live_detector.process_frame(frame)
1141
+
1142
+ # Update counters
1143
+ if detected_foot == 'LEFT':
1144
+ left_steps += 1
1145
+ elif detected_foot == 'RIGHT':
1146
+ right_steps += 1
1147
+
1148
+ # Display frame
1149
+ FRAME_WINDOW.image(cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB))
1150
+
1151
+ # Update statistics
1152
+ step_counter.metric("Total Steps Detected", left_steps + right_steps,
1153
+ f"L: {left_steps} | R: {right_steps}")
1154
+
1155
+ # Check if user stopped
1156
+ if not st.session_state.camera_active:
1157
+ break
1158
+
1159
+ time.sleep(0.033) # ~30 FPS
1160
+
1161
+ except Exception as e:
1162
+ st.error(f"❌ Error during live detection: {str(e)}")
1163
+
1164
+ finally:
1165
+ cap.release()
1166
+ st.session_state.live_detector.stop()
1167
+
1168
+ # Reset button
1169
+ st.markdown("---")
1170
+ if st.button("πŸ”„ Reset All", use_container_width=True):
1171
+ st.session_state.floor_frame_captured = False
1172
+ st.session_state.audio_downloaded = False
1173
+ st.session_state.live_audio_path = None
1174
+ st.session_state.live_detector = None
1175
+ st.session_state.camera_active = False
1176
+ st.rerun()
1177
+
1178
+
1179
+ def video_upload_mode():
1180
+ """Original video upload mode"""
1181
+
1182
+ st.markdown('<h2>πŸ“€ Video Upload Mode</h2>', unsafe_allow_html=True)
1183
+
1184
+ # Sidebar configuration
1185
+ sensitivity = st.sidebar.select_slider(
1186
+ "Footstep Sensitivity",
1187
+ options=['low', 'medium', 'high'],
1188
+ value='medium',
1189
+ help="Higher sensitivity detects more subtle footsteps"
1190
+ )
1191
+
1192
+ yolo_conf = st.sidebar.slider(
1193
+ "YOLO Confidence",
1194
+ min_value=0.1,
1195
+ max_value=0.9,
1196
+ value=0.5,
1197
+ step=0.05,
1198
+ help="Confidence threshold for YOLO person detection"
1199
+ )
1200
+
1201
+ surface_type = st.sidebar.selectbox(
1202
+ "Surface Type",
1203
+ ['concrete', 'wood', 'grass', 'gravel', 'metal'],
1204
+ help="Select surface for audio generation"
1205
+ )
1206
+
1207
+ use_hybrid = st.sidebar.checkbox(
1208
+ "Enable Hybrid Mode",
1209
+ value=True,
1210
+ help="Use YOLO for person detection + MediaPipe for pose estimation"
1211
+ )
1212
+
1213
+ create_annotated = st.sidebar.checkbox("Create Annotated Video", value=True)
1214
+ add_audio = st.sidebar.checkbox("Add Footstep Audio", value=True)
1215
+
1216
+ # File uploader
1217
+ uploaded_file = st.file_uploader(
1218
+ "πŸ“€ Upload Video File",
1219
+ type=['mp4', 'avi', 'mov', 'mkv'],
1220
+ help="Upload a video file to detect footsteps"
1221
+ )
1222
+
1223
+ if uploaded_file:
1224
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp_file:
1225
+ tmp_file.write(uploaded_file.read())
1226
+ video_path = tmp_file.name
1227
+
1228
+ col1, col2 = st.columns([2, 1])
1229
+
1230
+ with col1:
1231
+ st.subheader("πŸ“Ή Input Video")
1232
+ st.video(video_path)
1233
+
1234
+ with col2:
1235
+ st.subheader("ℹ️ Video Info")
1236
+ cap = cv2.VideoCapture(video_path)
1237
+ video_info = {
1238
+ "Duration": f"{cap.get(cv2.CAP_PROP_FRAME_COUNT) / cap.get(cv2.CAP_PROP_FPS):.2f}s",
1239
+ "FPS": f"{cap.get(cv2.CAP_PROP_FPS):.2f}",
1240
+ "Resolution": f"{int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))}x{int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))}",
1241
+ "Frames": int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
1242
+ }
1243
+ cap.release()
1244
+
1245
+ for key, value in video_info.items():
1246
+ st.metric(key, value)
1247
+
1248
+ if use_hybrid:
1249
+ st.success("πŸ€– Hybrid Mode Active")
1250
+ else:
1251
+ st.info("πŸ“Š MediaPipe Only")
1252
+
1253
+ st.markdown("---")
1254
+
1255
+ if st.button("πŸš€ Process Video", type="primary", use_container_width=True):
1256
+
1257
+ if use_hybrid:
1258
+ st.info("πŸ”„ Running Hybrid YOLO-MediaPipe Pipeline...")
1259
+ pipeline = HybridFootstepDetectionPipeline(
1260
+ fps=float(video_info["FPS"]),
1261
+ sensitivity=sensitivity,
1262
+ yolo_conf=yolo_conf
1263
+ )
1264
+ else:
1265
+ st.info("πŸ”„ Running MediaPipe-Only Pipeline...")
1266
+ pipeline = HybridFootstepDetectionPipeline(
1267
+ fps=float(video_info["FPS"]),
1268
+ sensitivity=sensitivity,
1269
+ yolo_conf=yolo_conf
1270
+ )
1271
+
1272
+ with st.spinner("πŸ” Detecting footsteps..."):
1273
+ progress_bar = st.progress(0)
1274
+ status_text = st.empty()
1275
+
1276
+ def update_progress(val):
1277
+ progress_bar.progress(val)
1278
+ status_text.text(f"Processing: {int(val * 100)}%")
1279
+
1280
+ results = pipeline.process_video(video_path, update_progress)
1281
+ st.session_state['results'] = results
1282
+ st.session_state['video_path'] = video_path
1283
+ st.session_state['use_hybrid'] = use_hybrid
1284
+
1285
+ progress_bar.empty()
1286
+ status_text.empty()
1287
+
1288
+ if results:
1289
+ st.markdown('<div class="success-box">βœ… Footstep detection complete!</div>',
1290
+ unsafe_allow_html=True)
1291
+ st.success(f"Detected **{len(results['events'])}** footstep events")
1292
+
1293
+ if 'detection_stats' in results:
1294
+ stats = results['detection_stats']
1295
+ col1, col2, col3 = st.columns(3)
1296
+ col1.metric("YOLO Detections",
1297
+ f"{stats['yolo_detections']}/{stats['total_frames']}")
1298
+ col2.metric("Pose Detections",
1299
+ f"{stats['pose_detections']}/{stats['total_frames']}")
1300
+ col3.metric("Success Rate",
1301
+ f"{stats['pose_detections'] / stats['total_frames'] * 100:.1f}%")
1302
+
1303
+ # Display results (existing code continues...)
1304
+ if 'results' in st.session_state:
1305
+ results = st.session_state['results']
1306
+
1307
+ st.markdown("---")
1308
+ st.subheader("πŸ“Š Detection Results")
1309
+
1310
+ col1, col2, col3, col4 = st.columns(4)
1311
+
1312
+ left_count = len([e for e in results['events'] if e['foot'] == 'LEFT'])
1313
+ right_count = len([e for e in results['events'] if e['foot'] == 'RIGHT'])
1314
+ avg_cadence = len(results['events']) / (results['total_frames'] / results['fps']) * 60
1315
+ avg_conf = np.mean([e.get('confidence', 0.5) for e in results['events']])
1316
+
1317
+ col1.metric("Total Events", len(results['events']))
1318
+ col2.metric("Left Foot", left_count)
1319
+ col3.metric("Right Foot", right_count)
1320
+ col4.metric("Avg Confidence", f"{avg_conf:.2f}")
1321
+
1322
+ st.metric("Average Cadence", f"{avg_cadence:.1f} steps/min")
1323
+
1324
+ st.subheader("πŸ“‹ Detected Events")
1325
+ events_df = pd.DataFrame(results['events'])
1326
+
1327
+ if not events_df.empty:
1328
+ st.dataframe(
1329
+ events_df.style.apply(
1330
+ lambda x: ['background-color: #e8f5e9' if x.foot == 'LEFT'
1331
+ else 'background-color: #fff3e0' for _ in x],
1332
+ axis=1
1333
+ ),
1334
+ use_container_width=True,
1335
+ height=300
1336
+ )
1337
+
1338
+ st.subheader("πŸ’Ύ Export Options")
1339
+
1340
+ col1, col2, col3 = st.columns(3)
1341
+
1342
+ with col1:
1343
+ csv = events_df.to_csv(index=False)
1344
+ st.download_button(
1345
+ "πŸ“„ Download CSV",
1346
+ csv,
1347
+ f"footsteps_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
1348
+ "text/csv",
1349
+ use_container_width=True
1350
+ )
1351
+
1352
+ with col2:
1353
+ json_data = json.dumps(results['events'], indent=2)
1354
+ st.download_button(
1355
+ "πŸ“‹ Download JSON",
1356
+ json_data,
1357
+ f"footsteps_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
1358
+ "application/json",
1359
+ use_container_width=True
1360
+ )
1361
+
1362
+ with col3:
1363
+ timecode_text = "\n".join([
1364
+ f"{e['timecode']}\t{e['foot']}\t{e['event']}\t{e.get('confidence', 0.5):.2f}"
1365
+ for e in results['events']
1366
+ ])
1367
+ st.download_button(
1368
+ "⏱️ Download Timecode",
1369
+ timecode_text,
1370
+ f"timecode_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
1371
+ "text/plain",
1372
+ use_container_width=True
1373
+ )
1374
+
1375
+ st.markdown("---")
1376
+ st.subheader("πŸŽ₯ Generate Output Video")
1377
+
1378
+ col1, col2 = st.columns(2)
1379
+
1380
+ with col1:
1381
+ if create_annotated and st.button("Create Annotated Video", use_container_width=True):
1382
+ with st.spinner("Creating annotated video..."):
1383
+ annotated_path = tempfile.mktemp(suffix='_annotated.mp4')
1384
+ progress_bar = st.progress(0)
1385
+
1386
+ success = create_annotated_video(
1387
+ st.session_state['video_path'],
1388
+ results['events'],
1389
+ annotated_path,
1390
+ use_hybrid=st.session_state.get('use_hybrid', False),
1391
+ progress_callback=lambda v: progress_bar.progress(v)
1392
+ )
1393
+
1394
+ if success:
1395
+ st.session_state['annotated_video'] = annotated_path
1396
+ progress_bar.empty()
1397
+ st.success("βœ… Annotated video ready!")
1398
+ else:
1399
+ st.error("❌ Failed to create annotated video")
1400
+
1401
+ with col2:
1402
+ if add_audio and st.button("Generate with Audio", use_container_width=True):
1403
+ with st.spinner("Generating audio and merging..."):
1404
+ audio_gen = AudioGenerator()
1405
+ aud_name = process_video_for_footstep_audio(str(st.session_state['video_path']))
1406
+ aud_path = main_sound(aud_name)
1407
+ aud_path = aud_path['default'].replace(".%(ext)s", ".mp3")
1408
+ duration = results['total_frames'] / results['fps']
1409
+ audio_track = audio_gen.create_audio_track(
1410
+ results['events'],
1411
+ aud_path,
1412
+ duration
1413
+ )
1414
+
1415
+ temp_video = tempfile.mktemp(suffix='_temp.mp4')
1416
+ progress_bar = st.progress(0)
1417
+
1418
+ create_annotated_video(
1419
+ st.session_state['video_path'],
1420
+ results['events'],
1421
+ temp_video,
1422
+ use_hybrid=st.session_state.get('use_hybrid', False),
1423
+ progress_callback=lambda v: progress_bar.progress(v * 0.7)
1424
+ )
1425
+
1426
+ final_output = tempfile.mktemp(suffix='_final.mp4')
1427
+ success = merge_audio_with_video(
1428
+ temp_video,
1429
+ audio_track,
1430
+ 44100,
1431
+ final_output
1432
+ )
1433
+
1434
+ progress_bar.progress(1.0)
1435
+ progress_bar.empty()
1436
+
1437
+ if success:
1438
+ st.session_state['final_video'] = final_output
1439
+ st.success("βœ… Video with audio ready!")
1440
+ else:
1441
+ st.error("❌ Failed to merge audio")
1442
+
1443
+ if 'annotated_video' in st.session_state:
1444
+ st.markdown("---")
1445
+ st.subheader("πŸ“Ί Annotated Video")
1446
+ st.video(st.session_state['annotated_video'])
1447
+
1448
+ with open(st.session_state['annotated_video'], 'rb') as f:
1449
+ st.download_button(
1450
+ "πŸ“₯ Download Annotated Video",
1451
+ f,
1452
+ f"annotated_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4",
1453
+ "video/mp4",
1454
+ use_container_width=True
1455
+ )
1456
+
1457
+ if 'final_video' in st.session_state:
1458
+ st.markdown("---")
1459
+ st.subheader("πŸ”Š Final Video with Audio")
1460
+ st.video(st.session_state['final_video'])
1461
+
1462
+ with open(st.session_state['final_video'], 'rb') as f:
1463
+ st.download_button(
1464
+ "πŸ“₯ Download Final Video",
1465
+ f,
1466
+ f"final_with_audio_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4",
1467
+ "video/mp4",
1468
+ use_container_width=True
1469
+ )
1470
+
1471
+
1472
+ def main():
1473
+ st.markdown('<h1 class="main-header">🎬 Hybrid YOLO-MediaPipe Footstep Detection</h1>',
1474
+ unsafe_allow_html=True)
1475
+ st.markdown('<div class="hybrid-badge">🚀 YOLO Person Detection + MediaPipe Pose Estimation</div>',
1476
+ unsafe_allow_html=True)
1477
+ st.markdown("### Advanced AI-Powered Foley Tool with Dual-Stage Detection Pipeline")
1478
+
1479
+ # Mode selection
1480
+ st.markdown("---")
1481
+ st.markdown("## 🎯 Select Mode")
1482
+
1483
+ col1, col2 = st.columns(2)
1484
+
1485
+ with col1:
1486
+ if st.button("πŸ“€ Video Upload Mode", use_container_width=True, type="primary"):
1487
+ st.session_state.mode = 'upload'
1488
+
1489
+ with col2:
1490
+ if st.button("πŸ“Ή Live Streaming Mode", use_container_width=True, type="primary"):
1491
+ st.session_state.mode = 'live'
1492
+
1493
+ # Initialize mode
1494
+ if 'mode' not in st.session_state:
1495
+ st.session_state.mode = 'upload'
1496
+
1497
+ st.markdown("---")
1498
+
1499
+ # Display selected mode
1500
+ if st.session_state.mode == 'upload':
1501
+ video_upload_mode()
1502
+ else:
1503
+ live_streaming_mode()
1504
+
1505
+ # Sidebar info
1506
+ with st.sidebar:
1507
+ st.markdown("---")
1508
+ st.markdown(f"### 🎯 Current Mode: **{st.session_state.mode.upper()}**")
1509
+
1510
+ if st.session_state.mode == 'live':
1511
+ st.markdown("---")
1512
+ st.markdown("### πŸ“Ή Live Mode Guide")
1513
+ st.markdown("""
1514
+ **Steps:**
1515
+ 1. πŸ“Έ **Capture Floor Frame**
1516
+ - Point camera at floor
1517
+ - Capture clear image
1518
+
1519
+ 2. πŸ”Š **Generate Audio**
1520
+ - AI analyzes floor type
1521
+ - Downloads matching sound
1522
+
1523
+ 3. βœ… **System Ready**
1524
+ - Real-time detection active
1525
+ - Walk and hear footsteps!
1526
+
1527
+ **Tips:**
1528
+ - Good lighting needed
1529
+ - Clear floor view
1530
+ - Stand 2-3 meters away
1531
+ - Walk naturally
1532
+ """)
1533
+
1534
+ st.markdown("---")
1535
+ st.markdown("### πŸ€– Hybrid Pipeline")
1536
+ st.markdown("""
1537
+ **Stage 1: YOLO Detection**
1538
+ - Detects person in frame
1539
+ - Provides bounding box
1540
+ - Tracks across frames
1541
+
1542
+ **Stage 2: MediaPipe Pose**
1543
+ - Estimates pose on detected region
1544
+ - Extracts heel landmarks
1545
+ - Higher accuracy & speed
1546
+
1547
+ **Benefits:**
1548
+ - βœ… More robust detection
1549
+ - βœ… Better occlusion handling
1550
+ - βœ… Faster processing
1551
+ - βœ… Improved accuracy
1552
+ """)
1553
+
1554
+ st.markdown("---")
1555
+ st.markdown("### ℹ️ System Info")
1556
+ st.markdown("""
1557
+ **Detection Engines:**
1558
+ - YOLOv8 (Person Detection)
1559
+ - MediaPipe Pose v2 (Pose Estimation)
1560
+
1561
+ **Features:**
1562
+ - Dual-stage AI pipeline
1563
+ - Person tracking
1564
+ - Frame-accurate timing
1565
+ - Confidence scoring
1566
+ - Real-time live detection
1567
+ - Autonomous audio generation
1568
+ """)
1569
+
1570
+
1571
+ if __name__ == "__main__":
1572
+ main()
reel.py ADDED
@@ -0,0 +1,1573 @@
1
+ '''aud_name = process_video_for_footstep_audio(temp_video)
2
+ aud_dict = main_sound(aud_name)
3
+ aud_path = aud_dict['default'].replace(".%(ext)s", ".mp3")'''
4
+
5
+ import pandas as pd
6
+ import streamlit as st
7
+ import cv2
8
+ import numpy as np
9
+ import mediapipe as mp
10
+ from pathlib import Path
11
+ from scipy.signal import find_peaks, savgol_filter
12
+ import json
13
+ import subprocess
14
+ import os
15
+ import soundfile as sf
16
+ from datetime import datetime
17
+ import tempfile
18
+ from ultralytics import YOLO
19
+ from agent import process_video_for_footstep_audio
20
+ from sound_agent import main_sound
21
+ from qsec import extract_second_audio_librosa
22
+ import threading
23
+ import queue
24
+ import time
25
+ from PIL import Image
26
+ import io
27
+
28
+ # Suppress TensorFlow warnings
29
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
30
+ import absl.logging
31
+
32
+ absl.logging.set_verbosity(absl.logging.ERROR)
33
+
34
+
35
+ def get_ffmpeg_path():
36
+ """Get FFmpeg path with multiple fallback options"""
37
+ possible_paths = [
38
+ "ffmpeg", # Try system ffmpeg first (Docker/Linux)
39
+ r"C:\Users\abhiv\OneDrive\Desktop\agentic ai\SoundFeet\ffmpeg-7.1-essentials_build\bin\ffmpeg.exe", # Local Windows
40
+ "./ffmpeg-7.1-essentials_build/bin/ffmpeg.exe", # Relative path
41
+ ]
42
+
43
+ for path in possible_paths:
44
+ if path == "ffmpeg":
45
+ try:
46
+ result = subprocess.run([path, '-version'], capture_output=True, timeout=5)
47
+ if result.returncode == 0:
48
+ return path
49
+ except Exception:
50
+ continue
51
+ else:
52
+ if os.path.exists(path):
53
+ return path
54
+ return None
55
+
56
+
57
+ FFMPEG_PATH = get_ffmpeg_path()
58
+
59
+ # Streamlit Configuration
60
+ st.set_page_config(
61
+ page_title="Hybrid YOLO-MediaPipe Footstep Detection",
62
+ page_icon="🎬",
63
+ layout="wide",
64
+ initial_sidebar_state="expanded"
65
+ )
66
+
67
+ st.markdown("""
68
+ <style>
69
+ .main-header {
70
+ font-size: 2.5rem;
71
+ font-weight: 700;
72
+ background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
73
+ -webkit-background-clip: text;
74
+ -webkit-text-fill-color: transparent;
75
+ margin-bottom: 2rem;
76
+ }
77
+ .metric-card {
78
+ background: #f0f2f6;
79
+ padding: 1rem;
80
+ border-radius: 0.5rem;
81
+ border-left: 4px solid #667eea;
82
+ }
83
+ .success-box {
84
+ padding: 1rem;
85
+ background: #d4edda;
86
+ border: 1px solid #c3e6cb;
87
+ border-radius: 0.5rem;
88
+ color: #155724;
89
+ }
90
+ .hybrid-badge {
91
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
92
+ color: white;
93
+ padding: 0.5rem 1rem;
94
+ border-radius: 20px;
95
+ display: inline-block;
96
+ font-weight: 600;
97
+ margin: 1rem 0;
98
+ }
99
+ .live-indicator {
100
+ background: #dc3545;
101
+ color: white;
102
+ padding: 0.5rem 1rem;
103
+ border-radius: 20px;
104
+ display: inline-block;
105
+ font-weight: 600;
106
+ animation: pulse 1.5s infinite;
107
+ }
108
+ @keyframes pulse {
109
+ 0%, 100% { opacity: 1; }
110
+ 50% { opacity: 0.5; }
111
+ }
112
+ .ready-badge {
113
+ background: #28a745;
114
+ color: white;
115
+ padding: 0.5rem 1rem;
116
+ border-radius: 20px;
117
+ display: inline-block;
118
+ font-weight: 600;
119
+ }
120
+ </style>
121
+ """, unsafe_allow_html=True)
122
+
123
+
124
+ class LiveFootstepDetector:
125
+ """Real-time footstep detection for live camera feed"""
126
+
127
+ def __init__(self, audio_path, sensitivity='medium', yolo_conf=0.5):
128
+ self.audio_path = audio_path
129
+ self.sensitivity = sensitivity
130
+ self.yolo_conf = yolo_conf
131
+ self.running = False
132
+ self.audio_ready = False
133
+
134
+ # Load footstep audio
135
+ try:
136
+ self.footstep_audio, self.sample_rate = extract_second_audio_librosa(
137
+ file_path=audio_path,
138
+ target_second=5,
139
+ sample_rate=44100
140
+ )
141
+ self.audio_ready = True
142
+ except Exception as e:
143
+ st.error(f"Failed to load audio: {str(e)}")
144
+ self.audio_ready = False
145
+
146
+ # Initialize detection models
147
+ try:
148
+ self.yolo_model = YOLO('yolov8n.pt')
149
+ self.mp_pose = mp.solutions.pose
150
+ self.pose = self.mp_pose.Pose(
151
+ static_image_mode=False,
152
+ model_complexity=1,
153
+ smooth_landmarks=True,
154
+ min_detection_confidence=0.5,
155
+ min_tracking_confidence=0.5
156
+ )
157
+ except Exception as e:
158
+ st.error(f"Failed to initialize models: {str(e)}")
159
+ return
160
+
161
+ # Landmark indices
162
+ self.LEFT_HEEL = 29
163
+ self.RIGHT_HEEL = 30
164
+
165
+ # Detection thresholds
166
+ self.thresholds = {
167
+ 'low': {'prominence': 0.02, 'velocity_threshold': 0.015},
168
+ 'medium': {'prominence': 0.015, 'velocity_threshold': 0.012},
169
+ 'high': {'prominence': 0.01, 'velocity_threshold': 0.010}
170
+ }[sensitivity]
171
+
172
+ # Tracking state
173
+ self.prev_left_y = None
174
+ self.prev_right_y = None
175
+ self.prev_time = None
176
+ self.left_buffer = []
177
+ self.right_buffer = []
178
+ self.buffer_size = 10
179
+
180
+ # Audio playback
181
+ self.audio_queue = queue.Queue()
182
+ self.audio_thread = None
183
+
184
+ def start_audio_playback(self):
185
+ """Start audio playback thread"""
186
+ if not self.audio_ready:
187
+ return
188
+
189
+ def play_audio():
190
+ import pyaudio
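+ # pyaudio is imported lazily and only needed for live playback; note it is not listed in requirements.txt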
191
+ p = pyaudio.PyAudio()
192
+ stream = p.open(
193
+ format=pyaudio.paFloat32,
194
+ channels=1,
195
+ rate=self.sample_rate,
196
+ output=True
197
+ )
198
+
199
+ while self.running:
200
+ try:
201
+ foot = self.audio_queue.get(timeout=0.1)
202
+ # Play footstep sound
203
+ stream.write(self.footstep_audio.astype(np.float32).tobytes())
204
+ except queue.Empty:
205
+ continue
206
+ except Exception as e:
207
+ print(f"Audio playback error: {e}")
208
+
209
+ stream.stop_stream()
210
+ stream.close()
211
+ p.terminate()
212
+
213
+ self.audio_thread = threading.Thread(target=play_audio, daemon=True)
214
+ self.audio_thread.start()
215
+
216
+ def detect_heel_strike(self, current_y, prev_y, foot_buffer):
217
+ """Detect heel strike based on vertical velocity and position"""
218
+ if prev_y is None:
219
+ return False
220
+
221
+ # Calculate vertical velocity (downward is positive)
222
+ velocity = current_y - prev_y
223
+
224
+ # Add to buffer
225
+ foot_buffer.append(current_y)
226
+ if len(foot_buffer) > self.buffer_size:
227
+ foot_buffer.pop(0)
228
+
229
+ if len(foot_buffer) < 5:
230
+ return False
231
+
232
+ # Detect strike: downward movement followed by stabilization
233
+ # Current position is low (heel on ground)
234
+ # Recent movement was downward
235
+ # Velocity is slowing (strike impact)
236
+ recent_velocities = [foot_buffer[i + 1] - foot_buffer[i]
237
+ for i in range(len(foot_buffer) - 1)]
238
+
239
+ avg_velocity = np.mean(recent_velocities[-3:]) if len(recent_velocities) >= 3 else 0
240
+
241
+ is_strike = (
242
+ current_y > 0.7 and # Heel is low in frame
243
+ velocity > self.thresholds['velocity_threshold'] and # Moving down
244
+ avg_velocity < velocity * 0.5 # Velocity decreasing (impact)
245
+ )
246
+
247
+ return is_strike
248
+
249
+ def process_frame(self, frame):
250
+ """Process single frame and detect footsteps"""
251
+ if not self.audio_ready:
252
+ return frame, None
253
+
254
+ detected_foot = None
255
+
256
+ try:
257
+ # YOLO detection
258
+ results = self.yolo_model(frame, conf=self.yolo_conf, classes=[0], verbose=False)
259
+
260
+ person_detected = False
261
+ bbox = None
262
+
263
+ for result in results:
264
+ boxes = result.boxes
265
+ if len(boxes) > 0:
266
+ person_detected = True
267
+ box = boxes[0] # Take first person
268
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
269
+ bbox = (int(x1), int(y1), int(x2), int(y2))
270
+
271
+ # Draw YOLO bbox
272
+ cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
273
+ (255, 255, 0), 2)
274
+ break
275
+
276
+ # MediaPipe pose estimation
277
+ if person_detected and bbox:
278
+ # Crop to person region with padding
279
+ x1, y1, x2, y2 = bbox
280
+ pad = 20
281
+ x1 = max(0, x1 - pad)
282
+ y1 = max(0, y1 - pad)
283
+ x2 = min(frame.shape[1], x2 + pad)
284
+ y2 = min(frame.shape[0], y2 + pad)
285
+
286
+ cropped = frame[y1:y2, x1:x2]
287
+ rgb_frame = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
288
+ pose_results = self.pose.process(rgb_frame)
289
+
290
+ if pose_results.pose_landmarks:
291
+ landmarks = pose_results.pose_landmarks.landmark
292
+
293
+ # Get heel positions (adjusted to full frame)
294
+ left_heel = landmarks[self.LEFT_HEEL]
295
+ right_heel = landmarks[self.RIGHT_HEEL]
296
+
297
+ left_y = (left_heel.y * (y2 - y1) + y1) / frame.shape[0]
298
+ right_y = (right_heel.y * (y2 - y1) + y1) / frame.shape[0]
299
+
300
+ # Detect strikes
301
+ left_strike = self.detect_heel_strike(
302
+ left_y, self.prev_left_y, self.left_buffer
303
+ )
304
+ right_strike = self.detect_heel_strike(
305
+ right_y, self.prev_right_y, self.right_buffer
306
+ )
307
+
308
+ if left_strike:
309
+ detected_foot = 'LEFT'
310
+ self.audio_queue.put('LEFT')
311
+ elif right_strike:
312
+ detected_foot = 'RIGHT'
313
+ self.audio_queue.put('RIGHT')
314
+
315
+ # Update previous positions
316
+ self.prev_left_y = left_y
317
+ self.prev_right_y = right_y
318
+
319
+ # Draw skeleton on full frame
320
+ for landmark in landmarks:
321
+ x = int((landmark.x * (x2 - x1) + x1))
322
+ y = int((landmark.y * (y2 - y1) + y1))
323
+ cv2.circle(frame, (x, y), 3, (0, 255, 0), -1)
324
+
325
+ # Highlight heels
326
+ left_heel_x = int((left_heel.x * (x2 - x1) + x1))
327
+ left_heel_y = int((left_heel.y * (y2 - y1) + y1))
328
+ right_heel_x = int((right_heel.x * (x2 - x1) + x1))
329
+ right_heel_y = int((right_heel.y * (y2 - y1) + y1))
330
+
331
+ cv2.circle(frame, (left_heel_x, left_heel_y), 8, (0, 255, 0), -1)
332
+ cv2.circle(frame, (right_heel_x, right_heel_y), 8, (0, 100, 255), -1)
333
+
334
+ if detected_foot:
335
+ # Show strike indicator
336
+ heel_x = left_heel_x if detected_foot == 'LEFT' else right_heel_x
337
+ heel_y = left_heel_y if detected_foot == 'LEFT' else right_heel_y
338
+ color = (0, 255, 0) if detected_foot == 'LEFT' else (0, 100, 255)
339
+
340
+ cv2.circle(frame, (heel_x, heel_y), 30, color, 3)
341
+ cv2.putText(frame, f"{detected_foot} STRIKE!",
342
+ (heel_x - 50, heel_y - 40),
343
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
344
+
345
+ # Draw status
346
+ status_text = "READY" if self.audio_ready else "NO AUDIO"
347
+ status_color = (0, 255, 0) if self.audio_ready else (0, 0, 255)
348
+ cv2.rectangle(frame, (10, 10), (150, 50), (0, 0, 0), -1)
349
+ cv2.putText(frame, status_text, (20, 35),
350
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, status_color, 2)
351
+
352
+ except Exception as e:
353
+ print(f"Frame processing error: {e}")
354
+
355
+ return frame, detected_foot
356
+
357
+ def start(self):
358
+ """Start the detector"""
359
+ self.running = True
360
+ self.start_audio_playback()
361
+
362
+ def stop(self):
363
+ """Stop the detector"""
364
+ self.running = False
365
+ if self.audio_thread:
366
+ self.audio_thread.join(timeout=2)
367
+
368
+
369
+ class HybridFootstepDetectionPipeline:
370
+ """
371
+ Hybrid Detection Pipeline for video files:
372
+ 1. YOLO detects person bounding boxes
373
+ 2. MediaPipe estimates pose on detected regions
374
+ 3. Track footsteps with improved accuracy
375
+ """
376
+
377
+ def __init__(self, fps=30, sensitivity='medium', yolo_conf=0.5):
378
+ self.fps = fps
379
+ self.sensitivity = sensitivity
380
+ self.yolo_conf = yolo_conf
381
+
382
+ # Initialize YOLO detector
383
+ try:
384
+ self.yolo_model = YOLO('yolov8n.pt')
385
+ st.success("βœ… YOLO detector loaded successfully")
386
+ except Exception as e:
387
+ st.warning(f"⚠️ YOLO loading issue: {str(e)}. Downloading model...")
388
+ try:
389
+ self.yolo_model = YOLO('yolov8n.pt')
390
+ st.success("βœ… YOLO detector loaded successfully")
391
+ except Exception as e2:
392
+ st.error(f"❌ Failed to load YOLO: {str(e2)}")
393
+ self.yolo_model = None
394
+
395
+ # Initialize MediaPipe pose estimator
396
+ try:
397
+ self.mp_pose = mp.solutions.pose
398
+ self.pose = self.mp_pose.Pose(
399
+ static_image_mode=False,
400
+ model_complexity=1,
401
+ smooth_landmarks=True,
402
+ min_detection_confidence=0.5,
403
+ min_tracking_confidence=0.5
404
+ )
405
+ st.success("βœ… MediaPipe pose estimator loaded successfully")
406
+ except Exception as e:
407
+ st.error(f"❌ Failed to initialize MediaPipe: {str(e)}")
408
+ self.pose = None
409
+
410
+ # Landmark indices
411
+ self.LEFT_HEEL = 29
412
+ self.RIGHT_HEEL = 30
413
+ self.LEFT_ANKLE = 27
414
+ self.RIGHT_ANKLE = 28
415
+
416
+ # Detection thresholds
417
+ self.thresholds = {
418
+ 'low': {'prominence': 0.02, 'min_interval': 0.4},
419
+ 'medium': {'prominence': 0.015, 'min_interval': 0.3},
420
+ 'high': {'prominence': 0.01, 'min_interval': 0.25}
421
+ }[sensitivity]
422
+
423
+ # Tracking state
424
+ self.person_tracker = PersonTracker()
425
+
426
+ def detect_person_yolo(self, frame):
427
+ """Detect person using YOLO"""
428
+ if self.yolo_model is None:
429
+ return []
430
+
431
+ try:
432
+ results = self.yolo_model(frame, conf=self.yolo_conf, classes=[0], verbose=False)
433
+
434
+ person_boxes = []
435
+ for result in results:
436
+ boxes = result.boxes
437
+ for box in boxes:
438
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
439
+ conf = box.conf[0].cpu().numpy()
440
+ person_boxes.append((int(x1), int(y1), int(x2), int(y2), float(conf)))
441
+
442
+ return person_boxes
443
+ except Exception as e:
444
+ st.warning(f"YOLO detection failed: {str(e)}")
445
+ return []
446
+
447
+ def estimate_pose_mediapipe(self, frame, bbox=None):
448
+ """Estimate pose using MediaPipe on specified region"""
449
+ if self.pose is None:
450
+ return None
451
+
452
+ try:
453
+ if bbox is not None:
454
+ x1, y1, x2, y2 = bbox
455
+ pad = 20
456
+ x1 = max(0, x1 - pad)
457
+ y1 = max(0, y1 - pad)
458
+ x2 = min(frame.shape[1], x2 + pad)
459
+ y2 = min(frame.shape[0], y2 + pad)
460
+
461
+ cropped = frame[y1:y2, x1:x2]
462
+ if cropped.size == 0:
463
+ return None
464
+
465
+ rgb_frame = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)
466
+ results = self.pose.process(rgb_frame)
467
+
468
+ if results.pose_landmarks:
469
+ for landmark in results.pose_landmarks.landmark:
470
+ landmark.x = (landmark.x * (x2 - x1) + x1) / frame.shape[1]
471
+ landmark.y = (landmark.y * (y2 - y1) + y1) / frame.shape[0]
472
+
473
+ return results
474
+ else:
475
+ rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
476
+ return self.pose.process(rgb_frame)
477
+
478
+ except Exception as e:
479
+ return None
480
+
481
+ def process_video(self, video_path, progress_callback=None):
482
+ """Process video with hybrid YOLO-MediaPipe pipeline"""
483
+
484
+ if self.yolo_model is None or self.pose is None:
485
+ st.error("❌ Detection models not available")
486
+ return None
487
+
488
+ cap = cv2.VideoCapture(str(video_path))
489
+ if not cap.isOpened():
490
+ st.error("❌ Could not open video file")
491
+ return None
492
+
493
+ fps = cap.get(cv2.CAP_PROP_FPS)
494
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
495
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
496
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
497
+
498
+ if fps <= 0 or total_frames <= 0:
499
+ st.error("❌ Invalid video properties")
500
+ cap.release()
501
+ return None
502
+
503
+ left_positions = []
504
+ right_positions = []
505
+ detection_confidence = []
506
+ frame_idx = 0
507
+
508
+ yolo_detections = 0
509
+ pose_detections = 0
510
+
511
+ st.info(f"πŸ”„ Processing with Hybrid Pipeline: {total_frames} frames")
512
+
513
+ try:
514
+ while cap.isOpened():
515
+ ret, frame = cap.read()
516
+ if not ret:
517
+ break
518
+
519
+ person_boxes = self.detect_person_yolo(frame)
520
+
521
+ if person_boxes:
522
+ yolo_detections += 1
523
+ best_box = self.person_tracker.select_best_person(person_boxes, frame_idx)
524
+ bbox = best_box[:4]
525
+ results = self.estimate_pose_mediapipe(frame, bbox)
526
+
527
+ if results and results.pose_landmarks:
528
+ pose_detections += 1
529
+ landmarks = results.pose_landmarks.landmark
530
+
531
+ left_y = landmarks[self.LEFT_HEEL].y
532
+ right_y = landmarks[self.RIGHT_HEEL].y
533
+ conf = (landmarks[self.LEFT_HEEL].visibility +
534
+ landmarks[self.RIGHT_HEEL].visibility) / 2
535
+
536
+ left_positions.append(left_y)
537
+ right_positions.append(right_y)
538
+ detection_confidence.append(conf)
539
+ else:
540
+ left_positions.append(np.nan)
541
+ right_positions.append(np.nan)
542
+ detection_confidence.append(0.0)
543
+ else:
544
+ results = self.estimate_pose_mediapipe(frame, bbox=None)
545
+
546
+ if results and results.pose_landmarks:
547
+ pose_detections += 1
548
+ landmarks = results.pose_landmarks.landmark
549
+
550
+ left_positions.append(landmarks[self.LEFT_HEEL].y)
551
+ right_positions.append(landmarks[self.RIGHT_HEEL].y)
552
+ detection_confidence.append(0.5)
553
+ else:
554
+ left_positions.append(np.nan)
555
+ right_positions.append(np.nan)
556
+ detection_confidence.append(0.0)
557
+
558
+ frame_idx += 1
559
+
560
+ if progress_callback and frame_idx % 10 == 0:
561
+ progress = min(frame_idx / total_frames, 1.0)
562
+ progress_callback(progress)
563
+
564
+ except Exception as e:
565
+ st.error(f"❌ Video processing error: {str(e)}")
566
+ cap.release()
567
+ return None
568
+
569
+ cap.release()
570
+
571
+ st.info(
572
+ f"πŸ“Š YOLO detections: {yolo_detections}/{total_frames} frames ({yolo_detections / total_frames * 100:.1f}%)")
573
+ st.info(
574
+ f"πŸ“Š Pose detections: {pose_detections}/{total_frames} frames ({pose_detections / total_frames * 100:.1f}%)")
575
+
576
+ if len(left_positions) == 0:
577
+ st.error("❌ No frames processed successfully")
578
+ return None
579
+
580
+ try:
581
+ left_series = pd.Series(left_positions).interpolate(method='linear')
582
+ left_series = left_series.bfill().ffill()
583
+ left_positions = left_series.values
584
+
585
+ right_series = pd.Series(right_positions).interpolate(method='linear')
586
+ right_series = right_series.bfill().ffill()
587
+ right_positions = right_series.values
588
+
589
+ if len(left_positions) > 5:
590
+ window = min(11, len(left_positions) if len(left_positions) % 2 == 1 else len(left_positions) - 1)
591
+ if window >= 3:
592
+ left_positions = savgol_filter(left_positions, window, 2)
593
+ right_positions = savgol_filter(right_positions, window, 2)
594
+
595
+ left_strikes = self._detect_strikes(left_positions, fps)
596
+ right_strikes = self._detect_strikes(right_positions, fps)
597
+
598
+ events = []
599
+
600
+ for frame in left_strikes:
601
+ events.append({
602
+ 'frame': int(frame),
603
+ 'timecode': self._frames_to_smpte(frame, fps),
604
+ 'foot': 'LEFT',
605
+ 'event': 'HEEL_STRIKE',
606
+ 'time_seconds': frame / fps,
607
+ 'confidence': detection_confidence[int(frame)] if int(frame) < len(detection_confidence) else 0.5
608
+ })
609
+
610
+ for frame in right_strikes:
611
+ events.append({
612
+ 'frame': int(frame),
613
+ 'timecode': self._frames_to_smpte(frame, fps),
614
+ 'foot': 'RIGHT',
615
+ 'event': 'HEEL_STRIKE',
616
+ 'time_seconds': frame / fps,
617
+ 'confidence': detection_confidence[int(frame)] if int(frame) < len(detection_confidence) else 0.5
618
+ })
619
+
620
+ events = sorted(events, key=lambda x: x['frame'])
621
+
622
+ return {
623
+ 'events': events,
624
+ 'fps': fps,
625
+ 'total_frames': total_frames,
626
+ 'width': width,
627
+ 'height': height,
628
+ 'left_positions': left_positions.tolist() if hasattr(left_positions, 'tolist') else left_positions,
629
+ 'right_positions': right_positions.tolist() if hasattr(right_positions, 'tolist') else right_positions,
630
+ 'detection_stats': {
631
+ 'yolo_detections': yolo_detections,
632
+ 'pose_detections': pose_detections,
633
+ 'total_frames': total_frames
634
+ }
635
+ }
636
+
637
+ except Exception as e:
638
+ st.error(f"❌ Data processing error: {str(e)}")
639
+ return None
640
+
641
+ def _detect_strikes(self, positions, fps):
642
+ """Detect heel strikes from position data"""
643
+ try:
644
+ peaks, _ = find_peaks(
645
+ positions,
646
+ prominence=self.thresholds['prominence'],
647
+ distance=int(fps * self.thresholds['min_interval']),
648
+ height=0.7
649
+ )
650
+ return peaks
651
+ except Exception as e:
652
+ st.warning(f"Peak detection failed: {str(e)}")
653
+ return np.array([])
654
+
655
+ def _frames_to_smpte(self, frame, fps):
656
+ """Convert frame number to SMPTE timecode"""
657
+ total_seconds = frame / fps
658
+ hours = int(total_seconds // 3600)
659
+ minutes = int((total_seconds % 3600) // 60)
660
+ seconds = int(total_seconds % 60)
661
+ frames = int((total_seconds * fps) % fps)
662
+ return f"{hours:02d}:{minutes:02d}:{seconds:02d}:{frames:02d}"
663
+
664
+
665
+ class PersonTracker:
666
+ """Track person across frames for consistency"""
667
+
668
+ def __init__(self, iou_threshold=0.3):
669
+ self.tracked_box = None
670
+ self.last_frame = -1
671
+ self.iou_threshold = iou_threshold
672
+
673
+ def calculate_iou(self, box1, box2):
674
+ """Calculate IoU between two bounding boxes"""
675
+ x1_1, y1_1, x2_1, y2_1 = box1[:4]
676
+ x1_2, y1_2, x2_2, y2_2 = box2[:4]
677
+
678
+ xi1 = max(x1_1, x1_2)
679
+ yi1 = max(y1_1, y1_2)
680
+ xi2 = min(x2_1, x2_2)
681
+ yi2 = min(y2_1, y2_2)
682
+
683
+ inter_area = max(0, xi2 - xi1) * max(0, yi2 - yi1)
684
+
685
+ box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
686
+ box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
687
+
688
+ union_area = box1_area + box2_area - inter_area
689
+
690
+ return inter_area / union_area if union_area > 0 else 0
691
+
692
+ def select_best_person(self, person_boxes, frame_idx):
693
+ """Select best person box for tracking consistency"""
694
+ if not person_boxes:
695
+ return None
696
+
697
+ if self.tracked_box is not None and frame_idx - self.last_frame < 10:
698
+ max_iou = 0
699
+ best_box = None
700
+
701
+ for box in person_boxes:
702
+ iou = self.calculate_iou(self.tracked_box, box)
703
+ if iou > max_iou:
704
+ max_iou = iou
705
+ best_box = box
706
+
707
+ if max_iou > self.iou_threshold:
708
+ self.tracked_box = best_box
709
+ self.last_frame = frame_idx
710
+ return best_box
711
+
712
+ best_box = max(person_boxes, key=lambda x: (x[2] - x[0]) * (x[3] - x[1]) * x[4])
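+ # (fallback when no tracked match: pick the detection with the largest area x confidence)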
713
+ self.tracked_box = best_box
714
+ self.last_frame = frame_idx
715
+ return best_box
716
+
717
+
718
+ class AudioGenerator:
719
+ """Generate footstep audio"""
720
+
721
+ def __init__(self, sample_rate=44100):
722
+ self.sample_rate = sample_rate
723
+
724
+ def generate_footstep(self, aud_path):
725
+ arr, rate = extract_second_audio_librosa(
726
+ file_path=aud_path,
727
+ target_second=5,
728
+ sample_rate=self.sample_rate
729
+ )
730
+ return arr
731
+
732
+ def create_audio_track(self, events, aud_path, duration=0.3):
733
+ total_samples = int(duration * self.sample_rate)
734
+ audio_track = np.zeros(total_samples, dtype=np.float32)
735
+
736
+ for i, event in enumerate(events):
737
+ step_sound = self.generate_footstep(aud_path)
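+ # Vary each step's playback rate by up to about +/-6% so repeated footsteps don't sound identical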
738
+ pitch_shift = 1.0 + (i % 5 - 2) * 0.03
739
+ indices = np.arange(len(step_sound)) * pitch_shift
740
+ indices = np.clip(indices, 0, len(step_sound) - 1).astype(int)
741
+ step_sound = step_sound[indices]
742
+
743
+ start_sample = int(event['time_seconds'] * self.sample_rate)
744
+ end_sample = min(start_sample + len(step_sound), total_samples)
745
+ sound_len = end_sample - start_sample
746
+
747
+ if sound_len > 0:
748
+ audio_track[start_sample:end_sample] += step_sound[:sound_len]
749
+
750
+ max_val = np.max(np.abs(audio_track))
751
+ if max_val > 0:
752
+ audio_track = audio_track / max_val * 0.8
753
+
754
+ return audio_track
755
+
756
+
757
+ def create_annotated_video(input_path, events, output_path, use_hybrid=True, progress_callback=None):
758
+ """Create annotated video with hybrid detection visualization"""
759
+
760
+ try:
761
+ cap = cv2.VideoCapture(str(input_path))
762
+ if not cap.isOpened():
763
+ st.error("❌ Could not open input video file")
764
+ return False
765
+
766
+ fps = cap.get(cv2.CAP_PROP_FPS)
767
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
768
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
769
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
770
+
771
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
772
+ out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))
773
+
774
+ if not out.isOpened():
775
+ st.error("❌ Could not create output video file")
776
+ cap.release()
777
+ return False
778
+
779
+ event_frames = {e['frame']: e for e in events}
780
+
781
+ if use_hybrid:
782
+ yolo_model = YOLO('yolov8n.pt')
783
+ mp_pose = mp.solutions.pose
784
+ pose = mp_pose.Pose(
785
+ static_image_mode=False,
786
+ model_complexity=1,
787
+ smooth_landmarks=True,
788
+ min_detection_confidence=0.5,
789
+ min_tracking_confidence=0.5
790
+ )
791
+ else:
792
+ yolo_model = None
793
+ mp_pose = mp.solutions.pose
794
+ pose = mp_pose.Pose(
795
+ static_image_mode=False,
796
+ model_complexity=1,
797
+ smooth_landmarks=True,
798
+ min_detection_confidence=0.5,
799
+ min_tracking_confidence=0.5
800
+ )
801
+
802
+ frame_idx = 0
803
+
804
+ while cap.isOpened():
805
+ ret, frame = cap.read()
806
+ if not ret:
807
+ break
808
+
809
+ try:
810
+ if use_hybrid and yolo_model:
811
+ results = yolo_model(frame, conf=0.5, classes=[0], verbose=False)
812
+ for result in results:
813
+ boxes = result.boxes
814
+ for box in boxes:
815
+ x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
816
+ conf = box.conf[0].cpu().numpy()
817
+
818
+ cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)),
819
+ (255, 255, 0), 2)
820
+ cv2.putText(frame, f'YOLO: {conf:.2f}',
821
+ (int(x1), int(y1) - 10),
822
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 0), 2)
823
+
824
+ results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
825
+
826
+ if results.pose_landmarks:
827
+ mp.solutions.drawing_utils.draw_landmarks(
828
+ frame,
829
+ results.pose_landmarks,
830
+ mp_pose.POSE_CONNECTIONS,
831
+ landmark_drawing_spec=mp.solutions.drawing_utils.DrawingSpec(
832
+ color=(0, 255, 0), thickness=2, circle_radius=2
833
+ ),
834
+ connection_drawing_spec=mp.solutions.drawing_utils.DrawingSpec(
835
+ color=(255, 255, 255), thickness=2
836
+ )
837
+ )
838
+
839
+ if frame_idx in event_frames:
840
+ event = event_frames[frame_idx]
841
+
842
+ banner_height = 100
843
+ cv2.rectangle(frame, (0, 0), (width, banner_height), (0, 0, 0), -1)
844
+
845
+ text = f"{event['foot']} HEEL STRIKE"
846
+ color = (0, 255, 0) if event['foot'] == 'LEFT' else (0, 100, 255)
847
+
848
+ cv2.putText(frame, text, (50, 50),
849
+ cv2.FONT_HERSHEY_SIMPLEX, 1.5, color, 3)
850
+
851
+ if 'confidence' in event:
852
+ conf_text = f"Conf: {event['confidence']:.2f}"
853
+ cv2.putText(frame, conf_text, (50, 85),
854
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
855
+
856
+ circle_x = 50 if event['foot'] == 'LEFT' else width - 50
857
+ cv2.circle(frame, (circle_x, height - 100), 40, color, -1)
858
+
859
+ if use_hybrid:
860
+ cv2.rectangle(frame, (width - 250, 10), (width - 10, 50), (102, 126, 234), -1)
861
+ cv2.putText(frame, "HYBRID MODE", (width - 240, 35),
862
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
863
+
864
+ time_seconds = frame_idx / fps
865
+ hours = int(time_seconds // 3600)
866
+ minutes = int((time_seconds % 3600) // 60)
867
+ seconds = int(time_seconds % 60)
868
+ frame_num = int((time_seconds * fps) % fps)
869
+ timecode = f"TC: {hours:02d}:{minutes:02d}:{seconds:02d}:{frame_num:02d}"
870
+
871
+ cv2.rectangle(frame, (0, height - 80), (400, height), (0, 0, 0), -1)
872
+ cv2.putText(frame, timecode, (10, height - 30),
873
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
874
+ cv2.putText(frame, f"Frame: {frame_idx}/{total_frames}", (10, height - 55),
875
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
876
+
877
+ out.write(frame)
878
+ frame_idx += 1
879
+
880
+ if progress_callback and frame_idx % 5 == 0:
881
+ progress = min(frame_idx / total_frames, 1.0)
882
+ progress_callback(progress)
883
+
884
+ except Exception as e:
885
+ st.warning(f"⚠️ Error processing frame {frame_idx}: {str(e)}")
886
+ frame_idx += 1
887
+ continue
888
+
889
+ cap.release()
890
+ out.release()
891
+ pose.close()
892
+
893
+ return True
894
+
895
+ except Exception as e:
896
+ st.error(f"❌ Video annotation failed: {str(e)}")
897
+ try:
898
+ cap.release()
899
+ out.release()
900
+ pose.close()
901
+ except Exception:
902
+ pass
903
+ return False
904
+
905
+
906
+ def merge_audio_with_video(video_path, audio_track, sample_rate, output_path):
907
+ """Merge audio with video using FFmpeg"""
908
+
909
+ temp_audio = tempfile.mktemp(suffix='.wav')
910
+ sf.write(temp_audio, audio_track, sample_rate)
911
+
912
+ ffmpeg_cmd = FFMPEG_PATH if FFMPEG_PATH else "ffmpeg"
913
+
914
+ cmd = [
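+ # Take video from input 0 and the generated track from input 1, re-encode, and stop at the shorter stream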
915
+ ffmpeg_cmd, '-y',
916
+ '-i', str(video_path),
917
+ '-i', temp_audio,
918
+ '-map', '0:v', '-map', '1:a',
919
+ '-c:v', 'libx264', '-preset', 'medium',
920
+ '-c:a', 'aac', '-b:a', '192k',
921
+ '-shortest',
922
+ str(output_path)
923
+ ]
924
+
925
+ try:
926
+ if FFMPEG_PATH is None:
927
+ st.warning("FFmpeg not found. Using fallback method.")
928
+ return None
929
+
930
+ result = subprocess.run(cmd, check=True, capture_output=True, text=True, timeout=300)  # re-encoding can easily exceed 30s
931
+ return True
932
+
933
+ except subprocess.CalledProcessError as e:
934
+ st.error(f"FFmpeg error: {e.stderr}")
935
+ return False
936
+ except subprocess.TimeoutExpired:
937
+ st.error("FFmpeg timed out")
938
+ return False
939
+ finally:
940
+ if os.path.exists(temp_audio):
941
+ os.remove(temp_audio)
942
+
943
+
944
+ def live_streaming_mode():
945
+ """Live streaming mode with frame capture and real-time detection"""
946
+
947
+ st.markdown('<h2>πŸ“Ή Live Streaming Mode</h2>', unsafe_allow_html=True)
948
+ st.info("πŸŽ₯ This mode allows real-time footstep detection with your device camera")
949
+
950
+ # Initialize session state
951
+ if 'floor_frame_captured' not in st.session_state:
952
+ st.session_state.floor_frame_captured = False
953
+ if 'audio_downloaded' not in st.session_state:
954
+ st.session_state.audio_downloaded = False
955
+ if 'live_audio_path' not in st.session_state:
956
+ st.session_state.live_audio_path = None
957
+ if 'live_detector' not in st.session_state:
958
+ st.session_state.live_detector = None
959
+ if 'camera_active' not in st.session_state:
960
+ st.session_state.camera_active = False
961
+
962
+ # Step 1: Capture floor frame
963
+ st.markdown("### Step 1: Capture Floor Frame πŸ“Έ")
964
+ st.write("Capture a single frame showing the floor surface for audio analysis")
965
+
966
+ col1, col2 = st.columns([2, 1])
967
+
968
+ with col1:
969
+ # Camera input for frame capture
970
+ camera_image = st.camera_input("Capture floor image", key="floor_capture")
971
+
972
+ if camera_image is not None and not st.session_state.floor_frame_captured:
973
+ # Save captured frame
974
+ image = Image.open(camera_image)
975
+ temp_frame_path = tempfile.mktemp(suffix='.jpg')
976
+ image.save(temp_frame_path)
977
+ st.session_state.floor_frame_path = temp_frame_path
978
+
979
+ # Display captured frame
980
+ st.image(image, caption="Captured Floor Frame", use_container_width=True)
981
+
982
+ if st.button("βœ… Confirm Floor Capture", type="primary", use_container_width=True):
983
+ st.session_state.floor_frame_captured = True
984
+ st.success("βœ… Floor frame captured successfully!")
985
+ st.rerun()
986
+
987
+ with col2:
988
+ if st.session_state.floor_frame_captured:
989
+ st.markdown('<div class="success-box">βœ… Floor Captured</div>', unsafe_allow_html=True)
990
+ else:
991
+ st.info("πŸ“Έ Capture floor frame to proceed")
992
+
993
+ # Step 2: Analyze and download audio
994
+ if st.session_state.floor_frame_captured and not st.session_state.audio_downloaded:
995
+ st.markdown("---")
996
+ st.markdown("### Step 2: Analyze Floor & Download Audio πŸ”Š")
997
+
998
+ col1, col2 = st.columns([2, 1])
999
+
1000
+ with col1:
1001
+ if st.button("πŸ” Analyze Floor & Generate Audio", type="primary", use_container_width=True):
1002
+ with st.spinner("πŸ”„ Analyzing floor surface and generating audio..."):
1003
+ try:
1004
+ # Create temporary video from frame for processing
1005
+ temp_video = tempfile.mktemp(suffix='.mp4')
1006
+
1007
+ # Create 1-second video from the captured frame
1008
+ img = cv2.imread(st.session_state.floor_frame_path)
1009
+ height, width = img.shape[:2]
1010
+
1011
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
1012
+ out = cv2.VideoWriter(temp_video, fourcc, 30, (width, height))
1013
+
1014
+ # Write 30 frames (1 second at 30fps)
1015
+ for _ in range(30):
1016
+ out.write(img)
1017
+ out.release()
1018
+
1019
+ # Process video for footstep audio
1020
+ st.info("🎡 Generating footstep audio based on floor analysis...")
1021
+
1022
+ aud_path="audio/Footsteps on Gravel Path Outdoor.mp3"
1023
+
1024
+ st.session_state.live_audio_path = aud_path
1025
+ st.session_state.audio_downloaded = True
1026
+
1027
+ # Clean up temp video
1028
+ if os.path.exists(temp_video):
1029
+ os.remove(temp_video)
1030
+
1031
+ st.success("βœ… Audio generated successfully!")
1032
+ st.balloons()
1033
+ st.rerun()
1034
+
1035
+ except Exception as e:
1036
+ st.error(f"❌ Error generating audio: {str(e)}")
1037
+
1038
+ with col2:
1039
+ st.info("🎡 Audio will be generated based on floor type")
1040
+
1041
+ # Step 3: Initialize live detector
1042
+ if st.session_state.audio_downloaded and st.session_state.live_detector is None:
1043
+ st.markdown("---")
1044
+ st.markdown("### Step 3: Initialize Live Detection πŸš€")
1045
+
1046
+ col1, col2 = st.columns([2, 1])
1047
+
1048
+ with col1:
1049
+ sensitivity = st.select_slider(
1050
+ "Detection Sensitivity",
1051
+ options=['low', 'medium', 'high'],
1052
+ value='medium'
1053
+ )
1054
+
1055
+ yolo_conf = st.slider(
1056
+ "YOLO Confidence",
1057
+ min_value=0.1,
1058
+ max_value=0.9,
1059
+ value=0.5,
1060
+ step=0.05
1061
+ )
1062
+
1063
+ if st.button("🎬 Initialize Live Detector", type="primary", use_container_width=True):
1064
+ with st.spinner("βš™οΈ Initializing detector..."):
1065
+ try:
1066
+ detector = LiveFootstepDetector(
1067
+ audio_path=st.session_state.live_audio_path,
1068
+ sensitivity=sensitivity,
1069
+ yolo_conf=yolo_conf
1070
+ )
1071
+ st.session_state.live_detector = detector
1072
+ st.success("βœ… Live detector initialized!")
1073
+ st.rerun()
1074
+ except Exception as e:
1075
+ st.error(f"❌ Failed to initialize detector: {str(e)}")
1076
+
1077
+ with col2:
1078
+ st.info("πŸ€– Configure detection parameters")
1079
+
1080
+ # Step 4: Start live detection
1081
+ if st.session_state.live_detector is not None:
1082
+ st.markdown("---")
1083
+ st.markdown('<div class="ready-badge">βœ… SYSTEM READY</div>', unsafe_allow_html=True)
1084
+ st.markdown("### Step 4: Live Detection 🎯")
1085
+
1086
+ col1, col2 = st.columns([3, 1])
1087
+
1088
+ with col1:
1089
+ st.write("πŸ“Ή **Camera is ready for live footstep detection**")
1090
+ st.write("🚢 Walk in front of the camera and hear footsteps in real-time!")
1091
+
1092
+ # Start/Stop controls
1093
+ col_a, col_b = st.columns(2)
1094
+
1095
+ with col_a:
1096
+ if not st.session_state.camera_active:
1097
+ if st.button("▢️ Start Live Detection", type="primary", use_container_width=True):
1098
+ st.session_state.camera_active = True
1099
+ st.session_state.live_detector.start()
1100
+ st.rerun()
1101
+
1102
+ with col_b:
1103
+ if st.session_state.camera_active:
1104
+ if st.button("⏹️ Stop Detection", type="secondary", use_container_width=True):
1105
+ st.session_state.camera_active = False
1106
+ st.session_state.live_detector.stop()
1107
+ st.rerun()
1108
+
1109
+ with col2:
1110
+ if st.session_state.camera_active:
1111
+ st.markdown('<div class="live-indicator">πŸ”΄ LIVE</div>', unsafe_allow_html=True)
1112
+ else:
1113
+ st.info("⏸️ Paused")
1114
+
1115
+ # Live video feed
1116
+ if st.session_state.camera_active:
1117
+ st.markdown("---")
1118
+
1119
+ FRAME_WINDOW = st.image([])
1120
+
1121
+ cap = cv2.VideoCapture(0)
1122
+
1123
+ if not cap.isOpened():
1124
+ st.error("❌ Cannot access camera. Please check permissions.")
1125
+ st.session_state.camera_active = False
1126
+ else:
1127
+ st.info("πŸ“Ή Live feed active - Walk to generate footsteps!")
1128
+
1129
+ # Statistics
1130
+ step_counter = st.empty()
1131
+ left_steps = 0
1132
+ right_steps = 0
1133
+
1134
+ try:
1135
+ while st.session_state.camera_active:
1136
+ ret, frame = cap.read()
1137
+
1138
+ if not ret:
1139
+ st.error("❌ Failed to read from camera")
1140
+ break
1141
+
1142
+ # Process frame
1143
+ processed_frame, detected_foot = st.session_state.live_detector.process_frame(frame)
1144
+
1145
+ # Update counters
1146
+ if detected_foot == 'LEFT':
1147
+ left_steps += 1
1148
+ elif detected_foot == 'RIGHT':
1149
+ right_steps += 1
1150
+
1151
+ # Display frame
1152
+ FRAME_WINDOW.image(cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB))
1153
+
1154
+ # Update statistics
1155
+ step_counter.metric("Total Steps Detected", left_steps + right_steps,
1156
+ f"L: {left_steps} | R: {right_steps}")
1157
+
1158
+ # Check if user stopped
1159
+ if not st.session_state.camera_active:
1160
+ break
1161
+
1162
+ time.sleep(0.033) # ~30 FPS
1163
+
1164
+ except Exception as e:
1165
+ st.error(f"❌ Error during live detection: {str(e)}")
1166
+
1167
+ finally:
1168
+ cap.release()
1169
+ st.session_state.live_detector.stop()
1170
+
1171
+ # Reset button
1172
+ st.markdown("---")
1173
+ if st.button("πŸ”„ Reset All", use_container_width=True):
1174
+ st.session_state.floor_frame_captured = False
1175
+ st.session_state.audio_downloaded = False
1176
+ st.session_state.live_audio_path = None
1177
+ st.session_state.live_detector = None
1178
+ st.session_state.camera_active = False
1179
+ st.rerun()
1180
+
1181
+
1182
+ def video_upload_mode():
1183
+ """Original video upload mode"""
1184
+
1185
+ st.markdown('<h2>πŸ“€ Video Upload Mode</h2>', unsafe_allow_html=True)
1186
+
1187
+ # Sidebar configuration
1188
+ sensitivity = st.sidebar.select_slider(
1189
+ "Footstep Sensitivity",
1190
+ options=['low', 'medium', 'high'],
1191
+ value='medium',
1192
+ help="Higher sensitivity detects more subtle footsteps"
1193
+ )
1194
+
1195
+ yolo_conf = st.sidebar.slider(
1196
+ "YOLO Confidence",
1197
+ min_value=0.1,
1198
+ max_value=0.9,
1199
+ value=0.5,
1200
+ step=0.05,
1201
+ help="Confidence threshold for YOLO person detection"
1202
+ )
1203
+
1204
+ surface_type = st.sidebar.selectbox(
1205
+ "Surface Type",
1206
+ ['concrete', 'wood', 'grass', 'gravel', 'metal'],
1207
+ help="Select surface for audio generation"
1208
+ )
1209
+
1210
+ use_hybrid = st.sidebar.checkbox(
1211
+ "Enable Hybrid Mode",
1212
+ value=True,
1213
+ help="Use YOLO for person detection + MediaPipe for pose estimation"
1214
+ )
1215
+
1216
+ create_annotated = st.sidebar.checkbox("Create Annotated Video", value=True)
1217
+ add_audio = st.sidebar.checkbox("Add Footstep Audio", value=True)
1218
+
1219
+ # File uploader
1220
+ uploaded_file = st.file_uploader(
1221
+ "πŸ“€ Upload Video File",
1222
+ type=['mp4', 'avi', 'mov', 'mkv'],
1223
+ help="Upload a video file to detect footsteps"
1224
+ )
1225
+
1226
+ if uploaded_file:
1227
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp_file:
1228
+ tmp_file.write(uploaded_file.read())
1229
+ video_path = tmp_file.name
1230
+
1231
+ col1, col2 = st.columns([2, 1])
1232
+
1233
+ with col1:
1234
+ st.subheader("πŸ“Ή Input Video")
1235
+ st.video(video_path)
1236
+
1237
+ with col2:
1238
+ st.subheader("ℹ️ Video Info")
1239
+ cap = cv2.VideoCapture(video_path)
1240
+ video_info = {
1241
+ "Duration": f"{cap.get(cv2.CAP_PROP_FRAME_COUNT) / cap.get(cv2.CAP_PROP_FPS):.2f}s",
1242
+ "FPS": f"{cap.get(cv2.CAP_PROP_FPS):.2f}",
1243
+ "Resolution": f"{int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))}x{int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))}",
1244
+ "Frames": int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
1245
+ }
1246
+ cap.release()
1247
+
1248
+ for key, value in video_info.items():
1249
+ st.metric(key, value)
1250
+
1251
+ if use_hybrid:
1252
+ st.success("πŸ€– Hybrid Mode Active")
1253
+ else:
1254
+ st.info("πŸ“Š MediaPipe Only")
1255
+
1256
+ st.markdown("---")
1257
+
1258
+ if st.button("πŸš€ Process Video", type="primary", use_container_width=True):
1259
+
1260
+ if use_hybrid:
1261
+ st.info("πŸ”„ Running Hybrid YOLO-MediaPipe Pipeline...")
1262
+ pipeline = HybridFootstepDetectionPipeline(
1263
+ fps=float(video_info["FPS"]),
1264
+ sensitivity=sensitivity,
1265
+ yolo_conf=yolo_conf
1266
+ )
1267
+ else:
1268
+ st.info("πŸ”„ Running MediaPipe-Only Pipeline...")
1269
+ pipeline = HybridFootstepDetectionPipeline(
1270
+ fps=float(video_info["FPS"]),
1271
+ sensitivity=sensitivity,
1272
+ yolo_conf=yolo_conf
1273
+ )
1274
+
1275
+ with st.spinner("πŸ” Detecting footsteps..."):
1276
+ progress_bar = st.progress(0)
1277
+ status_text = st.empty()
1278
+
1279
+ def update_progress(val):
1280
+ progress_bar.progress(val)
1281
+ status_text.text(f"Processing: {int(val * 100)}%")
1282
+
1283
+ results = pipeline.process_video(video_path, update_progress)
1284
+ st.session_state['results'] = results
1285
+ st.session_state['video_path'] = video_path
1286
+ st.session_state['use_hybrid'] = use_hybrid
1287
+
1288
+ progress_bar.empty()
1289
+ status_text.empty()
1290
+
1291
+ if results:
1292
+ st.markdown('<div class="success-box">βœ… Footstep detection complete!</div>',
1293
+ unsafe_allow_html=True)
1294
+ st.success(f"Detected **{len(results['events'])}** footstep events")
1295
+
1296
+ if 'detection_stats' in results:
1297
+ stats = results['detection_stats']
1298
+ col1, col2, col3 = st.columns(3)
1299
+ col1.metric("YOLO Detections",
1300
+ f"{stats['yolo_detections']}/{stats['total_frames']}")
1301
+ col2.metric("Pose Detections",
1302
+ f"{stats['pose_detections']}/{stats['total_frames']}")
1303
+ col3.metric("Success Rate",
1304
+ f"{stats['pose_detections'] / stats['total_frames'] * 100:.1f}%")
1305
+
1306
+ # Display results (existing code continues...)
1307
+ if 'results' in st.session_state:
1308
+ results = st.session_state['results']
1309
+
1310
+ st.markdown("---")
1311
+ st.subheader("πŸ“Š Detection Results")
1312
+
1313
+ col1, col2, col3, col4 = st.columns(4)
1314
+
1315
+ left_count = len([e for e in results['events'] if e['foot'] == 'LEFT'])
1316
+ right_count = len([e for e in results['events'] if e['foot'] == 'RIGHT'])
1317
+ avg_cadence = len(results['events']) / (results['total_frames'] / results['fps']) * 60
1318
+ avg_conf = np.mean([e.get('confidence', 0.5) for e in results['events']])
1319
+
1320
+ col1.metric("Total Events", len(results['events']))
1321
+ col2.metric("Left Foot", left_count)
1322
+ col3.metric("Right Foot", right_count)
1323
+ col4.metric("Avg Confidence", f"{avg_conf:.2f}")
1324
+
1325
+ st.metric("Average Cadence", f"{avg_cadence:.1f} steps/min")
1326
+
1327
+ st.subheader("πŸ“‹ Detected Events")
1328
+ events_df = pd.DataFrame(results['events'])
1329
+
1330
+ if not events_df.empty:
1331
+ st.dataframe(
1332
+ events_df.style.apply(
1333
+ lambda x: ['background-color: #e8f5e9' if x.foot == 'LEFT'
1334
+ else 'background-color: #fff3e0' for _ in x],
1335
+ axis=1
1336
+ ),
1337
+ use_container_width=True,
1338
+ height=300
1339
+ )
1340
+
1341
+ st.subheader("πŸ’Ύ Export Options")
1342
+
1343
+ col1, col2, col3 = st.columns(3)
1344
+
1345
+ with col1:
1346
+ csv = events_df.to_csv(index=False)
1347
+ st.download_button(
1348
+ "πŸ“„ Download CSV",
1349
+ csv,
1350
+ f"footsteps_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
1351
+ "text/csv",
1352
+ use_container_width=True
1353
+ )
1354
+
1355
+ with col2:
1356
+ json_data = json.dumps(results['events'], indent=2)
1357
+ st.download_button(
1358
+ "πŸ“‹ Download JSON",
1359
+ json_data,
1360
+ f"footsteps_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
1361
+ "application/json",
1362
+ use_container_width=True
1363
+ )
1364
+
1365
+ with col3:
1366
+ timecode_text = "\n".join([
1367
+ f"{e['timecode']}\t{e['foot']}\t{e['event']}\t{e.get('confidence', 0.5):.2f}"
1368
+ for e in results['events']
1369
+ ])
1370
+ st.download_button(
1371
+ "⏱️ Download Timecode",
1372
+ timecode_text,
1373
+ f"timecode_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
1374
+ "text/plain",
1375
+ use_container_width=True
1376
+ )
1377
+
1378
+ st.markdown("---")
1379
+ st.subheader("πŸŽ₯ Generate Output Video")
1380
+
1381
+ col1, col2 = st.columns(2)
1382
+
1383
+ with col1:
1384
+ if create_annotated and st.button("Create Annotated Video", use_container_width=True):
1385
+ with st.spinner("Creating annotated video..."):
1386
+ annotated_path = tempfile.mktemp(suffix='_annotated.mp4')
1387
+ progress_bar = st.progress(0)
1388
+
1389
+ success = create_annotated_video(
1390
+ st.session_state['video_path'],
1391
+ results['events'],
1392
+ annotated_path,
1393
+ use_hybrid=st.session_state.get('use_hybrid', False),
1394
+ progress_callback=lambda v: progress_bar.progress(v)
1395
+ )
1396
+
1397
+ if success:
1398
+ st.session_state['annotated_video'] = annotated_path
1399
+ progress_bar.empty()
1400
+ st.success("βœ… Annotated video ready!")
1401
+ else:
1402
+ st.error("❌ Failed to create annotated video")
1403
+
1404
+ with col2:
1405
+ if add_audio and st.button("Generate with Audio", use_container_width=True):
1406
+ with st.spinner("Generating audio and merging..."):
1407
+ audio_gen = AudioGenerator()
1408
+ aud_path="audio/Footsteps on Gravel Path Outdoor.mp3"
1409
+ duration = results['total_frames'] / results['fps']
1410
+ audio_track = audio_gen.create_audio_track(
1411
+ results['events'],
1412
+ aud_path,
1413
+ duration
1414
+ )
1415
+
1416
+ temp_video = tempfile.mktemp(suffix='_temp.mp4')
1417
+ progress_bar = st.progress(0)
1418
+
1419
+ create_annotated_video(
1420
+ st.session_state['video_path'],
1421
+ results['events'],
1422
+ temp_video,
1423
+ use_hybrid=st.session_state.get('use_hybrid', False),
1424
+ progress_callback=lambda v: progress_bar.progress(v * 0.7)
1425
+ )
1426
+
1427
+ final_output = tempfile.mktemp(suffix='_final.mp4')
1428
+ success = merge_audio_with_video(
1429
+ temp_video,
1430
+ audio_track,
1431
+ 44100,
1432
+ final_output
1433
+ )
1434
+
1435
+ progress_bar.progress(1.0)
1436
+ progress_bar.empty()
1437
+
1438
+ if success:
1439
+ st.session_state['final_video'] = final_output
1440
+ st.success("βœ… Video with audio ready!")
1441
+ else:
1442
+ st.error("❌ Failed to merge audio")
1443
+
1444
+ if 'annotated_video' in st.session_state:
1445
+ st.markdown("---")
1446
+ st.subheader("πŸ“Ί Annotated Video")
1447
+ st.video(st.session_state['annotated_video'])
1448
+
1449
+ with open(st.session_state['annotated_video'], 'rb') as f:
1450
+ st.download_button(
1451
+ "πŸ“₯ Download Annotated Video",
1452
+ f,
1453
+ f"annotated_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4",
1454
+ "video/mp4",
1455
+ use_container_width=True
1456
+ )
1457
+
1458
+ if 'final_video' in st.session_state:
1459
+ st.markdown("---")
1460
+ st.subheader("πŸ”Š Final Video with Audio")
1461
+ st.video(st.session_state['final_video'])
1462
+
1463
+ with open(st.session_state['final_video'], 'rb') as f:
1464
+ st.download_button(
1465
+ "πŸ“₯ Download Final Video",
1466
+ f,
1467
+ f"final_with_audio_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4",
1468
+ "video/mp4",
1469
+ use_container_width=True
1470
+ )
1471
+
1472
+
1473
+ def main():
1474
+ st.markdown('<h1 class="main-header">🎬 Hybrid YOLO-MediaPipe Footstep Detection</h1>',
1475
+ unsafe_allow_html=True)
1476
+ st.markdown('<div class="hybrid-badge">πŸš€ YOLO Person Detection + MediaPipe Pose Estimation</div>',
1477
+ unsafe_allow_html=True)
1478
+ st.markdown("### Advanced AI-Powered Foley Tool with Dual-Stage Detection Pipeline")
1479
+
1480
+ # Mode selection
1481
+ st.markdown("---")
1482
+ st.markdown("## 🎯 Select Mode")
1483
+
1484
+ col1, col2 = st.columns(2)
1485
+
1486
+ with col1:
1487
+ if st.button("πŸ“€ Video Upload Mode", use_container_width=True, type="primary"):
1488
+ st.session_state.mode = 'upload'
1489
+
1490
+ with col2:
1491
+ if st.button("πŸ“Ή Live Streaming Mode", use_container_width=True, type="primary"):
1492
+ st.session_state.mode = 'live'
1493
+
1494
+ # Initialize mode
1495
+ if 'mode' not in st.session_state:
1496
+ st.session_state.mode = 'upload'
1497
+
1498
+ st.markdown("---")
1499
+
1500
+ # Display selected mode
1501
+ if st.session_state.mode == 'upload':
1502
+ video_upload_mode()
1503
+ else:
1504
+ live_streaming_mode()
1505
+
1506
+ # Sidebar info
1507
+ with st.sidebar:
1508
+ st.markdown("---")
1509
+ st.markdown(f"### 🎯 Current Mode: **{st.session_state.mode.upper()}**")
1510
+
1511
+ if st.session_state.mode == 'live':
1512
+ st.markdown("---")
1513
+ st.markdown("### πŸ“Ή Live Mode Guide")
1514
+ st.markdown("""
1515
+ **Steps:**
1516
+ 1. πŸ“Έ **Capture Floor Frame**
1517
+ - Point camera at floor
1518
+ - Capture clear image
1519
+
1520
+ 2. πŸ”Š **Generate Audio**
1521
+ - AI analyzes floor type
1522
+ - Downloads matching sound
1523
+
1524
+ 3. βœ… **System Ready**
1525
+ - Real-time detection active
1526
+ - Walk and hear footsteps!
1527
+
1528
+ **Tips:**
1529
+ - Good lighting needed
1530
+ - Clear floor view
1531
+ - Stand 2-3 meters away
1532
+ - Walk naturally
1533
+ """)
1534
+
1535
+ st.markdown("---")
1536
+ st.markdown("### πŸ€– Hybrid Pipeline")
1537
+ st.markdown("""
1538
+ **Stage 1: YOLO Detection**
1539
+ - Detects person in frame
1540
+ - Provides bounding box
1541
+ - Tracks across frames
1542
+
1543
+ **Stage 2: MediaPipe Pose**
1544
+ - Estimates pose on detected region
1545
+ - Extracts heel landmarks
1546
+ - Higher accuracy & speed
1547
+
1548
+ **Benefits:**
1549
+ - βœ… More robust detection
1550
+ - βœ… Better occlusion handling
1551
+ - βœ… Faster processing
1552
+ - βœ… Improved accuracy
1553
+ """)
1554
+
1555
+ st.markdown("---")
1556
+ st.markdown("### ℹ️ System Info")
1557
+ st.markdown("""
1558
+ **Detection Engines:**
1559
+ - YOLOv8 (Person Detection)
1560
+ - MediaPipe Pose v2 (Pose Estimation)
1561
+
1562
+ **Features:**
1563
+ - Dual-stage AI pipeline
1564
+ - Person tracking
1565
+ - Frame-accurate timing
1566
+ - Confidence scoring
1567
+ - Real-time live detection
1568
+ - Autonomous audio generation
1569
+ """)
1570
+
1571
+
1572
+ if __name__ == "__main__":
1573
+ main()
requirements.txt ADDED
@@ -0,0 +1,33 @@
1
+ # Core dependencies
2
+ streamlit==1.31.1
3
+ fastapi==0.109.0
4
+ uvicorn[standard]==0.27.0
5
+ python-multipart==0.0.9
6
+
7
+ # Computer Vision & AI
8
+ opencv-python-headless==4.9.0.80
9
+ mediapipe==0.10.9
10
+ ultralytics==8.1.0
11
+ Pillow==10.2.0
12
+
13
+ # Data Processing
14
+ numpy==1.24.3
15
+ pandas==2.2.0
16
+ scipy==1.12.0
17
+
18
+ # Audio Processing
19
+ soundfile==0.12.1
20
+ librosa==0.10.1
21
+
22
+ # LangChain & AI
23
+ langchain-core==0.1.23
24
+ pydantic==2.6.0
25
+
26
+ # API & Utilities
27
+ requests==2.31.0
28
+ python-dotenv==1.0.1
29
+ beautifulsoup4==4.12.3
30
+ yt-dlp==2024.3.10
31
+
32
+ # Logging (absl is used to silence MediaPipe/TensorFlow warnings)
33
+ absl-py==2.1.0
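+
+ # Note: reel.py's live mode also imports pyaudio for real-time playback;
+ # it is not pinned here and must be installed separately if live mode is used.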
sound_agent.py ADDED
@@ -0,0 +1,198 @@
1
+ import os
2
+ import yt_dlp
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ import re
6
+ import subprocess
7
+
8
+ # Set the path to your FFmpeg executable - prioritize system ffmpeg for Docker
9
+ def get_ffmpeg_path():
10
+ """Get FFmpeg path with fallback options"""
11
+ possible_paths = [
12
+ "ffmpeg", # System ffmpeg (Docker/Linux)
13
+ r"C:\Users\abhiv\OneDrive\Desktop\agentic ai\SoundFeet\ffmpeg-7.1-essentials_build\bin\ffmpeg.exe",
14
+ "./ffmpeg-7.1-essentials_build/bin/ffmpeg.exe",
15
+ ]
16
+ for path in possible_paths:
17
+ try:
18
+ if path == "ffmpeg" or not path.endswith('.exe'):
19
+ result = subprocess.run([path, '-version'], capture_output=True, timeout=5)
20
+ if result.returncode == 0:
21
+ return path
22
+ elif os.path.exists(path):
23
+ return path
24
+ except Exception:
25
+ continue
26
+ return "ffmpeg"
27
+
28
+ FFMPEG_PATH = get_ffmpeg_path()
29
+
30
+ def create_audio_folder():
31
+ """Create audio folder if it doesn't exist"""
32
+ if not os.path.exists("audio"):
33
+ os.makedirs("audio")
34
+ return "audio"
35
+
36
+
37
+ def check_ffmpeg():
38
+ """Check if FFmpeg is available at the specified path"""
39
+ if not os.path.exists(FFMPEG_PATH):
40
+ print(f"❌ FFmpeg not found at: {FFMPEG_PATH}")
41
+ print("Please check the path and make sure FFmpeg is installed.")
42
+ return False
43
+ print(f"βœ… FFmpeg found at: {FFMPEG_PATH}")
44
+ return True
45
+
46
+
47
+ def search_and_download_audio(audio_name):
48
+ """Search and download audio using yt-dlp's built-in search"""
49
+ audio_folder = create_audio_folder()
50
+ sanitized_name = sanitize_filename(audio_name)
51
+
52
+ # Configure yt-dlp with FFmpeg path
53
+ ydl_opts = {
54
+ 'format': 'bestaudio/best',
55
+ 'outtmpl': f'{audio_folder}/{sanitized_name}.%(ext)s',
56
+ 'postprocessors': [{
57
+ 'key': 'FFmpegExtractAudio',
58
+ 'preferredcodec': 'mp3',
59
+ 'preferredquality': '192',
60
+ }],
61
+ 'ffmpeg_location': os.path.dirname(FFMPEG_PATH) or None, # empty dirname (system "ffmpeg") -> let yt-dlp search PATH
62
+ 'default_search': 'ytsearch', # Use YouTube search
63
+ 'noplaylist': True, # Download only single video, not playlist
64
+ }
65
+
66
+ try:
67
+ print(f"πŸ” Searching for '{audio_name}' on YouTube...")
68
+
69
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
70
+ # Search and download the first result
71
+ search_query = f"{audio_name} audio"
72
+ ydl.download([search_query])
73
+
74
+ # Check if file was created
75
+ mp3_file = os.path.join(audio_folder, f"{sanitized_name}.mp3")
76
+ if os.path.exists(mp3_file):
77
+ file_size = os.path.getsize(mp3_file) / (1024 * 1024) # Size in MB
78
+ print(f"βœ… Audio '{sanitized_name}' downloaded successfully! ({file_size:.2f} MB)")
79
+ return ydl_opts['outtmpl']
80
+ else:
81
+ print("❌ Downloaded file not found.")
82
+ return False
83
+
84
+ except yt_dlp.utils.DownloadError as e:
85
+ print(f"❌ Download error: {e}")
86
+ return False
87
+ except Exception as e:
88
+ print(f"❌ Unexpected error: {e}")
89
+ return False
+
+
+ def search_youtube_improved(audio_name):
+     """Alternative search method with better headers"""
+     search_query = f"{audio_name} audio"
+     url = f"https://www.youtube.com/results?search_query={search_query.replace(' ', '+')}"
+
+     try:
+         headers = {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
+             'Accept-Encoding': 'gzip, deflate',
+             'Connection': 'keep-alive',
+         }
+
+         response = requests.get(url, headers=headers, timeout=10)
+         response.raise_for_status()
+
+         # Extract video IDs using regex from the page source
+         video_ids = re.findall(r'watch\?v=([a-zA-Z0-9_-]{11})', response.text)
+
+         # Remove duplicates and create full URLs
+         video_links = []
+         for video_id in video_ids:
+             video_url = f"https://www.youtube.com/watch?v={video_id}"
+             if video_url not in video_links:
+                 video_links.append(video_url)
+
+         return video_links[:5]  # Return top 5 results
+
+     except Exception as e:
+         print(f"❌ Error searching YouTube: {e}")
+         return []
+
+
+ def sanitize_filename(name):
+     """Remove invalid characters from filename"""
+     invalid_chars = '<>:"/\\|?*'
+     for char in invalid_chars:
+         name = name.replace(char, '')
+     return name.strip()
+
+
+ def main_sound(audio_name):
+     print("🎵 Audio Downloader")
+     print("=" * 40)
+
+     # Check FFmpeg availability first
+     if not check_ffmpeg():
+         return None
+     if not audio_name:
+         print("❌ Please enter a valid audio name.")
+         return None
+
+     # Try the direct download method first (more reliable)
+     print("\n🔄 Trying direct download method...")
+     file_path = search_and_download_audio(audio_name)
+     if file_path:
+         print(f"🎉 Success! Audio saved as '{sanitize_filename(audio_name)}.mp3'")
+         return file_path
+     else:
+         print("\n🔄 Direct method failed, trying alternative search...")
+
+     # Try alternative search method
+     video_urls = search_youtube_improved(audio_name)
+
+     if not video_urls:
+         print("❌ No audio found. Please try a different name.")
+         print("💡 Try more specific terms like: 'city street sounds', 'footsteps on pavement', 'urban ambient noise'")
+         return None
+
+     print(f"📥 Found {len(video_urls)} results. Downloading the first one...")
+
+     # Download using the traditional method
+     file_path = download_audio_direct(audio_name, video_urls[0])
+     if file_path:
+         print("🎉 Audio saved in 'audio' folder!")
+         return file_path
+     else:
+         print("❌ All download methods failed.")
+         return None
+
+
+ def download_audio_direct(audio_name, url):
+     """Direct download method for a specific URL"""
+     audio_folder = create_audio_folder()
+     sanitized_name = sanitize_filename(audio_name)
+
+     ydl_opts = {
+         'format': 'bestaudio/best',
+         'outtmpl': f'{audio_folder}/{sanitized_name}.%(ext)s',
+         'postprocessors': [{
+             'key': 'FFmpegExtractAudio',
+             'preferredcodec': 'mp3',
+             'preferredquality': '192',
+         }],
+     }
+     if os.path.dirname(FFMPEG_PATH):
+         # Only needed for the bundled Windows build; system ffmpeg is found on PATH
+         ydl_opts['ffmpeg_location'] = os.path.dirname(FFMPEG_PATH)
+
+     try:
+         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+             ydl.download([url])
+         # Return the actual converted file path rather than the output template
+         mp3_file = os.path.join(audio_folder, f"{sanitized_name}.mp3")
+         return mp3_file if os.path.exists(mp3_file) else False
+     except Exception as e:
+         print(f"❌ Error: {e}")
+         return False
+
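
For reference, a minimal usage sketch of `main_sound` from a hypothetical caller (the module itself defines no `__main__` entry point):

# Hypothetical caller, e.g. the agent/FastAPI side; not part of this commit.
from sound_agent import main_sound

audio_path = main_sound("footsteps on wet pavement")
if audio_path:
    print(f"Ready to mix: {audio_path}")
else:
    print("Download failed; try a more specific sound description.")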