Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- README.md +365 -13
- download_images.py +220 -0
- photos_url.csv +0 -0
- requirements.txt +5 -0
- start_app.py +398 -0
README.md
CHANGED
|
@@ -1,19 +1,371 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
- streamlit
|
| 10 |
pinned: false
|
| 11 |
-
|
| 12 |
---
|
| 13 |
|
| 14 |
-
# Welcome to Streamlit!
|
| 15 |
|
| 16 |
-
Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Visual Search System
|
| 3 |
+
emoji: π
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: "1.37.0"
|
| 8 |
+
app_file: app.py
|
|
|
|
| 9 |
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
---
|
| 12 |
|
|
|
|
| 13 |
|
|
|
|
| 14 |
|
| 15 |
+
|
| 16 |
+
# Modern Visual Search System (CLIP-based)
|
| 17 |
+
|
| 18 |
+
## Overview
|
| 19 |
+
|
| 20 |
+
This project implements a modern, scalable visual search system using OpenAI's CLIP model. It provides both a REST API (FastAPI) and a web interface (Streamlit) for searching large image datasets using natural language queries and image queries, returning the top 5 most visually relevant images.
|
| 21 |
+
|
| 22 |
+
## Features
|
| 23 |
+
|
| 24 |
+
- **REST API**: FastAPI backend with comprehensive endpoints
|
| 25 |
+
- **Web Interface**: Streamlit frontend with intuitive UI
|
| 26 |
+
- **Text Search**: Search images using natural language descriptions
|
| 27 |
+
- **Image Search**: Find similar images by providing an image as query
|
| 28 |
+
- **Image Upload**: Upload images to find similar images in the dataset
|
| 29 |
+
- **Top 5 Results**: Always returns the 5 most similar images
|
| 30 |
+
- **Fast Search**: Uses precomputed embeddings for optimal performance
|
| 31 |
+
- **GPU Acceleration**: Automatic GPU detection and utilization
|
| 32 |
+
- **API Documentation**: Auto-generated Swagger/OpenAPI docs
|
| 33 |
+
|
| 34 |
+
## Architecture
|
| 35 |
+
|
| 36 |
+
```
|
| 37 |
+
βββββββββββββββββββ βββββββββββββββββββ βββββββββββββββββββ
|
| 38 |
+
β Streamlit β β FastAPI β β CLIP Model β
|
| 39 |
+
β Frontend βββββΊβ Backend βββββΊβ (Encoder) β
|
| 40 |
+
β (Port 8501) β β (Port 8000) β β β
|
| 41 |
+
βββββββββββββββββββ βββββββββββββββββββ βββββββββββββββββββ
|
| 42 |
+
β
|
| 43 |
+
βΌ
|
| 44 |
+
βββββββββββββββββββ
|
| 45 |
+
β Image β
|
| 46 |
+
β Database β
|
| 47 |
+
β (images/) β
|
| 48 |
+
βββββββββββββββββββ
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
## Technical Details
|
| 52 |
+
|
| 53 |
+
- **Image & Text Features**: CLIP (ViT-B-32, openai weights)
|
| 54 |
+
- **Search Algorithm**: Cosine similarity in CLIP embedding space
|
| 55 |
+
- **Embedding Dimension**: 512 (default for ViT-B-32)
|
| 56 |
+
- **Performance**: Fast, GPU-accelerated if available
|
| 57 |
+
- **Dependencies**: `open-clip-torch`, `torch`, `fastapi`, `streamlit`, `Pillow`, `numpy`
|
| 58 |
+
|
| 59 |
+
## Quick Start
|
| 60 |
+
|
| 61 |
+
### 1. Install Dependencies
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
pip install -r requirements.txt
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
### 2. Prepare Your Images
|
| 68 |
+
|
| 69 |
+
The application will automatically check and download images for you:
|
| 70 |
+
|
| 71 |
+
```bash
|
| 72 |
+
# Check image status
|
| 73 |
+
python start_app.py --check-only
|
| 74 |
+
|
| 75 |
+
# Download missing images (if any)
|
| 76 |
+
python start_app.py --download-images
|
| 77 |
+
|
| 78 |
+
# Or let the app handle everything automatically
|
| 79 |
+
python start_app.py
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
**Automatic Features:**
|
| 83 |
+
- β
Checks if images are already downloaded
|
| 84 |
+
- β
Downloads missing images in parallel (up to 20 workers)
|
| 85 |
+
- β
Skips existing images to save time
|
| 86 |
+
- β
Shows progress and statistics
|
| 87 |
+
- β
Handles errors gracefully
|
| 88 |
+
- β
**Smart startup**: Allows application to start with sufficient images (default: 1000+)
|
| 89 |
+
- β
**Flexible requirements**: Can adjust minimum image requirements
|
| 90 |
+
- β
**Graceful degradation**: Works even with partial image database
|
| 91 |
+
|
| 92 |
+
### 3. Start the Application
|
| 93 |
+
|
| 94 |
+
#### Option A: Start Both Services (Recommended)
|
| 95 |
+
```bash
|
| 96 |
+
python start_app.py
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
This will:
|
| 100 |
+
1. Check dependencies
|
| 101 |
+
2. Verify/download images automatically
|
| 102 |
+
3. Start both the FastAPI backend and Streamlit frontend
|
| 103 |
+
|
| 104 |
+
#### Command Line Options
|
| 105 |
+
|
| 106 |
+
```bash
|
| 107 |
+
# Basic startup (recommended)
|
| 108 |
+
python start_app.py
|
| 109 |
+
|
| 110 |
+
# Skip automatic image download
|
| 111 |
+
python start_app.py --skip-download
|
| 112 |
+
|
| 113 |
+
# Force download images even if some exist
|
| 114 |
+
python start_app.py --download-images
|
| 115 |
+
|
| 116 |
+
# Use custom number of parallel workers
|
| 117 |
+
python start_app.py --max-workers 10
|
| 118 |
+
|
| 119 |
+
# Use custom images directory
|
| 120 |
+
python start_app.py --images-dir my_images
|
| 121 |
+
|
| 122 |
+
# Set custom minimum image requirement
|
| 123 |
+
python start_app.py --min-images 500
|
| 124 |
+
|
| 125 |
+
# Only check image status, don't start services
|
| 126 |
+
python start_app.py --check-only
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
#### Option B: Start Services Separately
|
| 130 |
+
|
| 131 |
+
**Start FastAPI Backend:**
|
| 132 |
+
```bash
|
| 133 |
+
cd api && python main.py
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
**Start Streamlit Frontend (in another terminal):**
|
| 137 |
+
```bash
|
| 138 |
+
streamlit run streamlit_app.py
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
#### Option C: Download Images Only
|
| 142 |
+
|
| 143 |
+
```bash
|
| 144 |
+
# Download all images
|
| 145 |
+
python download_images.py
|
| 146 |
+
|
| 147 |
+
# Download with custom settings
|
| 148 |
+
python download_images.py --max-workers 15 --output-dir images
|
| 149 |
+
|
| 150 |
+
# Check status only
|
| 151 |
+
python download_images.py --check-only
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
### 4. Access the Application
|
| 155 |
+
|
| 156 |
+
- **Web Interface**: http://localhost:8501
|
| 157 |
+
- **API Documentation**: http://localhost:8000/docs
|
| 158 |
+
- **API Health Check**: http://localhost:8000/api/health
|
| 159 |
+
|
| 160 |
+
## API Endpoints
|
| 161 |
+
|
| 162 |
+
### Core Endpoints
|
| 163 |
+
|
| 164 |
+
| Endpoint | Method | Description |
|
| 165 |
+
|----------|--------|-------------|
|
| 166 |
+
| `/` | GET | Root endpoint with API info |
|
| 167 |
+
| `/api/health` | GET | Health check |
|
| 168 |
+
| `/api/info` | GET | System information |
|
| 169 |
+
| `/api/images` | GET | List available images |
|
| 170 |
+
|
| 171 |
+
### Search Endpoints
|
| 172 |
+
|
| 173 |
+
| Endpoint | Method | Description |
|
| 174 |
+
|----------|--------|-------------|
|
| 175 |
+
| `/api/search/text` | POST | Text-based search |
|
| 176 |
+
| `/api/search/image` | POST | Image-based search |
|
| 177 |
+
| `/api/search/image/upload` | POST | Upload image for search |
|
| 178 |
+
|
| 179 |
+
### Example API Usage
|
| 180 |
+
|
| 181 |
+
#### Text Search
|
| 182 |
+
```bash
|
| 183 |
+
curl -X POST "http://localhost:8000/api/search/text" \
|
| 184 |
+
-H "Content-Type: application/json" \
|
| 185 |
+
-d '{"query": "red car", "top_k": 5}'
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
#### Image Search
|
| 189 |
+
```bash
|
| 190 |
+
curl -X POST "http://localhost:8000/api/search/image" \
|
| 191 |
+
-H "Content-Type: application/json" \
|
| 192 |
+
-d '{"image_path": "images/0001.jpg", "top_k": 5}'
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
#### Image Upload Search
|
| 196 |
+
```bash
|
| 197 |
+
curl -X POST "http://localhost:8000/api/search/image/upload" \
|
| 198 |
+
-F "file=@your_image.jpg" \
|
| 199 |
+
-F "top_k=5"
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
## Web Interface Features
|
| 203 |
+
|
| 204 |
+
### Text Search
|
| 205 |
+
- Enter natural language descriptions
|
| 206 |
+
- Examples: "a cat sitting", "red car", "person walking", "building with windows"
|
| 207 |
+
- Returns images that best match the text description
|
| 208 |
+
|
| 209 |
+
### Image Search
|
| 210 |
+
- Select an image from your dataset
|
| 211 |
+
- Finds images visually similar to the selected image
|
| 212 |
+
- Excludes the query image from results
|
| 213 |
+
|
| 214 |
+
### Image Upload Search
|
| 215 |
+
- Upload any image file (JPG, PNG, BMP, TIFF)
|
| 216 |
+
- Find similar images in your dataset
|
| 217 |
+
- Real-time processing and results
|
| 218 |
+
|
| 219 |
+
### Results Display
|
| 220 |
+
- **Ranking**: 1-5 (top 5 results)
|
| 221 |
+
- **Filename**: Name of the image file
|
| 222 |
+
- **Path**: Full path to the image
|
| 223 |
+
- **Similarity Score**: Percentage indicating how well the image matches the query
|
| 224 |
+
- **Image Preview**: Visual display of results
|
| 225 |
+
|
| 226 |
+
## Testing
|
| 227 |
+
|
| 228 |
+
### Test the API
|
| 229 |
+
```bash
|
| 230 |
+
python test_api.py
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
### Test the CLI (Legacy)
|
| 234 |
+
```bash
|
| 235 |
+
python test_search.py
|
| 236 |
+
```
|
| 237 |
+
|
| 238 |
+
### Run Demo
|
| 239 |
+
```bash
|
| 240 |
+
python demo_search.py
|
| 241 |
+
```
|
| 242 |
+
|
| 243 |
+
## Development
|
| 244 |
+
|
| 245 |
+
### Project Structure
|
| 246 |
+
```
|
| 247 |
+
visual-search-system/
|
| 248 |
+
βββ api/
|
| 249 |
+
β βββ main.py # FastAPI backend
|
| 250 |
+
βββ src/
|
| 251 |
+
β βββ models/
|
| 252 |
+
β βββ multimodal_encoder.py # CLIP encoder
|
| 253 |
+
βββ images/ # Image database
|
| 254 |
+
βββ models/
|
| 255 |
+
β βββ clip_index/ # Precomputed embeddings
|
| 256 |
+
βββ streamlit_app.py # Streamlit frontend
|
| 257 |
+
βββ start_app.py # Startup script
|
| 258 |
+
βββ test_api.py # API tests
|
| 259 |
+
βββ run_search.py # CLI interface (legacy)
|
| 260 |
+
βββ requirements.txt # Dependencies
|
| 261 |
+
```
|
| 262 |
+
|
| 263 |
+
### Adding New Features
|
| 264 |
+
|
| 265 |
+
1. **New API Endpoints**: Add to `api/main.py`
|
| 266 |
+
2. **New UI Components**: Add to `streamlit_app.py`
|
| 267 |
+
3. **New Search Methods**: Extend `src/models/multimodal_encoder.py`
|
| 268 |
+
|
| 269 |
+
## Performance
|
| 270 |
+
|
| 271 |
+
- **Precomputed Embeddings**: Uses existing embeddings in `models/clip_index/` for faster search
|
| 272 |
+
- **Real-time Fallback**: Falls back to real-time encoding if precomputed embeddings aren't available
|
| 273 |
+
- **GPU Acceleration**: Automatically uses GPU if available for faster processing
|
| 274 |
+
- **Memory Efficient**: Processes images in batches to manage memory usage
|
| 275 |
+
- **Async Processing**: FastAPI handles concurrent requests efficiently
|
| 276 |
+
|
| 277 |
+
## Deployment
|
| 278 |
+
|
| 279 |
+
### Hugging Face Spaces (Recommended)
|
| 280 |
+
|
| 281 |
+
This application is designed to work seamlessly on Hugging Face Spaces:
|
| 282 |
+
|
| 283 |
+
1. **Fork this repository** to your Hugging Face account
|
| 284 |
+
2. **Create a new Space** on Hugging Face Spaces
|
| 285 |
+
3. **Connect your forked repository** to the Space
|
| 286 |
+
4. **The application will automatically**:
|
| 287 |
+
- Check for downloaded images
|
| 288 |
+
- Download missing images in parallel
|
| 289 |
+
- Start the Streamlit interface
|
| 290 |
+
- Handle all dependencies automatically
|
| 291 |
+
|
| 292 |
+
**Features for Hugging Face Spaces:**
|
| 293 |
+
- β
Automatic image downloading with progress tracking
|
| 294 |
+
- β
Optimized for Spaces environment (reduced parallel workers)
|
| 295 |
+
- β
Built-in error handling and recovery
|
| 296 |
+
- β
No manual setup required
|
| 297 |
+
- β
Works with the provided `photos_url.csv` dataset
|
| 298 |
+
|
| 299 |
+
**Space Configuration:**
|
| 300 |
+
- **SDK**: Streamlit
|
| 301 |
+
- **Hardware**: CPU Basic (free tier) or GPU for faster processing
|
| 302 |
+
- **App File**: `app.py`
|
| 303 |
+
|
| 304 |
+
### Docker Deployment
|
| 305 |
+
|
| 306 |
+
Create a `Dockerfile`:
|
| 307 |
+
|
| 308 |
+
```Dockerfile
|
| 309 |
+
FROM python:3.9-slim
|
| 310 |
+
|
| 311 |
+
WORKDIR /app
|
| 312 |
+
|
| 313 |
+
# Install system dependencies
|
| 314 |
+
RUN apt-get update && apt-get install -y \
|
| 315 |
+
libgl1-mesa-glx \
|
| 316 |
+
libglib2.0-0 \
|
| 317 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 318 |
+
|
| 319 |
+
# Copy requirements and install Python dependencies
|
| 320 |
+
COPY requirements.txt .
|
| 321 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 322 |
+
|
| 323 |
+
# Copy application code
|
| 324 |
+
COPY . .
|
| 325 |
+
|
| 326 |
+
# Expose ports
|
| 327 |
+
EXPOSE 8000 8501
|
| 328 |
+
|
| 329 |
+
# Start the application
|
| 330 |
+
CMD ["python", "start_app.py"]
|
| 331 |
+
```
|
| 332 |
+
|
| 333 |
+
Build and run:
|
| 334 |
+
```bash
|
| 335 |
+
docker build -t visual-search .
|
| 336 |
+
docker run -p 8000:8000 -p 8501:8501 -v $(pwd)/images:/app/images visual-search
|
| 337 |
+
```
|
| 338 |
+
|
| 339 |
+
### Production Deployment
|
| 340 |
+
|
| 341 |
+
For production, consider:
|
| 342 |
+
- Using a production ASGI server like Gunicorn
|
| 343 |
+
- Setting up proper CORS configuration
|
| 344 |
+
- Implementing authentication
|
| 345 |
+
- Using a reverse proxy (nginx)
|
| 346 |
+
- Setting up monitoring and logging
|
| 347 |
+
|
| 348 |
+
## Troubleshooting
|
| 349 |
+
|
| 350 |
+
### Common Issues
|
| 351 |
+
|
| 352 |
+
- **API Connection Failed**: Make sure the FastAPI server is running on port 8000
|
| 353 |
+
- **No images found**: Ensure images are in the `images/` directory with `.jpg` extension
|
| 354 |
+
- **Model loading errors**: Check that all dependencies are installed correctly
|
| 355 |
+
- **GPU issues**: Install appropriate CUDA version for your GPU
|
| 356 |
+
- **Memory errors**: Reduce batch size or use CPU-only mode
|
| 357 |
+
- **Port conflicts**: Change ports in `start_app.py` if needed
|
| 358 |
+
|
| 359 |
+
### Debug Mode
|
| 360 |
+
|
| 361 |
+
Run with debug information:
|
| 362 |
+
```bash
|
| 363 |
+
# API with debug logging
|
| 364 |
+
cd api && uvicorn main:app --reload --log-level debug
|
| 365 |
+
|
| 366 |
+
# Streamlit with debug
|
| 367 |
+
streamlit run streamlit_app.py --logger.level debug
|
| 368 |
+
```
|
| 369 |
+
|
| 370 |
+
## License
|
| 371 |
+
MIT
|
download_images.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Image Downloader for Photo Dataset
|
| 3 |
+
--------------------------------
|
| 4 |
+
This script downloads and optimizes images from URLs in a photos.csv file with parallel processing.
|
| 5 |
+
|
| 6 |
+
Requirements:
|
| 7 |
+
pip install pandas pillow requests tqdm   (concurrent.futures is part of the standard library)
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
1. Ensure photos_url.csv is in the same directory as this script
|
| 11 |
+
2. Run the script: python download_images.py
|
| 12 |
+
3. Images will be downloaded to the 'images' folder
|
| 13 |
+
|
| 14 |
+
Note:
|
| 15 |
+
- Default image size is 800x800 pixels (maintains aspect ratio)
|
| 16 |
+
- Images are saved as optimized JPEGs
|
| 17 |
+
- You can modify num_images parameter to download fewer images
|
| 18 |
+
- The full dataset is approximately 1.5 GB and contains about 25,000 images
|
| 19 |
+
- Uses parallel downloading for maximum efficiency
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import pandas as pd
|
| 23 |
+
import requests
|
| 24 |
+
import os
|
| 25 |
+
from PIL import Image
|
| 26 |
+
from io import BytesIO
|
| 27 |
+
from tqdm import tqdm
|
| 28 |
+
import concurrent.futures
|
| 29 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 30 |
+
import time
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
|
| 33 |
+
def download_single_image(args):
    """
    Download one image URL and save it as an optimized JPEG.

    Args:
        args: Tuple of (idx, url, output_path, target_size).

    Returns:
        Tuple of (success, idx, error_message). error_message is None on a
        fresh successful download and "Already exists" when skipped.
    """
    idx, url, output_path, target_size = args

    try:
        # Skip all network work if the file is already on disk.
        if os.path.exists(output_path):
            return True, idx, "Already exists"

        # Note: the previous stream=True was pointless because .content
        # loads the whole body anyway (and can leak the connection).
        response = requests.get(url, timeout=15)
        if response.status_code == 200:
            img = Image.open(BytesIO(response.content))
            # JPEG cannot store alpha/palette modes; normalize anything that
            # is not already RGB (covers RGBA, P, LA, CMYK, ...), where the
            # old RGBA/P-only check would make img.save() raise.
            if img.mode != 'RGB':
                img = img.convert('RGB')
            # thumbnail() resizes in place and preserves aspect ratio.
            img.thumbnail(target_size, Image.Resampling.LANCZOS)
            img.save(output_path, 'JPEG', quality=85, optimize=True)
            return True, idx, None
        return False, idx, f"HTTP {response.status_code}"

    except Exception as e:
        # Broad catch is deliberate: one bad URL must not kill the pool.
        return False, idx, str(e)
|
| 64 |
+
|
| 65 |
+
def check_images_downloaded(output_dir="images", expected_count=None):
    """
    Check how many images have already been downloaded into *output_dir*.

    Args:
        output_dir: Directory to scan for .jpg files.
        expected_count: Expected number of images; if None it is derived
            from photos_url.csv (falling back to the current count when
            the CSV is missing or unreadable).

    Returns:
        Tuple of (is_complete, current_count, missing_count).
    """
    images_dir = Path(output_dir)
    if not images_dir.exists():
        return False, 0, expected_count or 0

    # Count existing images
    current_count = len(list(images_dir.glob("*.jpg")))

    if expected_count is None:
        # Derive the expectation from the URL manifest when available.
        try:
            expected_count = len(pd.read_csv("photos_url.csv"))
        except Exception:
            # Narrowed from a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.
            expected_count = current_count

    missing_count = max(0, expected_count - current_count)
    is_complete = missing_count == 0

    return is_complete, current_count, missing_count
|
| 96 |
+
|
| 97 |
+
def download_images(num_images=None, output_dir="images", target_size=(800, 800), max_workers=20):
    """
    Download and optimize images listed in photos_url.csv, in parallel.

    Args:
        num_images: Number of images to download (default: all rows in the CSV).
        output_dir: Directory to save images into (created if missing).
        target_size: Maximum image dimensions; aspect ratio is preserved.
        max_workers: Number of parallel download threads.

    Returns:
        True when the dataset is (at least partially) downloaded,
        False when no progress could be made.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Read CSV and optionally limit the number of rows to process.
    df = pd.read_csv("photos_url.csv")
    if num_images:
        df = df.head(num_images)

    total_images = len(df)
    print(f"Total images to process: {total_images:,}")

    # Check existing images first so a completed dataset is a no-op.
    is_complete, current_count, missing_count = check_images_downloaded(output_dir, total_images)

    if is_complete:
        print(f"All {current_count:,} images are already downloaded!")
        return True

    print(f"Found {current_count:,} existing images, need to download {missing_count:,} more")

    # Build tasks only for files that are not on disk yet; filenames are
    # 1-based, zero-padded row indices (0001.jpg, 0002.jpg, ...).
    download_tasks = []
    for idx, row in df.iterrows():
        output_path = os.path.join(output_dir, f"{(idx + 1):04d}.jpg")
        if not os.path.exists(output_path):
            download_tasks.append((idx, row['photo_image_url'], output_path, target_size))

    if not download_tasks:
        print("All images are already downloaded!")
        return True

    print(f"Starting parallel download of {len(download_tasks):,} missing images with {max_workers} workers...")
    start_time = time.time()

    successful_downloads = 0
    failed_downloads = 0
    skipped_downloads = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit every task up front; results are consumed as they finish.
        future_to_task = {executor.submit(download_single_image, task): task for task in download_tasks}

        with tqdm(total=len(download_tasks), desc="Downloading images") as pbar:
            for future in as_completed(future_to_task):
                success, idx, error = future.result()

                if success:
                    # "Already exists" can only occur here via a race with
                    # another process writing the same file.
                    if error == "Already exists":
                        skipped_downloads += 1
                    else:
                        successful_downloads += 1
                else:
                    failed_downloads += 1
                    # Suppress the very common 404 / DNS failures to keep
                    # the progress output readable.
                    if error and "404" not in str(error) and "NameResolutionError" not in str(error):
                        print(f"Failed to download image {idx + 1}: {error}")

                pbar.update(1)

    duration = time.time() - start_time

    print("\nDownload Summary:")
    print(f"  New downloads: {successful_downloads:,}")
    print(f"  Skipped (already exist): {skipped_downloads:,}")
    print(f"  Failed: {failed_downloads:,}")
    print(f"  Duration: {duration:.1f} seconds")
    if duration > 0:
        print(f"  Speed: {successful_downloads / duration:.1f} images/second")

    # Re-check coverage; >= 95% counts as success so that a handful of
    # dead URLs does not block application startup.
    is_complete, final_count, final_missing = check_images_downloaded(output_dir, total_images)

    if final_count >= total_images * 0.95:
        print(f"Download completed! Now have {final_count:,} images ({final_missing:,} missing)")
        return True
    if final_count > current_count:
        print(f"Download partially successful! Now have {final_count:,} images ({final_missing:,} missing)")
        return True
    print(f"Download had issues. Still have {final_count:,} images ({final_missing:,} missing)")
    return False
|
| 194 |
+
|
| 195 |
+
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Download images with parallel processing")
    parser.add_argument("--num-images", type=int, default=None, help="Number of images to download (default: all)")
    parser.add_argument("--output-dir", type=str, default="images", help="Output directory (default: images)")
    parser.add_argument("--max-workers", type=int, default=20, help="Maximum parallel workers (default: 20)")
    parser.add_argument("--check-only", action="store_true", help="Only check if images are downloaded")

    args = parser.parse_args()

    if args.check_only:
        # Report status without downloading anything.
        is_complete, current_count, missing_count = check_images_downloaded(args.output_dir)
        if is_complete:
            print(f"All {current_count:,} images are downloaded!")
        else:
            print(f"Status: {current_count:,} downloaded, {missing_count:,} missing")
    else:
        # Download images and propagate success as the process exit code.
        success = download_images(
            num_images=args.num_images,
            output_dir=args.output_dir,
            max_workers=args.max_workers,
        )
        # SystemExit instead of the site-provided exit() builtin, which is
        # not guaranteed to be present (e.g. under `python -S`).
        raise SystemExit(0 if success else 1)
|
photos_url.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.28.0
|
| 2 |
+
pandas>=1.5.0
|
| 3 |
+
requests>=2.28.0
|
| 4 |
+
Pillow>=9.0.0
|
| 5 |
+
tqdm>=4.64.0
|
start_app.py
ADDED
|
@@ -0,0 +1,398 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Visual Search System - Complete Streamlit App
|
| 4 |
+
============================================
|
| 5 |
+
|
| 6 |
+
A comprehensive Streamlit application that:
|
| 7 |
+
1. Automatically installs required dependencies
|
| 8 |
+
2. Downloads images from photos_url.csv if needed
|
| 9 |
+
3. Provides a clean UI for searching and viewing images
|
| 10 |
+
4. Supports both search by ID and range by block functionality
|
| 11 |
+
|
| 12 |
+
Requirements:
|
| 13 |
+
- photos_url.csv: Contains image URLs
|
| 14 |
+
- download_images.py: Contains parallel downloading logic
|
| 15 |
+
- images/ folder: Will be created and populated with downloaded images
|
| 16 |
+
|
| 17 |
+
Usage:
|
| 18 |
+
streamlit run start_app.py
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import os
|
| 22 |
+
import sys
|
| 23 |
+
import subprocess
|
| 24 |
+
import importlib
|
| 25 |
+
from pathlib import Path
|
| 26 |
+
import pandas as pd
|
| 27 |
+
import streamlit as st
|
| 28 |
+
from typing import List, Tuple, Optional
|
| 29 |
+
import time
|
| 30 |
+
|
| 31 |
+
# Configuration
|
| 32 |
+
# Import names probed at startup. NOTE: "PIL" is the *import* name of the
# pip distribution "Pillow" — pip cannot install a package literally
# called "PIL" (see check_and_install_dependencies).
REQUIRED_PACKAGES = [
    "streamlit",
    "pandas",
    "requests",
    "PIL",
    "tqdm"
]

IMAGES_DIR = "images"                    # where downloaded photos are stored
CSV_FILE = "photos_url.csv"              # manifest of image URLs
DOWNLOAD_SCRIPT = "download_images.py"   # helper providing parallel downloads
MAX_DISPLAY_IMAGES = 500                 # safety cap on images rendered at once
IMAGES_PER_BLOCK = 100                   # page size for "range by block" browsing
TOTAL_BLOCKS = 250                       # 250 blocks x 100 images = 25,000 photos
|
| 46 |
+
|
| 47 |
+
def install_package(package: str) -> bool:
    """Install *package* with pip inside the current interpreter.

    Args:
        package: Name of the package to install.

    Returns:
        True on success, False if pip exited with a non-zero status.
    """
    cmd = [sys.executable, "-m", "pip", "install", package]
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError:
        return False
    return True
|
| 62 |
+
|
| 63 |
+
def check_and_install_dependencies() -> bool:
    """Ensure every module in REQUIRED_PACKAGES is importable, installing missing ones.

    REQUIRED_PACKAGES holds *import* names, but pip sometimes uses a different
    distribution name (the "PIL" module ships in the "Pillow" package), so
    missing modules are translated before being handed to pip.

    Returns:
        True if all dependencies are importable (after any installs), False otherwise.
    """
    # Import name -> pip distribution name, for the cases where they differ.
    pip_names = {"PIL": "Pillow"}

    print("🔍 Checking dependencies...")

    missing_packages = []
    for package in REQUIRED_PACKAGES:
        try:
            importlib.import_module(package)
            print(f"✅ {package} is already installed")
        except ImportError:
            print(f"📦 Installing {package}...")
            missing_packages.append(package)

    if missing_packages:
        print(f"🔄 Installing {len(missing_packages)} missing packages...")

        for package in missing_packages:
            print(f"📥 Installing {package}...")
            # BUG FIX: `pip install PIL` fails — install under the pip
            # distribution name while keeping the import name for checks.
            if install_package(pip_names.get(package, package)):
                print(f"✅ Successfully installed {package}")
            else:
                print(f"❌ Failed to install {package}")
                return False

        # Verify the freshly installed modules actually import now.
        for package in missing_packages:
            try:
                importlib.import_module(package)
                print(f"✅ {package} verified after installation")
            except ImportError:
                print(f"❌ {package} still not available after installation")
                return False

    print("✅ All dependencies are available!")
    return True
def check_images_status() -> Tuple[bool, int, int]:
    """Report download progress of the local image cache.

    Returns:
        Tuple of (is_complete, current_count, total_count): is_complete is
        True once at least 95% of the CSV's images exist on disk;
        total_count is 0 when the CSV is missing, unreadable, or empty.
    """
    images_path = Path(IMAGES_DIR)

    if not images_path.exists():
        return False, 0, 0

    # Count images already present on disk.
    current_count = len(list(images_path.glob("*.jpg")))

    # The CSV defines how many images the full dataset contains.
    try:
        df = pd.read_csv(CSV_FILE)
        total_count = len(df)
    except Exception as e:
        print(f"❌ Error reading {CSV_FILE}: {e}")
        return False, current_count, 0

    # BUG FIX: an empty CSV previously made `0 >= 0 * 0.95` report the cache
    # as complete even with zero images downloaded.
    if total_count == 0:
        return False, current_count, 0

    # Consider the cache complete once 95%+ of the images are present.
    is_complete = current_count >= total_count * 0.95

    return is_complete, current_count, total_count
def download_images_if_needed() -> bool:
    """Populate the local image cache, downloading only when it is incomplete.

    Returns:
        True if images are available (or the downloader ran), False when the
        downloader itself raised an error.
    """
    print("🔍 Checking image status...")

    is_complete, current_count, total_count = check_images_status()

    # Fast path: cache already complete, nothing to download.
    if is_complete:
        print(f"✅ Images are ready! Have {current_count:,} of {total_count:,} images")
        return True

    print(f"📥 Images incomplete: {current_count:,} of {total_count:,} available")
    print("🔄 Starting image download...")

    try:
        # Pull in the parallel downloader that lives next to this script.
        sys.path.append('.')
        from download_images import download_images

        success = download_images(
            num_images=None,  # Download all images
            output_dir=IMAGES_DIR,
            max_workers=20,
        )
    except Exception as e:
        print(f"❌ Error during image download: {e}")
        return False

    if success:
        print("✅ Image download completed successfully!")
    else:
        # Partial failures are tolerated; the UI can run with what we have.
        print("⚠️ Image download had some issues, but continuing...")
    return True
def get_image_path(image_id: str) -> Optional[str]:
    """Resolve an image ID to its file path inside IMAGES_DIR.

    Args:
        image_id: Image ID, either numeric ("1", "0001") or a raw filename stem.

    Returns:
        Path to the .jpg file if it exists on disk, otherwise None.
    """
    try:
        # Numeric IDs are stored zero-padded to four digits ("12" -> "0012.jpg").
        if image_id.isdigit():
            filename = f"{int(image_id):04d}.jpg"
        else:
            filename = f"{image_id}.jpg"

        image_path = os.path.join(IMAGES_DIR, filename)

        return image_path if os.path.exists(image_path) else None
    # BUG FIX: was a bare `except:` that swallowed every exception, including
    # KeyboardInterrupt/SystemExit; only value/filesystem errors are expected.
    except (ValueError, OSError):
        return None
def get_block_images(block_number: int) -> List[str]:
    """Collect the on-disk image paths belonging to one block.

    Args:
        block_number: 1-based block index (1..TOTAL_BLOCKS); each block
            covers IMAGES_PER_BLOCK consecutive image IDs.

    Returns:
        Paths of every image in the block that exists on disk (possibly empty);
        an out-of-range block number yields an empty list.
    """
    if block_number < 1 or block_number > TOTAL_BLOCKS:
        return []

    # ID range covered by this block, inclusive on both ends.
    first_id = (block_number - 1) * IMAGES_PER_BLOCK + 1
    last_id = block_number * IMAGES_PER_BLOCK

    candidates = (get_image_path(str(i)) for i in range(first_id, last_id + 1))
    return [path for path in candidates if path]
def search_images_by_id(search_id: str) -> List[str]:
    """Search the image cache by (possibly partial) numeric ID.

    Args:
        search_id: Search term; a full ID ("0012"), a partial one ("12"),
            or empty/whitespace to browse the first images.

    Returns:
        Up to MAX_DISPLAY_IMAGES matching image paths, exact match first.
    """
    if not search_id.strip():
        # No search term: return the first MAX_DISPLAY_IMAGES available images.
        # (Resolve each ID once instead of twice as the old code did.)
        candidates = (get_image_path(str(i)) for i in range(1, MAX_DISPLAY_IMAGES + 1))
        return [path for path in candidates if path]

    matching_paths = []

    # Exact match first so it always tops the result list.
    exact_path = get_image_path(search_id)
    if exact_path:
        matching_paths.append(exact_path)

    needle = search_id.lower()
    for i in range(1, 25001):  # Total images in dataset
        image_path = get_image_path(str(i))
        # BUG FIX: also match against the zero-padded form used in filenames,
        # so padded queries like "0012" produce partial matches too (previously
        # only the unpadded str(i) was compared).
        if image_path and (needle in str(i) or needle in f"{i:04d}"):
            if image_path not in matching_paths:
                matching_paths.append(image_path)
                if len(matching_paths) >= MAX_DISPLAY_IMAGES:
                    break

    return matching_paths
def display_image_grid(image_paths: List[str], title: str):
    """Render a 3-column grid of images in the Streamlit page.

    Args:
        image_paths: Image file paths to display.
        title: Heading shown above the grid.
    """
    if not image_paths:
        st.warning("No images found matching your criteria.")
        return

    st.subheader(f"{title} ({len(image_paths)} images)")

    # Fixed 3-column layout; images are assigned round-robin.
    cols = st.columns(3)

    for idx, image_path in enumerate(image_paths):
        with cols[idx % 3]:
            try:
                # FIX: `use_column_width` is deprecated (and removed in recent
                # Streamlit releases); `use_container_width` is the supported
                # equivalent.
                st.image(
                    image_path,
                    caption=f"Image {os.path.basename(image_path)}",
                    use_container_width=True,
                )
            except Exception as e:
                st.error(f"Error loading image: {e}")
def main():
    """Main Streamlit application: page setup, search UI, and image display."""

    # Page configuration (must run before any other Streamlit call).
    st.set_page_config(
        page_title="Visual Search System",
        page_icon="🔍",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    # Main title
    st.title("🔍 Visual Search System")
    st.markdown("---")

    # Sidebar lets the user pick between the two browse modes.
    st.sidebar.header("Navigation")
    search_option = st.sidebar.selectbox(
        "Choose search method:",
        ["Search by ID", "Range by Block"]
    )

    # Main content area
    if search_option == "Search by ID":
        st.header("🔍 Search Images by ID")

        # Search input
        search_id = st.text_input(
            "Enter image ID (e.g., '0001', '1234') or leave empty to see first 500 images:",
            placeholder="Enter ID or leave empty",
            help="Enter a specific image ID or leave empty to browse the first 500 images"
        )

        # Run the search on button press, or automatically once text is typed.
        if st.button("🔍 Search", type="primary") or search_id != "":
            with st.spinner("Searching images..."):
                matching_images = search_images_by_id(search_id)

            if matching_images:
                display_image_grid(
                    matching_images,
                    f"Showing {len(matching_images)} matching images"
                )
            else:
                st.info("No images found matching your search criteria.")

    else:  # Range by Block
        st.header("📦 Browse Images by Block")

        # BUG FIX: the example previously claimed block 100 shows images
        # 10001-10100; block N spans ((N-1)*100 + 1)..(N*100), so block 100
        # actually covers 9901-10000.
        st.markdown(f"""
        **How it works:**
        - Each block contains **{IMAGES_PER_BLOCK} images**
        - Enter a number between **1 and {TOTAL_BLOCKS}**
        - Example: Enter **100** to see images **9901-10000**
        """)

        # Block input
        block_number = st.number_input(
            f"Enter block number (1-{TOTAL_BLOCKS}):",
            min_value=1,
            max_value=TOTAL_BLOCKS,
            value=1,
            step=1,
            help=f"Choose a block number from 1 to {TOTAL_BLOCKS}"
        )

        # Show which image IDs this block covers.
        start_num = (block_number - 1) * IMAGES_PER_BLOCK + 1
        end_num = block_number * IMAGES_PER_BLOCK

        st.info(f"**Block {block_number}**: Images {start_num:,} to {end_num:,}")

        # Get block images
        with st.spinner(f"Loading block {block_number}..."):
            block_images = get_block_images(block_number)

        if block_images:
            display_image_grid(
                block_images,
                f"Block {block_number} - Images {start_num:,} to {end_num:,}"
            )
        else:
            st.warning(f"No images found for block {block_number}.")

    # Footer
    st.markdown("---")
    st.markdown(
        "**Dataset Info:** 25,000+ high-quality images from Unsplash | "
        "Built with Streamlit and Python"
    )
def setup_and_run():
    """Prepare the environment (dependencies, images) and launch the UI."""
    print("🚀 Starting Visual Search System...")

    # Step 1: dependencies must be importable before anything else runs.
    if not check_and_install_dependencies():
        print("❌ Failed to install dependencies. Exiting.")
        sys.exit(1)
    print("✅ Dependencies ready!")

    # Step 2: make sure the image cache is populated.
    if not download_images_if_needed():
        print("❌ Failed to prepare images. Exiting.")
        sys.exit(1)
    print("✅ Images ready!")

    # Step 3: hand control to the Streamlit UI.
    print("🚀 Launching Streamlit app...")
    main()

if __name__ == "__main__":
    setup_and_run()
|