crosse712 committed
Commit 6bfe886 · 1 Parent(s): 37b127f

Add full deployment configuration with low-RAM optimization (3-8GB support)

Files changed (7)
  1. Dockerfile +61 -0
  2. README_HF.md +19 -0
  3. backend/Procfile +1 -0
  4. backend/README.md +28 -0
  5. backend/app.py +18 -0
  6. docker-compose.yml +39 -0
  7. fly.toml +48 -0
Dockerfile ADDED
@@ -0,0 +1,61 @@
+ # Multi-stage build for optimized image size
+ FROM python:3.9-slim as builder
+
+ WORKDIR /app
+
+ # Install build dependencies
+ RUN apt-get update && apt-get install -y \
+     gcc \
+     g++ \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy and install Python dependencies
+ COPY backend/requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Production stage
+ FROM python:3.9-slim
+
+ WORKDIR /app
+
+ # Install runtime dependencies (curl is needed for the HEALTHCHECK below)
+ RUN apt-get update && apt-get install -y \
+     libgomp1 \
+     libglib2.0-0 \
+     libsm6 \
+     libxext6 \
+     libxrender1 \
+     curl \
+     wget \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy Python packages from builder
+ COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages
+ COPY --from=builder /usr/local/bin /usr/local/bin
+
+ # Copy application code
+ COPY backend/ ./backend/
+
+ # Set environment variables for memory optimization
+ ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512
+ ENV OMP_NUM_THREADS=4
+ ENV MKL_NUM_THREADS=4
+ ENV NUMEXPR_NUM_THREADS=4
+ ENV TOKENIZERS_PARALLELISM=false
+
+ # Enable extreme memory optimization
+ ENV USE_EXTREME_OPTIMIZATION=true
+ ENV MAX_MEMORY_GB=3
+
+ WORKDIR /app/backend
+
+ # Expose port
+ EXPOSE 8000
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+     CMD curl -f http://localhost:8000/ || exit 1
+
+ # Start the application with memory-limited configuration
+ CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]
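
The ENV block above only sets the memory knobs; how they take effect depends on the backend code, which is not part of this commit. A minimal sketch, assuming the model is loaded through Hugging Face Transformers with bitsandbytes 4-bit quantization (the `load_model` helper and its defaults are hypothetical), of how `MAX_MEMORY_GB` and `USE_EXTREME_OPTIMIZATION` could be consumed:

```python
# Hypothetical sketch (not from this repo): translating the Dockerfile's
# memory env vars into Transformers loading limits.
import os

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

MAX_MEMORY_GB = int(os.environ.get("MAX_MEMORY_GB", "3"))
USE_EXTREME_OPTIMIZATION = os.environ.get("USE_EXTREME_OPTIMIZATION", "false").lower() == "true"


def load_model(model_id: str = "apple/FastVLM-7B"):
    # Cap how much RAM/VRAM accelerate may use so the container stays under its limit.
    max_memory = {"cpu": f"{MAX_MEMORY_GB}GiB"}
    if torch.cuda.is_available():
        max_memory[0] = f"{MAX_MEMORY_GB}GiB"

    # 4-bit quantization roughly quarters the weight footprint of a 7B model.
    quant_config = BitsAndBytesConfig(load_in_4bit=True) if USE_EXTREME_OPTIMIZATION else None

    return AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        max_memory=max_memory,
        quantization_config=quant_config,
        trust_remote_code=True,
    )
```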
README_HF.md ADDED
@@ -0,0 +1,19 @@
+ ---
+ title: FastVLM Screen Observer
+ emoji: 🖥️
+ colorFrom: blue
+ colorTo: purple
+ sdk: docker
+ sdk_version: "3.9"
+ app_port: 8000
+ pinned: false
+ models:
+   - apple/FastVLM-7B
+ suggested_hardware: t4-small
+ ---
+
+ # FastVLM Screen Observer
+
+ Real-time screen observation using FastVLM-7B with extreme memory optimization for 3-8GB RAM systems.
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
backend/Procfile ADDED
@@ -0,0 +1 @@
+ web: cd backend && python -m uvicorn app.main:app --host 0.0.0.0 --port $PORT --workers 1
backend/README.md ADDED
@@ -0,0 +1,28 @@
+ ---
+ title: FastVLM Screen Observer
+ emoji: 🖥️
+ colorFrom: blue
+ colorTo: green
+ sdk: docker
+ sdk_version: 3.9
+ app_file: app/main.py
+ pinned: false
+ models:
+   - apple/FastVLM-7B
+ ---
+
+ # FastVLM Screen Observer Backend
+
+ Real-time screen observation and analysis using Apple's FastVLM-7B model.
+
+ ## Requirements
+ - 14GB+ RAM for model weights
+ - GPU (CUDA or MPS) recommended
+ - Python 3.9+
+
+ ## API Endpoints
+ - `GET /` - Status check
+ - `POST /analyze` - Screen analysis
+ - `POST /demo` - Automation demo
+ - `GET /export` - Export logs
+ - `GET /logs/stream` - Stream logs via SSE
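
For reference, a minimal client sketch against the endpoints listed above; the `/analyze` payload fields and the response shapes are assumptions, not taken from `app/main.py`:

```python
# Hypothetical client for the endpoints listed in backend/README.md.
import json

import requests

BASE_URL = "http://localhost:8000"

# GET / - status check
print(requests.get(f"{BASE_URL}/").json())

# POST /analyze - trigger a screen analysis (payload fields are illustrative)
resp = requests.post(f"{BASE_URL}/analyze", json={"include_thumbnail": False})
print(json.dumps(resp.json(), indent=2))

# GET /logs/stream - consume the SSE log stream line by line
with requests.get(f"{BASE_URL}/logs/stream", stream=True) as stream:
    for line in stream.iter_lines():
        if line:
            print(line.decode("utf-8"))
```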
backend/app.py ADDED
@@ -0,0 +1,18 @@
+ """
+ Gradio wrapper for Hugging Face Spaces deployment
+ """
+ import gradio as gr
+ import uvicorn
+ from app.main import app
+
+ # Create a minimal Gradio UI and mount it onto the FastAPI app
+ def create_gradio_app():
+     # mount_gradio_app needs a Blocks instance; this one just shows a status page
+     with gr.Blocks() as demo:
+         gr.Markdown("FastVLM Screen Observer backend is running.")
+     # This allows the FastAPI routes to run alongside Gradio
+     return gr.mount_gradio_app(app, demo, path="/")
+
+ # For Hugging Face Spaces
+ if __name__ == "__main__":
+     uvicorn.run(create_gradio_app(), host="0.0.0.0", port=8000)
docker-compose.yml ADDED
@@ -0,0 +1,39 @@
+ version: '3.8'
+
+ services:
+   backend:
+     build: .
+     ports:
+       - "8000:8000"
+     environment:
+       - USE_EXTREME_OPTIMIZATION=true
+       - MAX_MEMORY_GB=3
+       - PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:256
+       - OMP_NUM_THREADS=2
+     volumes:
+       - model_cache:/root/.cache/huggingface
+       - ./logs:/app/backend/logs
+     deploy:
+       resources:
+         limits:
+           memory: 3G
+         reservations:
+           memory: 2G
+     restart: unless-stopped
+
+   frontend:
+     image: node:18-alpine
+     working_dir: /app
+     volumes:
+       - ./frontend:/app
+     command: sh -c "npm install && npm run build && npm run preview -- --host 0.0.0.0 --port 5173"
+     ports:
+       - "5173:5173"
+     environment:
+       - VITE_API_URL=http://localhost:8000
+     depends_on:
+       - backend
+
+ volumes:
+   model_cache:
+     driver: local
fly.toml ADDED
@@ -0,0 +1,48 @@
+ # Fly.io configuration for FastVLM with memory optimization
+ app = "fastvlm-screen-observer"
+ primary_region = "sjc"
+
+ [build]
+   dockerfile = "Dockerfile"
+
+ [env]
+   USE_EXTREME_OPTIMIZATION = "true"
+   MAX_MEMORY_GB = "3"
+   PORT = "8000"
+
+ [experimental]
+   allowed_public_ports = []
+   auto_rollback = true
+
+ [[services]]
+   http_checks = []
+   internal_port = 8000
+   processes = ["app"]
+   protocol = "tcp"
+   script_checks = []
+
+   [services.concurrency]
+     hard_limit = 25
+     soft_limit = 20
+     type = "connections"
+
+   [[services.ports]]
+     force_https = true
+     handlers = ["http"]
+     port = 80
+
+   [[services.ports]]
+     handlers = ["tls", "http"]
+     port = 443
+
+   [[services.tcp_checks]]
+     grace_period = "60s"
+     interval = "30s"
+     restart_limit = 0
+     timeout = "10s"
+
+ # Request 4GB RAM (minimum for 4-bit quantized model)
+ [[vm]]
+   cpu_kind = "shared"
+   cpus = 2
+   memory_mb = 4096