yonnel committed · Commit 66fef64 · Parent(s): 0

Initial clean commit - FastAPI movie backend without large data files
Browse files
- .env.example +14 -0
- .gitattributes +36 -0
- .gitignore +57 -0
- Dockerfile +32 -0
- README.md +42 -0
- README_HF.md +42 -0
- app/__init__.py +6 -0
- app/build_index.py +485 -0
- app/main.py +303 -0
- app/settings.py +35 -0
- app/test_api.py +80 -0
- app/test_setup.py +121 -0
- requirements.txt +12 -0
.env.example
ADDED
@@ -0,0 +1,14 @@
+# OpenAI API key for embeddings
+OPENAI_API_KEY=your_openai_api_key_here
+
+# TMDB API key for movie data
+TMDB_API_KEY=your_tmdb_api_key_here
+
+# API authentication token
+API_TOKEN=your_api_token_here
+
+# Environment (dev/prod)
+ENV=dev
+
+# Logging level
+LOG_LEVEL=INFO
.gitattributes
ADDED
@@ -0,0 +1,36 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.index filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,57 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Environment
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Data files (these are large and will be generated on deployment)
+app/data/*.npy
+app/data/*.index
+app/data/movie_metadata.json
+app/data/id_map.json
+app/data/checkpoints/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Logs
+*.log
+
+# Temporary files
+tmp/
+temp/
Dockerfile
ADDED
@@ -0,0 +1,33 @@
+FROM python:3.9-slim
+
+# Install system dependencies (curl is required by the HEALTHCHECK below;
+# the slim base image does not ship it)
+RUN apt-get update && apt-get install -y \
+    gcc \
+    g++ \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Copy requirements first for better caching
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy application code
+COPY app/ ./app/
+
+# Create data directory
+RUN mkdir -p app/data
+
+# Expose port
+EXPOSE 7860
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:7860/health || exit 1
+
+# Run the application
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
ADDED
@@ -0,0 +1,42 @@
+---
+title: Karl Movie Vector Backend
+emoji: 🎬
+colorFrom: blue
+colorTo: purple
+sdk: docker
+pinned: false
+license: mit
+---
+
+# Karl Movie Vector Backend
+
+FastAPI backend for semantic movie recommendations using FAISS and OpenAI embeddings. Powers intelligent movie discovery with geometric subspace algorithms.
+
+## Features
+
+- Semantic movie search using OpenAI embeddings
+- FAISS-powered vector similarity search
+- Geometric subspace algorithms for multi-movie preferences
+- ~150ms response time on CPU
+- RESTful API with Bearer token authentication
+
+## API Usage
+
+```bash
+curl -X POST "https://yonnel-karl-movie-vector-backend.hf.space/explore" \
+  -H "Authorization: Bearer YOUR_TOKEN" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "liked_ids": [550, 680],
+    "disliked_ids": [],
+    "top_k": 100
+  }'
+```
+
+## Environment Variables
+
+Set these in your Space settings:
+- `OPENAI_API_KEY`: Your OpenAI API key
+- `TMDB_API_KEY`: Your TMDB API key
+- `API_TOKEN`: Authentication token for API access
+- `ENV`: Set to "prod" for production
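For reference, the same `/explore` call can be made from Python; a minimal sketch mirroring the curl example in the README above (`YOUR_TOKEN` is the same placeholder, and the response fields follow the `ExploreResponse` model in `app/main.py`):

```python
import requests

# Hedged sketch of the README's curl example; YOUR_TOKEN is a placeholder.
resp = requests.post(
    "https://yonnel-karl-movie-vector-backend.hf.space/explore",
    headers={"Authorization": "Bearer YOUR_TOKEN"},
    json={"liked_ids": [550, 680], "disliked_ids": [], "top_k": 100},
    timeout=30,
)
resp.raise_for_status()
data = resp.json()  # fields per ExploreResponse: movies, bary, center
for movie in data["movies"][:3]:
    print(movie["title"], movie["coords"])
```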
README_HF.md
ADDED
@@ -0,0 +1,42 @@
+---
+title: Karl Movie Vector Backend
+emoji: 🎬
+colorFrom: blue
+colorTo: purple
+sdk: docker
+pinned: false
+license: mit
+---
+
+# Karl Movie Vector Backend
+
+FastAPI backend for semantic movie recommendations using FAISS and OpenAI embeddings. Powers intelligent movie discovery with geometric subspace algorithms.
+
+## Features
+
+- Semantic movie search using OpenAI embeddings
+- FAISS-powered vector similarity search
+- Geometric subspace algorithms for multi-movie preferences
+- ~150ms response time on CPU
+- RESTful API with Bearer token authentication
+
+## API Usage
+
+```bash
+curl -X POST "https://yonnel-karl-movie-vector-backend.hf.space/explore" \
+  -H "Authorization: Bearer YOUR_TOKEN" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "liked_ids": [550, 680],
+    "disliked_ids": [],
+    "top_k": 100
+  }'
+```
+
+## Environment Variables
+
+Set these in your Space settings:
+- `OPENAI_API_KEY`: Your OpenAI API key
+- `TMDB_API_KEY`: Your TMDB API key
+- `API_TOKEN`: Authentication token for API access
+- `ENV`: Set to "prod" for production
app/__init__.py
ADDED
@@ -0,0 +1,6 @@
+"""
+Karl-Movie Vector Backend
+A FastAPI service for semantic movie recommendations using FAISS and OpenAI embeddings
+"""
+
+__version__ = "1.0.0"
app/build_index.py
ADDED
@@ -0,0 +1,485 @@
+"""
+Build FAISS index from movie embeddings
+This script should be run once to create the data files needed by the API
+"""
+import os
+import json
+import numpy as np
+import faiss
+from openai import OpenAI
+import requests
+from typing import Dict, List, Optional
+import time
+import argparse
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import logging
+from settings import get_settings
+import pickle
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Checkpoint file paths
+CHECKPOINT_DIR = "app/data/checkpoints"
+MOVIE_DATA_CHECKPOINT = f"{CHECKPOINT_DIR}/movie_data.pkl"
+EMBEDDINGS_CHECKPOINT = f"{CHECKPOINT_DIR}/embeddings_progress.pkl"
+METADATA_CHECKPOINT = f"{CHECKPOINT_DIR}/metadata_progress.pkl"
+
+def save_checkpoint(data, filepath: str):
+    """Save checkpoint data to file"""
+    os.makedirs(os.path.dirname(filepath), exist_ok=True)
+    with open(filepath, 'wb') as f:
+        pickle.dump(data, f)
+    logger.info(f"Checkpoint saved: {filepath}")
+
+def load_checkpoint(filepath: str):
+    """Load checkpoint data from file"""
+    if os.path.exists(filepath):
+        with open(filepath, 'rb') as f:
+            data = pickle.load(f)
+        logger.info(f"Checkpoint loaded: {filepath}")
+        return data
+    return None
+
+def cleanup_checkpoints():
+    """Remove checkpoint files after successful completion"""
+    import shutil
+    if os.path.exists(CHECKPOINT_DIR):
+        shutil.rmtree(CHECKPOINT_DIR)
+        logger.info("Checkpoint files cleaned up")
+
+class TMDBClient:
+    """Client for TMDB API with retry and backoff"""
+
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        self.base_url = "https://api.themoviedb.org/3"
+        self.session = requests.Session()
+
+    def _make_request(self, endpoint: str, params: dict = None, max_retries: int = 3) -> Optional[dict]:
+        """Make API request with retry and backoff"""
+        if params is None:
+            params = {}
+        params['api_key'] = self.api_key
+
+        url = f"{self.base_url}{endpoint}"
+
+        for attempt in range(max_retries):
+            try:
+                response = self.session.get(url, params=params, timeout=10)
+
+                if response.status_code == 200:
+                    return response.json()
+                elif response.status_code == 429:
+                    # Rate limit - wait and retry
+                    wait_time = 2 ** attempt
+                    logger.warning(f"Rate limited, waiting {wait_time}s before retry...")
+                    time.sleep(wait_time)
+                    continue
+                elif response.status_code == 404:
+                    logger.warning(f"Resource not found: {url}")
+                    return None
+                else:
+                    logger.error(f"API error {response.status_code}: {response.text}")
+
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Request failed (attempt {attempt + 1}): {e}")
+                if attempt < max_retries - 1:
+                    time.sleep(2 ** attempt)
+
+        return None
+
+    def get_popular_movies(self, max_pages: int = 100) -> List[int]:
+        """Get movie IDs from popular movies pagination"""
+        movie_ids = []
+
+        for page in range(1, max_pages + 1):
+            logger.info(f"Fetching popular movies page {page}/{max_pages}")
+
+            data = self._make_request("/movie/popular", {"page": page})
+            if not data:
+                logger.error(f"Failed to fetch page {page}")
+                break
+
+            # Check if we've exceeded total pages
+            if page > data.get('total_pages', 0):
+                logger.info(f"Reached last page ({data.get('total_pages')})")
+                break
+
+            # Extract movie IDs
+            for movie in data.get('results', []):
+                movie_ids.append(movie['id'])
+
+            # Rate limiting
+            time.sleep(0.25)  # 4 requests per second max
+
+        logger.info(f"Collected {len(movie_ids)} movie IDs from {page} pages")
+        return movie_ids
+
+    def get_movie_details(self, movie_id: int) -> Optional[dict]:
+        """Get detailed movie information"""
+        return self._make_request(f"/movie/{movie_id}")
+
+    def get_movie_credits(self, movie_id: int) -> Optional[dict]:
+        """Get movie cast and crew"""
+        return self._make_request(f"/movie/{movie_id}/credits")
+
+def fetch_movie_data(tmdb_client: TMDBClient, movie_ids: List[int], max_workers: int = 5) -> Dict[int, dict]:
+    """Fetch detailed data for all movies with controlled parallelization"""
+    movies_data = {}
+
+    def fetch_single_movie(movie_id: int) -> tuple:
+        """Fetch details and credits for a single movie"""
+        try:
+            # Get basic details
+            details = tmdb_client.get_movie_details(movie_id)
+            if not details:
+                return movie_id, None
+
+            # Get credits
+            credits = tmdb_client.get_movie_credits(movie_id)
+            if credits:
+                details['credits'] = credits
+
+            return movie_id, details
+
+        except Exception as e:
+            logger.error(f"Error fetching movie {movie_id}: {e}")
+            return movie_id, None
+
+    # Process movies in batches with controlled parallelization
+    batch_size = 50
+    total_movies = len(movie_ids)
+
+    for i in range(0, total_movies, batch_size):
+        batch = movie_ids[i:i + batch_size]
+        logger.info(f"Processing batch {i//batch_size + 1}/{(total_movies-1)//batch_size + 1} ({len(batch)} movies)")
+
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = {executor.submit(fetch_single_movie, movie_id): movie_id for movie_id in batch}
+
+            for future in as_completed(futures):
+                movie_id, movie_data = future.result()
+                if movie_data:
+                    movies_data[movie_id] = movie_data
+
+        # Sleep between batches to be respectful to API
+        time.sleep(1)
+
+    logger.info(f"Successfully fetched data for {len(movies_data)}/{total_movies} movies")
+    return movies_data
+
+def create_composite_text(movie_data: Dict) -> str:
+    """Create composite text for embedding from movie data"""
+    parts = []
+
+    # Title
+    if movie_data.get('title'):
+        parts.append(f"Title: {movie_data['title']}")
+
+    # Tagline
+    if movie_data.get('tagline'):
+        parts.append(f"Tagline: {movie_data['tagline']}")
+
+    # Overview
+    if movie_data.get('overview'):
+        parts.append(f"Overview: {movie_data['overview']}")
+
+    # Release date
+    if movie_data.get('release_date'):
+        parts.append(f"Release Date: {movie_data['release_date']}")
+
+    # Original language
+    if movie_data.get('original_language'):
+        parts.append(f"Language: {movie_data['original_language']}")
+
+    # Spoken languages
+    if movie_data.get('spoken_languages'):
+        languages = [lang.get('iso_639_1', '') for lang in movie_data['spoken_languages'] if lang.get('iso_639_1')]
+        if languages:
+            parts.append(f"Spoken Languages: {', '.join(languages)}")
+
+    # Genres
+    if movie_data.get('genres'):
+        genres = [genre['name'] for genre in movie_data['genres']]
+        parts.append(f"Genres: {', '.join(genres)}")
+
+    # Production companies
+    if movie_data.get('production_companies'):
+        companies = [company['name'] for company in movie_data['production_companies']]
+        if companies:
+            parts.append(f"Production Companies: {', '.join(companies)}")
+
+    # Production countries
+    if movie_data.get('production_countries'):
+        countries = [country['name'] for country in movie_data['production_countries']]
+        if countries:
+            parts.append(f"Production Countries: {', '.join(countries)}")
+
+    # Budget (only if > 0)
+    if movie_data.get('budget') and movie_data['budget'] > 0:
+        parts.append(f"Budget: ${movie_data['budget']:,}")
+
+    # Popularity
+    if movie_data.get('popularity'):
+        parts.append(f"Popularity: {movie_data['popularity']}")
+
+    # Vote average
+    if movie_data.get('vote_average'):
+        parts.append(f"Vote Average: {movie_data['vote_average']}")
+
+    # Vote count
+    if movie_data.get('vote_count'):
+        parts.append(f"Vote Count: {movie_data['vote_count']}")
+
+    # Director(s)
+    if movie_data.get('credits', {}).get('crew'):
+        directors = [person['name'] for person in movie_data['credits']['crew'] if person['job'] == 'Director']
+        if directors:
+            parts.append(f"Director: {', '.join(directors)}")
+
+    # Top 5 cast
+    if movie_data.get('credits', {}).get('cast'):
+        top_cast = [person['name'] for person in movie_data['credits']['cast'][:5]]
+        if top_cast:
+            parts.append(f"Cast: {', '.join(top_cast)}")
+
+    return " / ".join(parts)
+
+def get_embeddings_batch(texts: List[str], client: OpenAI, model: str = "text-embedding-3-small") -> List[List[float]]:
+    """Get embeddings for a batch of texts with retry"""
+    max_retries = 3
+
+    for attempt in range(max_retries):
+        try:
+            response = client.embeddings.create(
+                input=texts,
+                model=model
+            )
+            return [item.embedding for item in response.data]
+        except Exception as e:
+            logger.error(f"Error getting embeddings (attempt {attempt + 1}): {e}")
+            if attempt < max_retries - 1:
+                time.sleep(2 ** attempt)
+            else:
+                raise
+
+def build_index(max_pages: int = 10, model: str = "text-embedding-3-small", use_faiss: bool = True):
+    """Main function to build the FAISS index and data files"""
+    settings = get_settings()
+
+    # Initialize clients
+    tmdb_client = TMDBClient(settings.tmdb_api_key)
+    openai_client = OpenAI(api_key=settings.openai_api_key)
+
+    # Create data directory
+    os.makedirs("app/data", exist_ok=True)
+
+    # Check for existing movie data checkpoint
+    movies_data = load_checkpoint(MOVIE_DATA_CHECKPOINT)
+
+    if movies_data is not None:
+        logger.info(f"🔄 Resuming from checkpoint: {len(movies_data)} movies data found")
+    else:
+        # Step 1: Get movie IDs
+        logger.info(f"Fetching movie IDs from TMDB (max {max_pages} pages)...")
+        movie_ids = tmdb_client.get_popular_movies(max_pages=max_pages)
+
+        if not movie_ids:
+            logger.error("❌ No movie IDs retrieved from TMDB")
+            return
+
+        # Step 2: Fetch detailed movie data
+        logger.info(f"Fetching detailed data for {len(movie_ids)} movies...")
+        movies_data = fetch_movie_data(tmdb_client, movie_ids)
+
+        if not movies_data:
+            logger.error("❌ No movie data retrieved")
+            return
+
+        # Save movie data checkpoint
+        save_checkpoint(movies_data, MOVIE_DATA_CHECKPOINT)
+
+    # Step 3: Create composite texts and process embeddings in batches
+    logger.info("Creating embeddings...")
+    embeddings = []
+    id_map = {}
+    movie_metadata = {}
+    processed_movie_ids = set()
+
+    batch_size = 20  # Process 20 texts at a time
+
+    # Check for existing embedding progress
+    embedding_checkpoint = load_checkpoint(EMBEDDINGS_CHECKPOINT)
+    metadata_checkpoint = load_checkpoint(METADATA_CHECKPOINT)
+
+    if embedding_checkpoint is not None and metadata_checkpoint is not None:
+        embeddings = embedding_checkpoint['embeddings']
+        id_map = embedding_checkpoint['id_map']
+        processed_movie_ids = set(embedding_checkpoint['processed_movie_ids'])
+        movie_metadata = metadata_checkpoint
+        logger.info(f"🔄 Resuming embeddings from checkpoint: {len(embeddings)} embeddings found")
+    else:
+        logger.info("Starting embeddings from scratch")
+
+    # Process remaining movies
+    remaining_movies = {k: v for k, v in movies_data.items() if k not in processed_movie_ids}
+    logger.info(f"Processing {len(remaining_movies)} remaining movies")
+
+    composite_texts = []
+    current_movie_ids = []
+
+    for movie_id, movie_data in remaining_movies.items():
+        # Create composite text
+        composite_text = create_composite_text(movie_data)
+        composite_texts.append(composite_text)
+        current_movie_ids.append(movie_id)
+
+        # Store metadata
+        release_year = 0
+        if movie_data.get("release_date"):
+            try:
+                release_year = int(movie_data["release_date"][:4])
+            except (ValueError, IndexError):
+                release_year = 0
+
+        movie_metadata[str(movie_id)] = {
+            "id": movie_id,
+            "title": movie_data.get("title", ""),
+            "year": release_year,
+            "poster_path": movie_data.get("poster_path"),
+            "release_date": movie_data.get("release_date"),
+            "genres": [g["name"] for g in movie_data.get("genres", [])]
+        }
+
+        # Process batch when full
+        if len(composite_texts) >= batch_size:
+            logger.info(f"Processing embedding batch ({len(embeddings)} done, {len(composite_texts)} in batch)")
+
+            try:
+                batch_embeddings = get_embeddings_batch(composite_texts, openai_client, model)
+                embeddings.extend(batch_embeddings)
+
+                # Update ID mapping and processed set
+                for i, mid in enumerate(current_movie_ids):
+                    id_map[str(mid)] = len(id_map)
+                    processed_movie_ids.add(mid)
+
+                # Save progress checkpoints
+                embedding_data = {
+                    'embeddings': embeddings,
+                    'id_map': id_map,
+                    'processed_movie_ids': list(processed_movie_ids)
+                }
+                save_checkpoint(embedding_data, EMBEDDINGS_CHECKPOINT)
+                save_checkpoint(movie_metadata, METADATA_CHECKPOINT)
+
+                # Clear batch
+                composite_texts = []
+                current_movie_ids = []
+
+                # Sleep between batches
+                time.sleep(0.5)
+
+            except Exception as e:
+                logger.error(f"Failed to process batch: {e}")
+                logger.info("Progress has been saved, you can restart the script to resume")
+                return
+
+    # Process remaining texts
+    if composite_texts:
+        logger.info(f"Processing final embedding batch ({len(composite_texts)} texts)")
+        try:
+            batch_embeddings = get_embeddings_batch(composite_texts, openai_client, model)
+            embeddings.extend(batch_embeddings)
+
+            for i, mid in enumerate(current_movie_ids):
+                id_map[str(mid)] = len(id_map)
+                processed_movie_ids.add(mid)
+
+            # Save final progress
+            embedding_data = {
+                'embeddings': embeddings,
+                'id_map': id_map,
+                'processed_movie_ids': list(processed_movie_ids)
+            }
+            save_checkpoint(embedding_data, EMBEDDINGS_CHECKPOINT)
+            save_checkpoint(movie_metadata, METADATA_CHECKPOINT)
+
+        except Exception as e:
+            logger.error(f"Failed to process final batch: {e}")
+            logger.info("Progress has been saved, you can restart the script to resume")
+            return
+
+    if not embeddings:
+        logger.error("❌ No embeddings generated")
+        return
+
+    logger.info(f"Generated {len(embeddings)} embeddings")
+
+    # Step 4: Save embeddings as numpy array
+    embeddings_array = np.array(embeddings, dtype=np.float32)
+    np.save("app/data/movies.npy", embeddings_array)
+    logger.info(f"Saved embeddings matrix: {embeddings_array.shape}")
+
+    # Step 5: Build and save FAISS index
+    if use_faiss:
+        logger.info("Building FAISS index...")
+        dimension = embeddings_array.shape[1]
+
+        # Choose index type based on size
+        if len(embeddings) < 10000:
+            # For smaller datasets, use flat index
+            index = faiss.IndexFlatL2(dimension)
+        else:
+            # For larger datasets, use IVF index
+            nlist = min(int(np.sqrt(len(embeddings))), 1000)
+            quantizer = faiss.IndexFlatL2(dimension)
+            index = faiss.IndexIVFFlat(quantizer, dimension, nlist)
+            # Train the index
+            index.train(embeddings_array)
+
+        index.add(embeddings_array)
+        faiss.write_index(index, "app/data/faiss.index")
+        logger.info(f"FAISS index saved (type: {type(index).__name__}, dimension: {dimension})")
+
+    # Step 6: Save metadata files
+    with open("app/data/id_map.json", "w") as f:
+        json.dump(id_map, f)
+
+    with open("app/data/movie_metadata.json", "w") as f:
+        json.dump(movie_metadata, f)
+
+    logger.info("✅ Index built successfully!")
+    logger.info(f"   - {len(embeddings)} movies indexed")
+    logger.info(f"   - Embedding model: {model}")
+    logger.info(f"   - Files saved in app/data/")
+    logger.info(f"     * movies.npy: embeddings matrix")
+    logger.info(f"     * id_map.json: TMDB ID to matrix position mapping")
+    logger.info(f"     * movie_metadata.json: movie metadata")
+    if use_faiss:
+        logger.info(f"     * faiss.index: FAISS search index")
+
+    # Cleanup checkpoints
+    cleanup_checkpoints()
+
+# Remove the old functions that are no longer needed
+# create_movie_embedding and load_movie_data are replaced by the new implementation
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Build movie embeddings index from TMDB data")
+    parser.add_argument("--max-pages", type=int, default=10,
+                        help="Maximum pages to fetch from TMDB popular movies (default: 10)")
+    parser.add_argument("--model", type=str, default="text-embedding-3-small",
+                        help="OpenAI embedding model to use (default: text-embedding-3-small)")
+    parser.add_argument("--no-faiss", action="store_true",
+                        help="Skip building FAISS index")
+
+    args = parser.parse_args()
+
+    build_index(
+        max_pages=args.max_pages,
+        model=args.model,
+        use_faiss=not args.no_faiss
+    )
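For reference, `create_composite_text` flattens one TMDB record into a single `" / "`-joined string before it is embedded. A minimal sketch with a hand-written, hypothetical record (field names follow the TMDB response shape used above; run it from inside `app/`, since the scripts use flat imports):

```python
# Run from inside app/ (build_index.py itself does `from settings import ...`).
from build_index import create_composite_text

# Hypothetical TMDB-style record, for illustration only.
sample = {
    "title": "Example Movie",
    "overview": "A small sample overview.",
    "release_date": "2001-02-03",
    "genres": [{"name": "Drama"}],
    "credits": {"crew": [{"name": "Jane Doe", "job": "Director"}], "cast": []},
}
print(create_composite_text(sample))
# -> Title: Example Movie / Overview: A small sample overview. /
#    Release Date: 2001-02-03 / Genres: Drama / Director: Jane Doe
```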
app/main.py
ADDED
@@ -0,0 +1,303 @@
+import os
+import json
+import numpy as np
+import faiss
+from fastapi import FastAPI, HTTPException, Depends, status
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+from pydantic import BaseModel
+from typing import List, Optional
+import logging
+import time
+
+# Configure logging
+logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO").upper())
+logger = logging.getLogger(__name__)
+
+# Security
+security = HTTPBearer()
+
+def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
+    expected_token = os.getenv("API_TOKEN")
+    if not expected_token:
+        raise HTTPException(status_code=500, detail="API token not configured")
+    if credentials.credentials != expected_token:
+        raise HTTPException(status_code=401, detail="Invalid token")
+    return credentials.credentials
+
+# Pydantic models
+class ExploreRequest(BaseModel):
+    liked_ids: List[int]
+    disliked_ids: List[int] = []
+    top_k: int = 400
+
+class MovieResult(BaseModel):
+    id: int
+    title: str
+    year: int
+    poster_path: Optional[str]
+    genres: List[str]
+    coords: List[float]
+
+class ExploreResponse(BaseModel):
+    movies: List[MovieResult]
+    bary: List[float]
+    center: List[float]
+
+# Global variables for loaded data
+vectors = None
+id_map = None
+faiss_index = None
+movie_metadata = None
+
+def load_data():
+    """Load FAISS index, vectors, and metadata on startup"""
+    global vectors, id_map, faiss_index, movie_metadata
+
+    try:
+        # Load vectors
+        vectors = np.load("app/data/movies.npy")
+        logger.info(f"Loaded {vectors.shape[0]} movie vectors of dimension {vectors.shape[1]}")
+
+        # Load ID mapping
+        with open("app/data/id_map.json", "r") as f:
+            id_map = json.load(f)
+        logger.info(f"Loaded ID mapping for {len(id_map)} movies")
+
+        # Load FAISS index
+        faiss_index = faiss.read_index("app/data/faiss.index")
+        logger.info(f"Loaded FAISS index with {faiss_index.ntotal} vectors")
+
+        # Load movie metadata
+        with open("app/data/movie_metadata.json", "r") as f:
+            movie_metadata = json.load(f)
+        logger.info(f"Loaded metadata for {len(movie_metadata)} movies")
+
+    except Exception as e:
+        logger.error(f"Failed to load data: {e}")
+        raise
+
+def build_plane(likes: np.ndarray, dislikes: np.ndarray = None, dim: int = 2):
+    """
+    Build user subspace from liked/disliked movies
+    Returns (axes, center) where axes is 2xD orthonormal matrix
+    """
+    n_likes = likes.shape[0] if likes is not None else 0
+    d = vectors.shape[1]
+
+    # Compute composite vector: +liked - 0.5*disliked
+    if n_likes == 0:
+        # Cold start: use global average
+        center = vectors.mean(0)
+        # Create random orthonormal basis
+        axes = np.random.randn(dim, d)
+        axes[0] /= np.linalg.norm(axes[0])
+        for i in range(1, dim):
+            for j in range(i):
+                axes[i] -= np.dot(axes[i], axes[j]) * axes[j]
+            axes[i] /= np.linalg.norm(axes[i])
+    else:
+        # Compute composite from likes and dislikes
+        composite = likes.mean(0)
+        if dislikes is not None and dislikes.shape[0] > 0:
+            composite -= 0.5 * dislikes.mean(0)
+
+        if n_likes == 1:
+            # One like: use as center, random orthogonal axes
+            center = composite
+            axis1 = np.random.randn(d)
+            axis1 /= np.linalg.norm(axis1)
+            axis2 = np.random.randn(d)
+            axis2 -= np.dot(axis2, axis1) * axis1
+            axis2 /= np.linalg.norm(axis2)
+            axes = np.vstack([axis1, axis2])
+        elif n_likes == 2:
+            # Two likes: line between them
+            center = likes.mean(0)
+            axis1 = likes[1] - likes[0]
+            axis1 /= np.linalg.norm(axis1)
+            axis2 = np.random.randn(d)
+            axis2 -= np.dot(axis2, axis1) * axis1
+            axis2 /= np.linalg.norm(axis2)
+            axes = np.vstack([axis1, axis2])
+        else:
+            # 3+ likes: PCA plane
+            center = likes.mean(0)
+            likes_centered = likes - center
+            u, s, vt = np.linalg.svd(likes_centered, full_matrices=False)
+            axes = vt[:2]  # First 2 principal components
+
+    return axes, center
+
+def assign_spiral_coords(n_movies: int):
+    """
+    Assign 2D grid coordinates in outward spiral pattern
+    Returns array of shape (n_movies, 2) with integer coordinates
+    """
+    coords = np.zeros((n_movies, 2), dtype=int)
+    if n_movies == 0:
+        return coords
+
+    coords[0] = [0, 0]  # Start at origin
+
+    if n_movies == 1:
+        return coords
+
+    # Spiral pattern: right, up, left, down, repeat with increasing distances
+    dx, dy = [1, 0, -1, 0], [0, 1, 0, -1]
+    direction = 0
+    steps = 1
+    x, y = 0, 0
+    idx = 1
+
+    while idx < n_movies:
+        for _ in range(2):  # Each step count is used twice (except the first)
+            for _ in range(steps):
+                if idx >= n_movies:
+                    break
+                x += dx[direction]
+                y += dy[direction]
+                coords[idx] = [x, y]
+                idx += 1
+            direction = (direction + 1) % 4
+            if idx >= n_movies:
+                break
+        steps += 1
+
+    return coords
+
+def compute_barycenter(liked_indices: List[int], coords: np.ndarray):
+    """Compute barycenter of liked movies in 2D grid"""
+    if not liked_indices:
+        return [0.0, 0.0]
+
+    liked_coords = coords[liked_indices]
+    bary = liked_coords.mean(0)
+    return bary.tolist()
+
+# FastAPI app setup
+app = FastAPI(title="Karl-Movie Vector Backend", version="1.0.0")
+
+# CORS configuration
+DEV_ORIGINS = [
+    "http://localhost:5173",
+    "http://127.0.0.1:5173",
+    "http://localhost:8888",
+    "https://*.bolt.run",
+    "https://*.stackblitz.io",
+]
+
+PROD_ORIGINS = ["https://karl.movie"]
+
+origins = DEV_ORIGINS if os.getenv("ENV") != "prod" else PROD_ORIGINS
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,
+    allow_methods=["POST", "GET"],
+    allow_headers=["*"],
+)
+
+@app.on_event("startup")
+async def startup_event():
+    """Load data on startup"""
+    load_data()
+
+@app.get("/health")
+async def health_check():
+    """Health check endpoint"""
+    return {"status": "healthy", "vectors_loaded": vectors is not None}
+
+@app.post("/explore", response_model=ExploreResponse)
+async def explore(
+    request: ExploreRequest,
+    token: str = Depends(verify_token)
+):
+    """
+    Main endpoint: find movies closest to user's preference subspace
+    """
+    start_time = time.time()
+
+    try:
+        # Convert TMDB IDs to internal indices
+        liked_indices = []
+        disliked_indices = []
+
+        for tmdb_id in request.liked_ids:
+            if str(tmdb_id) in id_map:
+                liked_indices.append(id_map[str(tmdb_id)])
+            else:
+                logger.warning(f"TMDB ID {tmdb_id} not found in index")
+
+        for tmdb_id in request.disliked_ids:
+            if str(tmdb_id) in id_map:
+                disliked_indices.append(id_map[str(tmdb_id)])
+            else:
+                logger.warning(f"TMDB ID {tmdb_id} not found in index")
+
+        # Get embedding vectors
+        liked_vectors = vectors[liked_indices] if liked_indices else None
+        disliked_vectors = vectors[disliked_indices] if disliked_indices else None
+
+        # Build user subspace
+        axes, center = build_plane(liked_vectors, disliked_vectors)
+
+        # Project all vectors onto the 2D subspace
+        projections = np.dot(vectors - center, axes.T)  # Shape: (N, 2)
+
+        # Reconstruct vectors in original space
+        reconstructed = np.dot(projections, axes) + center
+
+        # Compute distances to subspace (residuals)
+        residuals = np.linalg.norm(vectors - reconstructed, axis=1)
+
+        # Get top-k closest movies
+        top_k_indices = np.argpartition(residuals, min(request.top_k, len(residuals)))[:request.top_k]
+        top_k_indices = top_k_indices[np.argsort(residuals[top_k_indices])]
+
+        # Assign spiral coordinates
+        spiral_coords = assign_spiral_coords(len(top_k_indices))
+
+        # Compute barycenter of liked movies
+        liked_positions = [i for i, idx in enumerate(top_k_indices) if idx in liked_indices]
+        bary = compute_barycenter(liked_positions, spiral_coords)
+
+        # Translate grid so barycenter is at origin
+        spiral_coords = spiral_coords - np.array(bary)
+
+        # Build response
+        movies = []
+        reverse_id_map = {v: k for k, v in id_map.items()}
+
+        for i, movie_idx in enumerate(top_k_indices):
+            tmdb_id = int(reverse_id_map[movie_idx])
+            metadata = movie_metadata.get(str(tmdb_id), {})
+
+            movie = MovieResult(
+                id=tmdb_id,
+                title=metadata.get("title", f"Movie {tmdb_id}"),
+                year=metadata.get("year", 0),
+                poster_path=metadata.get("poster_path"),
+                genres=metadata.get("genres", []),
+                coords=spiral_coords[i].tolist()
+            )
+            movies.append(movie)
+
+        response = ExploreResponse(
+            movies=movies,
+            bary=[0.0, 0.0],  # Always [0,0] since we translated
+            center=center.tolist()
+        )
+
+        elapsed = time.time() - start_time
+        logger.info(f"Explore request processed in {elapsed:.3f}s - {len(request.liked_ids)} likes, {len(request.disliked_ids)} dislikes, {len(movies)} results")
+
+        return response
+
+    except Exception as e:
+        logger.error(f"Error processing explore request: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
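The geometry in `build_plane` and `/explore` is plain least-squares projection: given an orthonormal 2×D basis `axes` and a `center`, each movie vector v is scored by the residual ‖(v − center) − projection of (v − center) onto the plane‖, and the movies with the smallest residuals lie closest to the user's preference plane. A self-contained numpy sketch of that ranking step, on toy data rather than the service's real embeddings:

```python
import numpy as np

rng = np.random.default_rng(0)
vectors = rng.normal(size=(100, 8)).astype(np.float32)  # toy stand-in for movies.npy
likes = vectors[:3]                                     # pretend the user liked 3 movies

# 3+ likes: PCA plane, as in build_plane()
center = likes.mean(0)
_, _, vt = np.linalg.svd(likes - center, full_matrices=False)
axes = vt[:2]                                           # orthonormal 2xD basis

# Project, reconstruct, and rank by distance to the plane, as in /explore
projections = (vectors - center) @ axes.T               # shape (N, 2)
reconstructed = projections @ axes + center
residuals = np.linalg.norm(vectors - reconstructed, axis=1)
print(np.argsort(residuals)[:10])                       # 10 movies closest to the plane
```

Note that the three liked movies themselves have residual ~0, since three centered points always lie in some 2-D subspace; everything else is ranked by how far it falls off that plane.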
app/settings.py
ADDED
@@ -0,0 +1,35 @@
+"""
+Settings and environment configuration
+"""
+import os
+from functools import lru_cache
+from pydantic_settings import BaseSettings
+
+
+class Settings(BaseSettings):
+    """Application settings"""
+
+    # OpenAI API key for embeddings
+    openai_api_key: str
+
+    # TMDB API key for movie data
+    tmdb_api_key: str
+
+    # API authentication token
+    api_token: str
+
+    # Environment (dev/prod)
+    env: str = "dev"
+
+    # Logging level
+    log_level: str = "INFO"
+
+    class Config:
+        env_file = ".env"
+        env_file_encoding = "utf-8"
+
+
+@lru_cache()
+def get_settings() -> Settings:
+    """Get cached settings instance"""
+    return Settings()
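A minimal sketch of how these settings are consumed (`build_index.py` and `test_setup.py` import them the same way); it assumes a populated `.env` like `.env.example` is present and that you run from inside `app/` so the flat import resolves:

```python
# Run from inside app/ with a populated .env (see .env.example).
from settings import get_settings

settings = get_settings()   # first call reads env vars / .env; later calls hit the lru_cache
print(settings.env)         # "dev" unless overridden
print(settings.log_level)   # "INFO" unless overridden
# The required fields (openai_api_key, tmdb_api_key, api_token) raise a
# pydantic ValidationError if neither the environment nor .env provides them.
```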
app/test_api.py
ADDED
@@ -0,0 +1,80 @@
+"""
+Test the /explore API with sample movies
+"""
+import requests
+import json
+
+# Configuration
+API_URL = "http://localhost:8000"
+API_TOKEN = "your-api-token"  # Replace with your token
+
+def test_explore_endpoint():
+    """Test the /explore endpoint with different scenarios"""
+
+    # Read the metadata to get some test IDs
+    with open("app/data/movie_metadata.json", "r") as f:
+        metadata = json.load(f)
+
+    # Take the first movies as examples
+    movie_ids = list(metadata.keys())[:5]
+    print(f"Available test movies: {[metadata[mid]['title'] for mid in movie_ids]}")
+
+    # Test 1: Search with 1 liked movie
+    print("\n🎬 Test 1: Search with 1 liked movie")
+    test_request = {
+        "liked_ids": [int(movie_ids[0])],
+        "disliked_ids": [],
+        "top_k": 10
+    }
+
+    try:
+        response = requests.post(
+            f"{API_URL}/explore",
+            json=test_request,
+            headers={"Authorization": f"Bearer {API_TOKEN}"}
+        )
+
+        if response.status_code == 200:
+            data = response.json()
+            print(f"✅ Found {len(data['movies'])} similar movies")
+            print(f"Liked movie: {metadata[movie_ids[0]]['title']}")
+            print("Recommended movies:")
+            for movie in data['movies'][:3]:
+                print(f"  - {movie['title']} ({movie['year']}) - {movie['genres']}")
+        else:
+            print(f"❌ Error {response.status_code}: {response.text}")
+
+    except Exception as e:
+        print(f"❌ Connection error: {e}")
+        print("💡 Check that your API_TOKEN is correct in the .env")
+
+    # Test 2: Search with 2 liked movies
+    print("\n🎬 Test 2: Search with 2 liked movies")
+    test_request = {
+        "liked_ids": [int(movie_ids[0]), int(movie_ids[1])],
+        "disliked_ids": [],
+        "top_k": 10
+    }
+
+    try:
+        response = requests.post(
+            f"{API_URL}/explore",
+            json=test_request,
+            headers={"Authorization": f"Bearer {API_TOKEN}"}
+        )
+
+        if response.status_code == 200:
+            data = response.json()
+            print(f"✅ Found {len(data['movies'])} similar movies")
+            print(f"Liked movies: {metadata[movie_ids[0]]['title']}, {metadata[movie_ids[1]]['title']}")
+            print("Barycenter:", data['bary'])
+        else:
+            print(f"❌ Error {response.status_code}: {response.text}")
+
+    except Exception as e:
+        print(f"❌ Error: {e}")
+
+if __name__ == "__main__":
+    print("🧪 Testing the /explore API")
+    print("=" * 40)
+    test_explore_endpoint()
app/test_setup.py
ADDED
@@ -0,0 +1,121 @@
+"""
+Test script for TMDB data loading and embedding generation
+Run this to validate your setup before building the full index
+"""
+import os
+import sys
+import json
+from settings import get_settings
+from build_index import TMDBClient, create_composite_text, get_embeddings_batch
+from openai import OpenAI
+
+def test_tmdb_connection():
+    """Test TMDB API connection"""
+    print("🔍 Testing TMDB API connection...")
+
+    try:
+        settings = get_settings()
+        tmdb_client = TMDBClient(settings.tmdb_api_key)
+
+        # Test getting popular movies (just first page)
+        movie_ids = tmdb_client.get_popular_movies(max_pages=1)
+
+        if movie_ids:
+            print(f"✅ Successfully fetched {len(movie_ids)} movie IDs from TMDB")
+
+            # Test getting details for first movie
+            movie_data = tmdb_client.get_movie_details(movie_ids[0])
+            if movie_data:
+                print(f"✅ Successfully fetched details for movie: {movie_data.get('title', 'Unknown')}")
+
+                # Test getting credits
+                credits = tmdb_client.get_movie_credits(movie_ids[0])
+                if credits:
+                    print(f"✅ Successfully fetched credits (cast: {len(credits.get('cast', []))}, crew: {len(credits.get('crew', []))})")
+                else:
+                    print("⚠️ Could not fetch credits")
+
+                return movie_data, credits
+            else:
+                print("❌ Could not fetch movie details")
+        else:
+            print("❌ Could not fetch movie IDs")
+
+    except Exception as e:
+        print(f"❌ TMDB API error: {e}")
+
+    return None, None
+
+def test_composite_text(movie_data, credits):
+    """Test composite text creation"""
+    print("\n📝 Testing composite text creation...")
+
+    if movie_data:
+        # Add credits to movie data
+        if credits:
+            movie_data['credits'] = credits
+
+        composite_text = create_composite_text(movie_data)
+        print(f"✅ Generated composite text ({len(composite_text)} chars)")
+        print(f"Preview: {composite_text[:200]}...")
+        return composite_text
+    else:
+        print("❌ No movie data to test")
+        return None
+
+def test_embeddings(composite_text):
+    """Test embedding generation"""
+    print("\n🤖 Testing embedding generation...")
+
+    if composite_text:
+        try:
+            settings = get_settings()
+            openai_client = OpenAI(api_key=settings.openai_api_key)
+
+            embeddings = get_embeddings_batch([composite_text], openai_client)
+
+            if embeddings:
+                embedding = embeddings[0]
+                print(f"✅ Generated embedding (dimension: {len(embedding)})")
+                print(f"Sample values: {embedding[:5]}...")
+                return embedding
+            else:
+                print("❌ No embeddings generated")
+
+        except Exception as e:
+            print(f"❌ Embedding error: {e}")
+    else:
+        print("❌ No composite text to test")
+
+    return None
+
+def main():
+    """Run all tests"""
+    print("🎬 Karl Movie Vector Backend - Test Suite")
+    print("=" * 50)
+
+    # Test environment variables
+    print("🔧 Checking environment variables...")
+    try:
+        settings = get_settings()
+        print(f"✅ OpenAI API key: {'sk-...' + settings.openai_api_key[-10:] if settings.openai_api_key else 'Not set'}")
+        print(f"✅ TMDB API key: {'...' + settings.tmdb_api_key[-10:] if settings.tmdb_api_key else 'Not set'}")
+    except Exception as e:
+        print(f"❌ Settings error: {e}")
+        print("Make sure you have a .env file with OPENAI_API_KEY and TMDB_API_KEY")
+        return
+
+    # Run tests
+    movie_data, credits = test_tmdb_connection()
+    composite_text = test_composite_text(movie_data, credits)
+    embedding = test_embeddings(composite_text)
+
+    print("\n" + "=" * 50)
+    if movie_data and composite_text and embedding:
+        print("🎉 All tests passed! You can now run the full build:")
+        print("   python app/build_index.py --max-pages 3")
+    else:
+        print("❌ Some tests failed. Check your API keys and internet connection.")
+
+if __name__ == "__main__":
+    main()
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+numpy==1.24.4
+faiss-cpu==1.7.4
+openai==1.51.0
+pydantic==2.11.5
+pydantic-settings==2.9.1
+python-multipart==0.0.6
+requests==2.31.0
+scikit-learn==1.3.2
+python-dotenv==1.0.0
+httpx==0.27.0