Spaces:
Sleeping
Sleeping
Commit ·
7d87fe9
0
Parent(s):
1st commit
Browse files- .gitattributes +35 -0
- .gitignore +22 -0
- Dockerfile +46 -0
- README.md +123 -0
- models/best_model.joblib +3 -0
- pyproject.toml +50 -0
- src/__init__.py +0 -0
- src/api/__init__.py +0 -0
- src/api/main.py +106 -0
- src/api/routers/__init__.py +0 -0
- src/api/routers/predict.py +49 -0
- src/api/schemas/__init__.py +0 -0
- src/api/schemas/prediction.py +57 -0
- src/config/__init__.py +0 -0
- src/config/settings.py +119 -0
- src/dashboard/__init__.py +0 -0
- src/dashboard/app.py +765 -0
- src/ml/__init__.py +0 -0
- src/ml/predict.py +95 -0
- src/ml/preprocessing.py +134 -0
- src/ml/train.py +145 -0
- uv.lock +0 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.egg-info/
|
| 5 |
+
.venv/
|
| 6 |
+
|
| 7 |
+
# Environment
|
| 8 |
+
.env
|
| 9 |
+
|
| 10 |
+
# IDE
|
| 11 |
+
.vscode/
|
| 12 |
+
.idea/
|
| 13 |
+
.DS_Store
|
| 14 |
+
|
| 15 |
+
# Data
|
| 16 |
+
data/
|
| 17 |
+
|
| 18 |
+
# Claude Code
|
| 19 |
+
.claude/
|
| 20 |
+
|
| 21 |
+
# Jupyter
|
| 22 |
+
.ipynb_checkpoints/
|
Dockerfile
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Build stage
|
| 2 |
+
FROM python:3.11-slim AS builder
|
| 3 |
+
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install uv
|
| 7 |
+
RUN pip install --no-cache-dir uv
|
| 8 |
+
|
| 9 |
+
# Copy dependency files
|
| 10 |
+
COPY pyproject.toml uv.lock ./
|
| 11 |
+
|
| 12 |
+
# Install production dependencies only (no dev group = no mlflow)
|
| 13 |
+
RUN uv sync --no-group dev --frozen
|
| 14 |
+
|
| 15 |
+
# Production stage
|
| 16 |
+
FROM python:3.11-slim AS production
|
| 17 |
+
|
| 18 |
+
WORKDIR /app
|
| 19 |
+
|
| 20 |
+
# Create non-root user (HF Spaces requirement - uid 1000)
|
| 21 |
+
RUN useradd -m -u 1000 user
|
| 22 |
+
USER user
|
| 23 |
+
|
| 24 |
+
ENV HOME=/home/user \
|
| 25 |
+
PATH="/home/user/.local/bin:/app/.venv/bin:$PATH" \
|
| 26 |
+
PYTHONPATH="/app" \
|
| 27 |
+
ENVIRONMENT=production \
|
| 28 |
+
LOG_LEVEL=INFO
|
| 29 |
+
|
| 30 |
+
# Copy virtual environment from builder
|
| 31 |
+
COPY --from=builder --chown=user /app/.venv /app/.venv
|
| 32 |
+
|
| 33 |
+
# Copy application code (only API-relevant modules)
|
| 34 |
+
COPY --chown=user src/__init__.py ./src/__init__.py
|
| 35 |
+
COPY --chown=user src/api/ ./src/api/
|
| 36 |
+
COPY --chown=user src/ml/ ./src/ml/
|
| 37 |
+
COPY --chown=user src/config/ ./src/config/
|
| 38 |
+
|
| 39 |
+
# Copy trained model
|
| 40 |
+
COPY --chown=user models/ ./models/
|
| 41 |
+
|
| 42 |
+
# HF Spaces default port
|
| 43 |
+
EXPOSE 7860
|
| 44 |
+
|
| 45 |
+
# Run the API
|
| 46 |
+
CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Getaround Pricing API
|
| 3 |
+
colorFrom: blue
|
| 4 |
+
colorTo: green
|
| 5 |
+
sdk: docker
|
| 6 |
+
app_port: 7860
|
| 7 |
+
pinned: false
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# Getaround Pricing API
|
| 11 |
+
|
| 12 |
+
> ML-powered FastAPI service that predicts optimal daily rental prices for cars on the Getaround platform. Takes 13 car features as input, returns a predicted price in EUR.
|
| 13 |
+
|
| 14 |
+
## Demo
|
| 15 |
+
|
| 16 |
+
Live API: [https://sam-bot-getaround-api.hf.space](https://sam-bot-getaround-api.hf.space)
|
| 17 |
+
|
| 18 |
+
Swagger UI: [https://sam-bot-getaround-api.hf.space/docs](https://sam-bot-getaround-api.hf.space/docs)
|
| 19 |
+
|
| 20 |
+
## Key Results
|
| 21 |
+
|
| 22 |
+
**Endpoints:**
|
| 23 |
+
|
| 24 |
+
| Method | Path | Description |
|
| 25 |
+
|--------|------|-------------|
|
| 26 |
+
| POST | `/predict` | Predict rental prices for 1-50 cars |
|
| 27 |
+
| GET | `/health` | Health check with model status |
|
| 28 |
+
| GET | `/docs` | Swagger UI documentation |
|
| 29 |
+
|
| 30 |
+
**Model selection:** Three models trained (LinearRegression, RandomForestRegressor, GradientBoostingRegressor) with MLflow tracking. Best model selected by lowest RMSE (RandomForest: RMSE = 16.75, R² = 0.734), saved as a scikit-learn Pipeline (preprocessor + model).
|
| 31 |
+
|
| 32 |
+
## Tech Stack
|
| 33 |
+
|
| 34 |
+
| Category | Technology |
|
| 35 |
+
|----------|------------|
|
| 36 |
+
| Web Framework | FastAPI, Uvicorn |
|
| 37 |
+
| ML | scikit-learn, MLflow (training) |
|
| 38 |
+
| Data | pandas, numpy |
|
| 39 |
+
| Validation | Pydantic |
|
| 40 |
+
| Serialization | joblib, Git LFS (model storage) |
|
| 41 |
+
| Deployment | Docker |
|
| 42 |
+
| Package Manager | uv |
|
| 43 |
+
|
| 44 |
+
## Installation
|
| 45 |
+
|
| 46 |
+
The model file (`models/best_model.joblib`, ~32MB) is tracked with Git LFS. Pull it before running:
|
| 47 |
+
|
| 48 |
+
```bash
|
| 49 |
+
git clone <repo-url>
|
| 50 |
+
cd getarround_api
|
| 51 |
+
git lfs install
|
| 52 |
+
git lfs pull
|
| 53 |
+
uv sync
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
## Usage
|
| 57 |
+
|
| 58 |
+
**Run locally:**
|
| 59 |
+
|
| 60 |
+
```bash
|
| 61 |
+
uv run uvicorn src.api.main:app --host 0.0.0.0 --port 8000 --reload
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
**Run with Docker:**
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
docker build -t getarround-api .
|
| 68 |
+
docker run -p 7860:7860 getarround-api
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
**Example request:**
|
| 72 |
+
|
| 73 |
+
```bash
|
| 74 |
+
curl -X POST http://localhost:8000/predict \
|
| 75 |
+
-H "Content-Type: application/json" \
|
| 76 |
+
-d '{
|
| 77 |
+
"cars": [{
|
| 78 |
+
"model_key": "Peugeot",
|
| 79 |
+
"mileage": 50000,
|
| 80 |
+
"engine_power": 120,
|
| 81 |
+
"fuel": "diesel",
|
| 82 |
+
"paint_color": "black",
|
| 83 |
+
"car_type": "sedan",
|
| 84 |
+
"private_parking_available": true,
|
| 85 |
+
"has_gps": true,
|
| 86 |
+
"has_air_conditioning": true,
|
| 87 |
+
"automatic_car": false,
|
| 88 |
+
"has_getaround_connect": false,
|
| 89 |
+
"has_speed_regulator": true,
|
| 90 |
+
"winter_tires": false
|
| 91 |
+
}]
|
| 92 |
+
}'
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
**Example response:**
|
| 96 |
+
|
| 97 |
+
```json
|
| 98 |
+
{
|
| 99 |
+
"prediction": [124]
|
| 100 |
+
}
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
## Data
|
| 104 |
+
|
| 105 |
+
**Model file:** `models/best_model.joblib` (~32MB, tracked via Git LFS). Contains a full scikit-learn Pipeline (preprocessing + trained estimator).
|
| 106 |
+
|
| 107 |
+
**Input features (13):**
|
| 108 |
+
|
| 109 |
+
| Feature | Type | Description |
|
| 110 |
+
|---------|------|-------------|
|
| 111 |
+
| `model_key` | string | Car brand (e.g., "Peugeot", "BMW", "Citroen") |
|
| 112 |
+
| `mileage` | int | Mileage in km |
|
| 113 |
+
| `engine_power` | int | Engine power in HP |
|
| 114 |
+
| `fuel` | string | Fuel type ("diesel", "petrol", "hybrid_petrol", "electro") |
|
| 115 |
+
| `paint_color` | string | Car color |
|
| 116 |
+
| `car_type` | string | Car type ("sedan", "hatchback", "suv", "van", etc.) |
|
| 117 |
+
| `private_parking_available` | bool | Has private parking |
|
| 118 |
+
| `has_gps` | bool | Has GPS |
|
| 119 |
+
| `has_air_conditioning` | bool | Has air conditioning |
|
| 120 |
+
| `automatic_car` | bool | Automatic transmission |
|
| 121 |
+
| `has_getaround_connect` | bool | Has Getaround Connect |
|
| 122 |
+
| `has_speed_regulator` | bool | Has cruise control |
|
| 123 |
+
| `winter_tires` | bool | Has winter tires |
|
models/best_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3645ccff2cb16af002a9fc6634c17bd517f463e261f30050ef62af3b58204d4
|
| 3 |
+
size 32840695
|
pyproject.toml
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "getarround"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Getaround delay analysis dashboard and pricing optimization API"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.11,<3.12"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"fastapi~=0.115.0",
|
| 9 |
+
"uvicorn[standard]~=0.30.0",
|
| 10 |
+
"streamlit~=1.40.0",
|
| 11 |
+
"scikit-learn~=1.5.0",
|
| 12 |
+
"pandas~=2.2.0",
|
| 13 |
+
"openpyxl~=3.1.0",
|
| 14 |
+
"pydantic~=2.10.0",
|
| 15 |
+
"pydantic-settings~=2.10.0",
|
| 16 |
+
"numpy~=1.26.0",
|
| 17 |
+
"plotly~=5.24.0",
|
| 18 |
+
"httpx~=0.27.0",
|
| 19 |
+
]
|
| 20 |
+
|
| 21 |
+
[dependency-groups]
|
| 22 |
+
dev = [
|
| 23 |
+
"ruff",
|
| 24 |
+
"pytest",
|
| 25 |
+
"pytest-cov",
|
| 26 |
+
"pre-commit",
|
| 27 |
+
"jupyter",
|
| 28 |
+
"ipykernel",
|
| 29 |
+
"factory-boy>=3.3.3",
|
| 30 |
+
"mlflow~=2.19.0",
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
[tool.ruff]
|
| 34 |
+
line-length = 88
|
| 35 |
+
target-version = "py311"
|
| 36 |
+
|
| 37 |
+
[tool.ruff.lint]
|
| 38 |
+
select = ["E", "F", "I", "W"]
|
| 39 |
+
|
| 40 |
+
[tool.ruff.format]
|
| 41 |
+
quote-style = "double"
|
| 42 |
+
indent-style = "space"
|
| 43 |
+
|
| 44 |
+
[tool.pytest.ini_options]
|
| 45 |
+
testpaths = ["tests"]
|
| 46 |
+
python_files = ["test_*.py"]
|
| 47 |
+
addopts = "-v"
|
| 48 |
+
markers = [
|
| 49 |
+
"integration: integration tests requiring model file and data",
|
| 50 |
+
]
|
src/__init__.py
ADDED
|
File without changes
|
src/api/__init__.py
ADDED
|
File without changes
|
src/api/main.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application for Getaround pricing API."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from contextlib import asynccontextmanager
|
| 5 |
+
|
| 6 |
+
from fastapi import FastAPI
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
|
| 9 |
+
from src.api.routers.predict import router as predict_router
|
| 10 |
+
from src.config.settings import configure_logging, get_settings
|
| 11 |
+
from src.ml.predict import get_predictor
|
| 12 |
+
|
| 13 |
+
settings = get_settings()
|
| 14 |
+
configure_logging(settings)
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@asynccontextmanager
|
| 20 |
+
async def lifespan(app: FastAPI):
|
| 21 |
+
"""Manage application startup and shutdown.
|
| 22 |
+
|
| 23 |
+
Args:
|
| 24 |
+
app: The FastAPI application instance.
|
| 25 |
+
"""
|
| 26 |
+
# Startup: preload model
|
| 27 |
+
logger.info("Starting up - preloading model")
|
| 28 |
+
try:
|
| 29 |
+
get_predictor()
|
| 30 |
+
logger.info("Model loaded successfully")
|
| 31 |
+
except Exception as e:
|
| 32 |
+
logger.warning("Model not available at startup: %s", e)
|
| 33 |
+
yield
|
| 34 |
+
# Shutdown
|
| 35 |
+
logger.info("Shutting down")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
app = FastAPI(
|
| 39 |
+
title="Getaround Pricing API",
|
| 40 |
+
description="""
|
| 41 |
+
API for predicting optimal rental prices for cars.
|
| 42 |
+
|
| 43 |
+
## Endpoints
|
| 44 |
+
|
| 45 |
+
- **POST /predict**: Predict rental prices based on car features
|
| 46 |
+
- **GET /health**: Health check endpoint
|
| 47 |
+
|
| 48 |
+
## Usage
|
| 49 |
+
|
| 50 |
+
Send a POST request to `/predict` with car features:
|
| 51 |
+
|
| 52 |
+
```json
|
| 53 |
+
{
|
| 54 |
+
"cars": [{
|
| 55 |
+
"model_key": "Citroen",
|
| 56 |
+
"mileage": 100000,
|
| 57 |
+
"engine_power": 120,
|
| 58 |
+
"fuel": "diesel",
|
| 59 |
+
"paint_color": "black",
|
| 60 |
+
"car_type": "sedan",
|
| 61 |
+
"private_parking_available": true,
|
| 62 |
+
"has_gps": true,
|
| 63 |
+
"has_air_conditioning": true,
|
| 64 |
+
"automatic_car": false,
|
| 65 |
+
"has_getaround_connect": false,
|
| 66 |
+
"has_speed_regulator": true,
|
| 67 |
+
"winter_tires": false
|
| 68 |
+
}]
|
| 69 |
+
}
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
Response:
|
| 73 |
+
```json
|
| 74 |
+
{
|
| 75 |
+
"prediction": [124]
|
| 76 |
+
}
|
| 77 |
+
```
|
| 78 |
+
""",
|
| 79 |
+
version="1.0.0",
|
| 80 |
+
docs_url="/docs",
|
| 81 |
+
redoc_url="/redoc",
|
| 82 |
+
lifespan=lifespan,
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
app.add_middleware(
|
| 86 |
+
CORSMiddleware,
|
| 87 |
+
allow_origins=["*"],
|
| 88 |
+
allow_credentials=True,
|
| 89 |
+
allow_methods=["*"],
|
| 90 |
+
allow_headers=["*"],
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
app.include_router(predict_router)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@app.get("/health", tags=["health"])
|
| 97 |
+
async def health_check() -> dict:
|
| 98 |
+
"""Health check endpoint.
|
| 99 |
+
|
| 100 |
+
Returns:
|
| 101 |
+
Status dictionary with model availability.
|
| 102 |
+
"""
|
| 103 |
+
from src.ml.predict import _predictor_instance
|
| 104 |
+
|
| 105 |
+
model_loaded = _predictor_instance is not None
|
| 106 |
+
return {"status": "healthy", "model_loaded": model_loaded}
|
src/api/routers/__init__.py
ADDED
|
File without changes
|
src/api/routers/predict.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Prediction router for pricing API."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
|
| 5 |
+
from fastapi import APIRouter, HTTPException
|
| 6 |
+
|
| 7 |
+
from src.api.schemas.prediction import PredictionInput, PredictionOutput
|
| 8 |
+
from src.ml.predict import ModelNotFoundError, get_predictor
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
router = APIRouter(tags=["prediction"])
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@router.post(
|
| 16 |
+
"/predict",
|
| 17 |
+
response_model=PredictionOutput,
|
| 18 |
+
summary="Predict rental prices",
|
| 19 |
+
description="Predict optimal rental prices for cars based on their features.",
|
| 20 |
+
)
|
| 21 |
+
async def predict(data: PredictionInput) -> PredictionOutput:
|
| 22 |
+
"""Predict rental prices from car features.
|
| 23 |
+
|
| 24 |
+
Args:
|
| 25 |
+
data: Input containing list of car features.
|
| 26 |
+
|
| 27 |
+
Returns:
|
| 28 |
+
Predictions with list of predicted prices in EUR.
|
| 29 |
+
|
| 30 |
+
Raises:
|
| 31 |
+
HTTPException: If model is not available or prediction fails.
|
| 32 |
+
"""
|
| 33 |
+
logger.info("Received prediction request for %d cars", len(data.cars))
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
predictor = get_predictor()
|
| 37 |
+
cars_dict = [car.model_dump() for car in data.cars]
|
| 38 |
+
predictions = predictor.predict_from_features(cars_dict)
|
| 39 |
+
logger.info("Predictions completed: %s", predictions)
|
| 40 |
+
return PredictionOutput(prediction=predictions)
|
| 41 |
+
except ModelNotFoundError as e:
|
| 42 |
+
logger.error("Model not found: %s", e)
|
| 43 |
+
raise HTTPException(status_code=503, detail="Model not available") from e
|
| 44 |
+
except Exception as e:
|
| 45 |
+
logger.error("Prediction failed: %s", e)
|
| 46 |
+
raise HTTPException(
|
| 47 |
+
status_code=500,
|
| 48 |
+
detail="Internal prediction error",
|
| 49 |
+
) from e
|
src/api/schemas/__init__.py
ADDED
|
File without changes
|
src/api/schemas/prediction.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic schemas for prediction API."""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class CarFeatures(BaseModel):
|
| 7 |
+
"""Features for a single car rental price prediction.
|
| 8 |
+
|
| 9 |
+
Attributes:
|
| 10 |
+
model_key: Car brand/model (e.g., "Citroen", "Peugeot", "BMW").
|
| 11 |
+
mileage: Car mileage in kilometers.
|
| 12 |
+
engine_power: Engine power in horsepower.
|
| 13 |
+
fuel: Fuel type ("diesel", "petrol", "hybrid_petrol", "electro").
|
| 14 |
+
paint_color: Car color.
|
| 15 |
+
car_type: Type of car ("sedan", "hatchback", "suv", "van", etc.).
|
| 16 |
+
private_parking_available: Has private parking.
|
| 17 |
+
has_gps: Has GPS.
|
| 18 |
+
has_air_conditioning: Has air conditioning.
|
| 19 |
+
automatic_car: Is automatic transmission.
|
| 20 |
+
has_getaround_connect: Has Getaround Connect feature.
|
| 21 |
+
has_speed_regulator: Has speed regulator/cruise control.
|
| 22 |
+
winter_tires: Has winter tires.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
model_key: str = Field(..., examples=["Citroen"])
|
| 26 |
+
mileage: int = Field(..., ge=0, examples=[100000])
|
| 27 |
+
engine_power: int = Field(..., ge=0, examples=[120])
|
| 28 |
+
fuel: str = Field(..., examples=["diesel"])
|
| 29 |
+
paint_color: str = Field(..., examples=["black"])
|
| 30 |
+
car_type: str = Field(..., examples=["sedan"])
|
| 31 |
+
private_parking_available: bool = Field(default=False)
|
| 32 |
+
has_gps: bool = Field(default=False)
|
| 33 |
+
has_air_conditioning: bool = Field(default=False)
|
| 34 |
+
automatic_car: bool = Field(default=False)
|
| 35 |
+
has_getaround_connect: bool = Field(default=False)
|
| 36 |
+
has_speed_regulator: bool = Field(default=False)
|
| 37 |
+
winter_tires: bool = Field(default=False)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class PredictionInput(BaseModel):
|
| 41 |
+
"""Input schema for /predict endpoint."""
|
| 42 |
+
|
| 43 |
+
cars: list[CarFeatures] = Field(
|
| 44 |
+
...,
|
| 45 |
+
description="List of cars to predict prices for",
|
| 46 |
+
min_length=1,
|
| 47 |
+
max_length=50,
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class PredictionOutput(BaseModel):
|
| 52 |
+
"""Output schema for /predict endpoint."""
|
| 53 |
+
|
| 54 |
+
prediction: list[int] = Field(
|
| 55 |
+
...,
|
| 56 |
+
description="List of predicted rental prices per day in EUR",
|
| 57 |
+
)
|
src/config/__init__.py
ADDED
|
File without changes
|
src/config/settings.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Application settings using Pydantic Settings."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from enum import Enum
|
| 5 |
+
from functools import lru_cache
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Environment(str, Enum):
|
| 12 |
+
"""Application environment."""
|
| 13 |
+
|
| 14 |
+
DEVELOPMENT = "development"
|
| 15 |
+
TESTING = "testing"
|
| 16 |
+
PRODUCTION = "production"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class Settings(BaseSettings):
|
| 20 |
+
"""Application settings loaded from environment variables.
|
| 21 |
+
|
| 22 |
+
Settings are loaded from .env files and environment variables.
|
| 23 |
+
Environment variables take precedence over .env file values.
|
| 24 |
+
|
| 25 |
+
Attributes:
|
| 26 |
+
environment: Current environment (development, testing, production).
|
| 27 |
+
log_level: Logging level as string (DEBUG, INFO, WARNING, ERROR).
|
| 28 |
+
api_url: Base URL for the pricing prediction API.
|
| 29 |
+
api_host: Host for the FastAPI server.
|
| 30 |
+
api_port: Port for the FastAPI server.
|
| 31 |
+
dashboard_port: Port for the Streamlit dashboard.
|
| 32 |
+
mlflow_tracking_uri: URI for MLflow tracking server.
|
| 33 |
+
data_dir: Directory for data files.
|
| 34 |
+
models_dir: Directory for serialized models.
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
model_config = SettingsConfigDict(
|
| 38 |
+
env_file=".env",
|
| 39 |
+
env_file_encoding="utf-8",
|
| 40 |
+
case_sensitive=False,
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
# Environment
|
| 44 |
+
environment: Environment = Environment.DEVELOPMENT
|
| 45 |
+
|
| 46 |
+
# Logging
|
| 47 |
+
log_level: str = "DEBUG"
|
| 48 |
+
|
| 49 |
+
# API
|
| 50 |
+
api_url: str = "http://localhost:8000"
|
| 51 |
+
api_host: str = "0.0.0.0"
|
| 52 |
+
api_port: int = 8000
|
| 53 |
+
|
| 54 |
+
# Dashboard
|
| 55 |
+
dashboard_port: int = 8501
|
| 56 |
+
|
| 57 |
+
# MLflow
|
| 58 |
+
mlflow_tracking_uri: str = "./mlruns"
|
| 59 |
+
|
| 60 |
+
# Paths
|
| 61 |
+
data_dir: Path = Path("data")
|
| 62 |
+
models_dir: Path = Path("models")
|
| 63 |
+
|
| 64 |
+
@property
|
| 65 |
+
def log_level_int(self) -> int:
|
| 66 |
+
"""Get logging level as integer.
|
| 67 |
+
|
| 68 |
+
Returns:
|
| 69 |
+
Logging level constant (e.g., logging.DEBUG, logging.INFO).
|
| 70 |
+
"""
|
| 71 |
+
return getattr(logging, self.log_level.upper(), logging.INFO)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# Logging config by environment (from ORCHESTRATION.md)
|
| 75 |
+
LOGGING_CONFIG: dict[Environment, dict[str, int | str]] = {
|
| 76 |
+
Environment.DEVELOPMENT: {
|
| 77 |
+
"level": logging.DEBUG,
|
| 78 |
+
"format": "%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s",
|
| 79 |
+
},
|
| 80 |
+
Environment.TESTING: {
|
| 81 |
+
"level": logging.WARNING,
|
| 82 |
+
"format": "%(levelname)s - %(message)s",
|
| 83 |
+
},
|
| 84 |
+
Environment.PRODUCTION: {
|
| 85 |
+
"level": logging.INFO,
|
| 86 |
+
"format": "%(asctime)s - %(levelname)s - %(message)s",
|
| 87 |
+
},
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def configure_logging(settings: Settings) -> None:
|
| 92 |
+
"""Configure logging based on environment.
|
| 93 |
+
|
| 94 |
+
Sets up the root logger with appropriate level and format based on
|
| 95 |
+
the current environment.
|
| 96 |
+
|
| 97 |
+
Args:
|
| 98 |
+
settings: Application settings instance.
|
| 99 |
+
"""
|
| 100 |
+
config = LOGGING_CONFIG.get(
|
| 101 |
+
settings.environment, LOGGING_CONFIG[Environment.DEVELOPMENT]
|
| 102 |
+
)
|
| 103 |
+
logging.basicConfig(
|
| 104 |
+
level=config["level"],
|
| 105 |
+
format=config["format"],
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
@lru_cache
|
| 110 |
+
def get_settings() -> Settings:
|
| 111 |
+
"""Get cached settings instance.
|
| 112 |
+
|
| 113 |
+
Uses lru_cache to ensure settings are loaded only once and reused
|
| 114 |
+
across the application.
|
| 115 |
+
|
| 116 |
+
Returns:
|
| 117 |
+
Cached Settings instance.
|
| 118 |
+
"""
|
| 119 |
+
return Settings()
|
src/dashboard/__init__.py
ADDED
|
File without changes
|
src/dashboard/app.py
ADDED
|
@@ -0,0 +1,765 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Getaround Delay Analysis Dashboard.
|
| 2 |
+
|
| 3 |
+
Interactive dashboard to help PM decide on minimum delay threshold between rentals.
|
| 4 |
+
Answers key questions about delay impact and revenue implications.
|
| 5 |
+
Includes a pricing prediction section powered by the ML API.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
import httpx
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import plotly.express as px
|
| 14 |
+
import plotly.graph_objects as go
|
| 15 |
+
import streamlit as st
|
| 16 |
+
from plotly.subplots import make_subplots
|
| 17 |
+
|
| 18 |
+
from src.config.settings import get_settings
|
| 19 |
+
|
| 20 |
+
# Setup
|
| 21 |
+
DATA_PATH = (
|
| 22 |
+
Path(__file__).parent.parent.parent / "data" / "get_around_delay_analysis.csv"
|
| 23 |
+
)
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
# Page config
|
| 27 |
+
st.set_page_config(
|
| 28 |
+
page_title="Getaround Delay Analysis",
|
| 29 |
+
page_icon="GA",
|
| 30 |
+
layout="wide",
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
st.title("Getaround Delay Analysis Dashboard")
|
| 34 |
+
st.markdown("*Helping PM decide on minimum delay threshold between rentals*")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@st.cache_data
def load_data() -> pd.DataFrame:
    """Load and cache the delay analysis data.

    Cached by Streamlit so the CSV is read from disk once per session
    rather than on every widget interaction / script rerun.

    Returns:
        DataFrame with rental delay data.
    """
    # DATA_PATH is the module-level constant pointing at
    # data/get_around_delay_analysis.csv; FileNotFoundError is handled
    # by the caller (top-level try/except around load_data()).
    return pd.read_csv(DATA_PATH)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@st.cache_data
def prepare_data(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Derive the analysis datasets from the raw rental table.

    Args:
        df: Raw rental data as loaded from the CSV.

    Returns:
        Tuple of (ended_rentals, consecutive_rentals, problematic_consecutive),
        where the last element is the subset of consecutive rentals whose
        effective gap with the previous rental was negative.
    """
    # Completed rentals, flagged when the checkout happened after the due time.
    ended = df[df["state"] == "ended"].copy()
    ended["is_late"] = ended["delay_at_checkout_in_minutes"] > 0

    # Attach the checkout delay of the previous rental on the same car.
    previous = df[["rental_id", "delay_at_checkout_in_minutes"]].rename(
        columns={
            "rental_id": "previous_ended_rental_id",
            "delay_at_checkout_in_minutes": "previous_delay",
        }
    )
    consecutive = df[df["previous_ended_rental_id"].notna()].copy()
    consecutive = consecutive.merge(
        previous, on="previous_ended_rental_id", how="left"
    )

    # Effective buffer left for the next driver; a missing previous delay
    # counts as on-time (0). Negative buffer means the driver had to wait.
    consecutive["effective_time"] = (
        consecutive["time_delta_with_previous_rental_in_minutes"]
        - consecutive["previous_delay"].fillna(0)
    )
    consecutive["is_problematic"] = consecutive["effective_time"] < 0

    return ended, consecutive, consecutive[consecutive["is_problematic"]]
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def simulate_threshold(data: pd.DataFrame, threshold_minutes: int, scope: str) -> dict:
    """Simulate enforcing a minimum gap between consecutive rentals.

    Args:
        data: Consecutive-rental rows carrying ``checkin_type``,
            ``time_delta_with_previous_rental_in_minutes`` and
            ``is_problematic`` columns.
        threshold_minutes: Minimum required gap between rentals.
        scope: 'All rentals', 'Connect only', or 'Mobile only'.

    Returns:
        Dict with counts and percentages describing the threshold's impact.
    """
    # Map the UI scope label to a checkin_type filter; None means no filter.
    scope_filter = {"Connect only": "connect", "Mobile only": "mobile"}.get(scope)
    if scope_filter is None:
        subset = data.copy()
    else:
        subset = data[data["checkin_type"] == scope_filter].copy()

    n_subset = len(subset)
    if n_subset == 0:
        # Nothing to simulate for this scope.
        return {
            "affected_rentals": 0,
            "affected_pct": 0.0,
            "total_problematic": 0,
            "solved_cases": 0,
            "solved_pct": 0.0,
            "unsolved_cases": 0,
        }

    # Rentals whose scheduled gap falls below the proposed threshold would
    # have been blocked/rescheduled.
    gap = subset["time_delta_with_previous_rental_in_minutes"]
    n_affected = int((gap < threshold_minutes).sum())

    # Among the actually problematic cases, those with a scheduled gap below
    # the threshold would have been prevented ("solved").
    problematic = subset[subset["is_problematic"]]
    n_problematic = len(problematic)
    n_solved = int(
        (
            problematic["time_delta_with_previous_rental_in_minutes"]
            < threshold_minutes
        ).sum()
    )

    return {
        "affected_rentals": n_affected,
        "affected_pct": n_affected / n_subset * 100,
        "total_problematic": n_problematic,
        "solved_cases": n_solved,
        "solved_pct": n_solved / n_problematic * 100 if n_problematic > 0 else 0,
        "unsolved_cases": n_problematic - n_solved,
    }
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
# Load data
|
| 134 |
+
try:
|
| 135 |
+
df = load_data()
|
| 136 |
+
except FileNotFoundError:
|
| 137 |
+
st.error("Data file not found. Ensure data/get_around_delay_analysis.csv exists.")
|
| 138 |
+
st.stop()
|
| 139 |
+
except Exception as e:
|
| 140 |
+
st.error(f"Failed to load data: {e}")
|
| 141 |
+
st.stop()
|
| 142 |
+
|
| 143 |
+
ended, consecutive, problematic_cases = prepare_data(df)
|
| 144 |
+
|
| 145 |
+
# Sidebar - Controls
|
| 146 |
+
st.sidebar.header("Configuration")
|
| 147 |
+
scope = st.sidebar.selectbox(
|
| 148 |
+
"Scope",
|
| 149 |
+
options=["All rentals", "Connect only", "Mobile only"],
|
| 150 |
+
help="Which rental types to include in analysis",
|
| 151 |
+
)
|
| 152 |
+
threshold_minutes = st.sidebar.slider(
|
| 153 |
+
"Minimum delay threshold (minutes)",
|
| 154 |
+
min_value=0,
|
| 155 |
+
max_value=720,
|
| 156 |
+
value=60,
|
| 157 |
+
step=15,
|
| 158 |
+
help="Proposed minimum time between consecutive rentals",
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
# Filter data based on scope
|
| 162 |
+
if scope == "Connect only":
|
| 163 |
+
df_filtered = df[df["checkin_type"] == "connect"]
|
| 164 |
+
ended_filtered = ended[ended["checkin_type"] == "connect"]
|
| 165 |
+
consecutive_filtered = consecutive[consecutive["checkin_type"] == "connect"]
|
| 166 |
+
elif scope == "Mobile only":
|
| 167 |
+
df_filtered = df[df["checkin_type"] == "mobile"]
|
| 168 |
+
ended_filtered = ended[ended["checkin_type"] == "mobile"]
|
| 169 |
+
consecutive_filtered = consecutive[consecutive["checkin_type"] == "mobile"]
|
| 170 |
+
else:
|
| 171 |
+
df_filtered = df
|
| 172 |
+
ended_filtered = ended
|
| 173 |
+
consecutive_filtered = consecutive
|
| 174 |
+
|
| 175 |
+
# =============================================================================
|
| 176 |
+
# Section 1: Overview
|
| 177 |
+
# =============================================================================
|
| 178 |
+
st.header("1. Dataset Overview")
|
| 179 |
+
|
| 180 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 181 |
+
|
| 182 |
+
with col1:
|
| 183 |
+
st.metric("Total Rentals", f"{len(df_filtered):,}")
|
| 184 |
+
|
| 185 |
+
with col2:
|
| 186 |
+
ended_count = len(df_filtered[df_filtered["state"] == "ended"])
|
| 187 |
+
ended_pct = ended_count / len(df_filtered) * 100 if len(df_filtered) > 0 else 0
|
| 188 |
+
st.metric("Completed", f"{ended_count:,}", f"{ended_pct:.1f}%")
|
| 189 |
+
|
| 190 |
+
with col3:
|
| 191 |
+
canceled_count = len(df_filtered[df_filtered["state"] == "canceled"])
|
| 192 |
+
canceled_pct = (
|
| 193 |
+
canceled_count / len(df_filtered) * 100 if len(df_filtered) > 0 else 0
|
| 194 |
+
)
|
| 195 |
+
st.metric("Canceled", f"{canceled_count:,}", f"{canceled_pct:.1f}%")
|
| 196 |
+
|
| 197 |
+
with col4:
|
| 198 |
+
consecutive_count = len(consecutive_filtered)
|
| 199 |
+
consecutive_pct = (
|
| 200 |
+
consecutive_count / len(df_filtered) * 100 if len(df_filtered) > 0 else 0
|
| 201 |
+
)
|
| 202 |
+
st.metric(
|
| 203 |
+
"Consecutive Rentals", f"{consecutive_count:,}", f"{consecutive_pct:.1f}%"
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
# Rental type distribution chart
|
| 207 |
+
if scope == "All rentals":
|
| 208 |
+
col_chart1, col_chart2 = st.columns(2)
|
| 209 |
+
|
| 210 |
+
with col_chart1:
|
| 211 |
+
type_counts = df["checkin_type"].value_counts()
|
| 212 |
+
fig_type = px.pie(
|
| 213 |
+
values=type_counts.values,
|
| 214 |
+
names=type_counts.index,
|
| 215 |
+
title="Rentals by Checkin Type",
|
| 216 |
+
hole=0.4,
|
| 217 |
+
)
|
| 218 |
+
fig_type.update_traces(textinfo="label+percent")
|
| 219 |
+
st.plotly_chart(fig_type, use_container_width=True)
|
| 220 |
+
|
| 221 |
+
with col_chart2:
|
| 222 |
+
state_counts = df["state"].value_counts()
|
| 223 |
+
fig_state = px.pie(
|
| 224 |
+
values=state_counts.values,
|
| 225 |
+
names=state_counts.index,
|
| 226 |
+
title="Rentals by State",
|
| 227 |
+
hole=0.4,
|
| 228 |
+
)
|
| 229 |
+
fig_state.update_traces(textinfo="label+percent")
|
| 230 |
+
st.plotly_chart(fig_state, use_container_width=True)
|
| 231 |
+
|
| 232 |
+
# =============================================================================
|
| 233 |
+
# Section 2: Late Returns Analysis
|
| 234 |
+
# =============================================================================
|
| 235 |
+
st.header("2. Late Returns Analysis")
|
| 236 |
+
|
| 237 |
+
# Late return statistics
|
| 238 |
+
delay_data = ended_filtered["delay_at_checkout_in_minutes"].dropna()
|
| 239 |
+
late_count = (delay_data > 0).sum()
|
| 240 |
+
on_time_count = (delay_data <= 0).sum()
|
| 241 |
+
late_pct = late_count / len(delay_data) * 100 if len(delay_data) > 0 else 0
|
| 242 |
+
|
| 243 |
+
col1, col2, col3 = st.columns(3)
|
| 244 |
+
|
| 245 |
+
with col1:
|
| 246 |
+
st.metric("Late Returns", f"{late_count:,}", f"{late_pct:.1f}%")
|
| 247 |
+
|
| 248 |
+
with col2:
|
| 249 |
+
median_delay = delay_data.median() if len(delay_data) > 0 else 0
|
| 250 |
+
st.metric("Median Delay", f"{median_delay:.0f} min")
|
| 251 |
+
|
| 252 |
+
with col3:
|
| 253 |
+
mean_delay = delay_data.mean() if len(delay_data) > 0 else 0
|
| 254 |
+
st.metric("Mean Delay", f"{mean_delay:.0f} min")
|
| 255 |
+
|
| 256 |
+
# Visualizations
|
| 257 |
+
col_viz1, col_viz2 = st.columns(2)
|
| 258 |
+
|
| 259 |
+
with col_viz1:
|
| 260 |
+
# Pie chart: on-time vs late
|
| 261 |
+
fig_late = px.pie(
|
| 262 |
+
values=[on_time_count, late_count],
|
| 263 |
+
names=["On time or early", "Late"],
|
| 264 |
+
title="Return Timing Distribution",
|
| 265 |
+
hole=0.4,
|
| 266 |
+
color_discrete_sequence=["#2ecc71", "#e74c3c"],
|
| 267 |
+
)
|
| 268 |
+
fig_late.update_traces(textinfo="label+percent")
|
| 269 |
+
st.plotly_chart(fig_late, use_container_width=True)
|
| 270 |
+
|
| 271 |
+
with col_viz2:
|
| 272 |
+
# Histogram of delay distribution
|
| 273 |
+
delay_capped = delay_data.clip(-120, 360)
|
| 274 |
+
fig_hist = px.histogram(
|
| 275 |
+
delay_capped,
|
| 276 |
+
nbins=50,
|
| 277 |
+
title="Delay Distribution (capped at -2h to +6h)",
|
| 278 |
+
labels={"value": "Delay (minutes)", "count": "Number of rentals"},
|
| 279 |
+
)
|
| 280 |
+
fig_hist.add_vline(x=0, line_dash="dash", line_color="red")
|
| 281 |
+
fig_hist.update_layout(showlegend=False)
|
| 282 |
+
st.plotly_chart(fig_hist, use_container_width=True)
|
| 283 |
+
|
| 284 |
+
# Stats by rental type (only if showing all)
|
| 285 |
+
if scope == "All rentals":
|
| 286 |
+
st.subheader("Late Returns by Checkin Type")
|
| 287 |
+
|
| 288 |
+
late_by_type = ended.groupby("checkin_type").agg(
|
| 289 |
+
total=("rental_id", "count"),
|
| 290 |
+
late_count=("is_late", "sum"),
|
| 291 |
+
mean_delay=("delay_at_checkout_in_minutes", "mean"),
|
| 292 |
+
median_delay=("delay_at_checkout_in_minutes", "median"),
|
| 293 |
+
)
|
| 294 |
+
late_by_type["late_rate"] = late_by_type["late_count"] / late_by_type["total"] * 100
|
| 295 |
+
late_by_type = late_by_type.round(1)
|
| 296 |
+
|
| 297 |
+
st.dataframe(
|
| 298 |
+
late_by_type[
|
| 299 |
+
["total", "late_count", "late_rate", "mean_delay", "median_delay"]
|
| 300 |
+
],
|
| 301 |
+
use_container_width=True,
|
| 302 |
+
)
|
| 303 |
+
|
| 304 |
+
# =============================================================================
|
| 305 |
+
# Section 3: Impact on Next Rental
|
| 306 |
+
# =============================================================================
|
| 307 |
+
st.header("3. Impact on Next Driver")
|
| 308 |
+
|
| 309 |
+
st.markdown(
|
| 310 |
+
"""
|
| 311 |
+
Analysis of consecutive rentals on the same car
|
| 312 |
+
and how delays affect the next driver.
|
| 313 |
+
"""
|
| 314 |
+
)
|
| 315 |
+
|
| 316 |
+
# Key metrics
|
| 317 |
+
problematic_in_scope = consecutive_filtered[consecutive_filtered["is_problematic"]]
|
| 318 |
+
prob_count = len(problematic_in_scope)
|
| 319 |
+
prob_pct = (
|
| 320 |
+
prob_count / len(consecutive_filtered) * 100 if len(consecutive_filtered) > 0 else 0
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
col1, col2, col3 = st.columns(3)
|
| 324 |
+
|
| 325 |
+
with col1:
|
| 326 |
+
st.metric(
|
| 327 |
+
"Consecutive Rentals",
|
| 328 |
+
f"{len(consecutive_filtered):,}",
|
| 329 |
+
f"{len(consecutive_filtered) / len(df_filtered) * 100:.1f}% of total"
|
| 330 |
+
if len(df_filtered) > 0
|
| 331 |
+
else "0%",
|
| 332 |
+
)
|
| 333 |
+
|
| 334 |
+
with col2:
|
| 335 |
+
time_delta = consecutive_filtered["time_delta_with_previous_rental_in_minutes"]
|
| 336 |
+
median_gap = time_delta.median() if len(time_delta) > 0 else 0
|
| 337 |
+
st.metric("Median Gap Between Rentals", f"{median_gap:.0f} min")
|
| 338 |
+
|
| 339 |
+
with col3:
|
| 340 |
+
st.metric(
|
| 341 |
+
"Problematic Cases",
|
| 342 |
+
f"{prob_count:,}",
|
| 343 |
+
f"{prob_pct:.1f}% of consecutive",
|
| 344 |
+
)
|
| 345 |
+
|
| 346 |
+
# Time between consecutive rentals
|
| 347 |
+
col_viz1, col_viz2 = st.columns(2)
|
| 348 |
+
|
| 349 |
+
with col_viz1:
|
| 350 |
+
fig_delta = px.histogram(
|
| 351 |
+
time_delta.clip(0, 720),
|
| 352 |
+
nbins=40,
|
| 353 |
+
title="Scheduled Gap Between Consecutive Rentals",
|
| 354 |
+
labels={"value": "Time Delta (minutes)", "count": "Count"},
|
| 355 |
+
)
|
| 356 |
+
fig_delta.add_vline(
|
| 357 |
+
x=threshold_minutes,
|
| 358 |
+
line_dash="dash",
|
| 359 |
+
line_color="red",
|
| 360 |
+
annotation_text=f"Threshold: {threshold_minutes}min",
|
| 361 |
+
)
|
| 362 |
+
fig_delta.update_layout(showlegend=False)
|
| 363 |
+
st.plotly_chart(fig_delta, use_container_width=True)
|
| 364 |
+
|
| 365 |
+
with col_viz2:
|
| 366 |
+
# Wait time for impacted drivers
|
| 367 |
+
if len(problematic_in_scope) > 0:
|
| 368 |
+
wait_time = -problematic_in_scope["effective_time"]
|
| 369 |
+
fig_wait = px.histogram(
|
| 370 |
+
wait_time.clip(0, 240),
|
| 371 |
+
nbins=30,
|
| 372 |
+
title="Wait Time for Impacted Drivers",
|
| 373 |
+
labels={"value": "Wait Time (minutes)", "count": "Count"},
|
| 374 |
+
color_discrete_sequence=["#e74c3c"],
|
| 375 |
+
)
|
| 376 |
+
fig_wait.update_layout(showlegend=False)
|
| 377 |
+
st.plotly_chart(fig_wait, use_container_width=True)
|
| 378 |
+
else:
|
| 379 |
+
st.info("No problematic cases in the selected scope.")
|
| 380 |
+
|
| 381 |
+
# Problematic rate by type
|
| 382 |
+
if scope == "All rentals":
|
| 383 |
+
st.subheader("Problematic Cases by Checkin Type")
|
| 384 |
+
|
| 385 |
+
prob_by_type = consecutive.groupby("checkin_type").agg(
|
| 386 |
+
total=("rental_id", "count"),
|
| 387 |
+
problematic=("is_problematic", "sum"),
|
| 388 |
+
)
|
| 389 |
+
prob_by_type["rate_pct"] = (
|
| 390 |
+
prob_by_type["problematic"] / prob_by_type["total"] * 100
|
| 391 |
+
).round(1)
|
| 392 |
+
|
| 393 |
+
st.dataframe(prob_by_type, use_container_width=True)
|
| 394 |
+
|
| 395 |
+
# =============================================================================
|
| 396 |
+
# Section 4: Threshold Simulation
|
| 397 |
+
# =============================================================================
|
| 398 |
+
st.header("4. Threshold Impact Simulation")
|
| 399 |
+
|
| 400 |
+
st.markdown(
|
| 401 |
+
f"""
|
| 402 |
+
Simulating impact of **{threshold_minutes}-minute** threshold on **{scope}**.
|
| 403 |
+
"""
|
| 404 |
+
)
|
| 405 |
+
|
| 406 |
+
# Current threshold simulation
|
| 407 |
+
sim_result = simulate_threshold(consecutive, threshold_minutes, scope)
|
| 408 |
+
|
| 409 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 410 |
+
|
| 411 |
+
with col1:
|
| 412 |
+
st.metric(
|
| 413 |
+
"Affected Rentals",
|
| 414 |
+
f"{sim_result['affected_rentals']:,}",
|
| 415 |
+
f"{sim_result['affected_pct']:.1f}%",
|
| 416 |
+
)
|
| 417 |
+
|
| 418 |
+
with col2:
|
| 419 |
+
st.metric(
|
| 420 |
+
"Total Problematic",
|
| 421 |
+
f"{sim_result['total_problematic']:,}",
|
| 422 |
+
)
|
| 423 |
+
|
| 424 |
+
with col3:
|
| 425 |
+
st.metric(
|
| 426 |
+
"Problems Solved",
|
| 427 |
+
f"{sim_result['solved_cases']:,}",
|
| 428 |
+
f"{sim_result['solved_pct']:.1f}%",
|
| 429 |
+
)
|
| 430 |
+
|
| 431 |
+
with col4:
|
| 432 |
+
st.metric(
|
| 433 |
+
"Unsolved Cases",
|
| 434 |
+
f"{sim_result['unsolved_cases']:,}",
|
| 435 |
+
)
|
| 436 |
+
|
| 437 |
+
# Revenue impact estimation
|
| 438 |
+
st.subheader("Revenue Impact Estimation")
|
| 439 |
+
|
| 440 |
+
total_rentals = len(df)
|
| 441 |
+
affected_pct_total = sim_result["affected_rentals"] / total_rentals * 100
|
| 442 |
+
|
| 443 |
+
st.info(
|
| 444 |
+
f"""
|
| 445 |
+
**Share of owner revenue potentially affected:** {affected_pct_total:.2f}%
|
| 446 |
+
|
| 447 |
+
- Consecutive rentals represent \
|
| 448 |
+
{len(consecutive) / total_rentals * 100:.1f}% of all rentals
|
| 449 |
+
- With {threshold_minutes}-minute threshold on {scope.lower()}:
|
| 450 |
+
{sim_result["affected_rentals"]:,} rentals would need to be rescheduled
|
| 451 |
+
"""
|
| 452 |
+
)
|
| 453 |
+
|
| 454 |
+
# Comparison table for different thresholds
|
| 455 |
+
st.subheader("Threshold Comparison")
|
| 456 |
+
|
| 457 |
+
thresholds = [15, 30, 60, 90, 120, 180, 240, 360]
|
| 458 |
+
comparison_data = []
|
| 459 |
+
|
| 460 |
+
for t in thresholds:
|
| 461 |
+
result = simulate_threshold(consecutive, t, scope)
|
| 462 |
+
comparison_data.append(
|
| 463 |
+
{
|
| 464 |
+
"Threshold (min)": t,
|
| 465 |
+
"Affected Rentals": result["affected_rentals"],
|
| 466 |
+
"Affected %": f"{result['affected_pct']:.1f}%",
|
| 467 |
+
"Problems Solved": result["solved_cases"],
|
| 468 |
+
"Solved %": f"{result['solved_pct']:.1f}%",
|
| 469 |
+
"Unsolved": result["unsolved_cases"],
|
| 470 |
+
}
|
| 471 |
+
)
|
| 472 |
+
|
| 473 |
+
comparison_df = pd.DataFrame(comparison_data)
|
| 474 |
+
st.dataframe(comparison_df, use_container_width=True, hide_index=True)
|
| 475 |
+
|
| 476 |
+
# Trade-off visualization
|
| 477 |
+
st.subheader("Trade-off: Affected vs Solved")
|
| 478 |
+
|
| 479 |
+
tradeoff_data = []
|
| 480 |
+
for t in thresholds:
|
| 481 |
+
for s in ["All rentals", "Connect only"]:
|
| 482 |
+
result = simulate_threshold(consecutive, t, s)
|
| 483 |
+
tradeoff_data.append(
|
| 484 |
+
{
|
| 485 |
+
"threshold": t,
|
| 486 |
+
"scope": s,
|
| 487 |
+
"affected_pct": result["affected_pct"],
|
| 488 |
+
"solved_pct": result["solved_pct"],
|
| 489 |
+
}
|
| 490 |
+
)
|
| 491 |
+
|
| 492 |
+
tradeoff_df = pd.DataFrame(tradeoff_data)
|
| 493 |
+
|
| 494 |
+
fig_tradeoff = make_subplots(
|
| 495 |
+
rows=1,
|
| 496 |
+
cols=2,
|
| 497 |
+
subplot_titles=["All Rentals", "Connect Only"],
|
| 498 |
+
)
|
| 499 |
+
|
| 500 |
+
for i, s in enumerate(["All rentals", "Connect only"]):
|
| 501 |
+
scope_data = tradeoff_df[tradeoff_df["scope"] == s]
|
| 502 |
+
|
| 503 |
+
fig_tradeoff.add_trace(
|
| 504 |
+
go.Scatter(
|
| 505 |
+
x=scope_data["threshold"],
|
| 506 |
+
y=scope_data["affected_pct"],
|
| 507 |
+
name="Affected %",
|
| 508 |
+
mode="lines+markers",
|
| 509 |
+
line=dict(color="#e74c3c"),
|
| 510 |
+
showlegend=(i == 0),
|
| 511 |
+
),
|
| 512 |
+
row=1,
|
| 513 |
+
col=i + 1,
|
| 514 |
+
)
|
| 515 |
+
fig_tradeoff.add_trace(
|
| 516 |
+
go.Scatter(
|
| 517 |
+
x=scope_data["threshold"],
|
| 518 |
+
y=scope_data["solved_pct"],
|
| 519 |
+
name="Solved %",
|
| 520 |
+
mode="lines+markers",
|
| 521 |
+
line=dict(color="#2ecc71"),
|
| 522 |
+
showlegend=(i == 0),
|
| 523 |
+
),
|
| 524 |
+
row=1,
|
| 525 |
+
col=i + 1,
|
| 526 |
+
)
|
| 527 |
+
|
| 528 |
+
fig_tradeoff.update_xaxes(title_text="Threshold (minutes)")
|
| 529 |
+
fig_tradeoff.update_yaxes(title_text="Percentage")
|
| 530 |
+
fig_tradeoff.update_layout(height=400)
|
| 531 |
+
|
| 532 |
+
st.plotly_chart(fig_tradeoff, use_container_width=True)
|
| 533 |
+
|
| 534 |
+
# =============================================================================
|
| 535 |
+
# Section 5: Recommendations
|
| 536 |
+
# =============================================================================
|
| 537 |
+
st.header("5. Key Findings and Recommendations")
|
| 538 |
+
|
| 539 |
+
st.markdown(
|
| 540 |
+
"""
|
| 541 |
+
### Data Insights
|
| 542 |
+
|
| 543 |
+
| Metric | Value |
|
| 544 |
+
|--------|-------|
|
| 545 |
+
| Total rentals | {:,} |
|
| 546 |
+
| Mobile checkin share | {:.1f}% |
|
| 547 |
+
| Connect checkin share | {:.1f}% |
|
| 548 |
+
| Late return rate | {:.1f}% |
|
| 549 |
+
| Consecutive rentals | {:.1f}% of total |
|
| 550 |
+
| Problematic cases | {:.1f}% of consecutive |
|
| 551 |
+
|
| 552 |
+
### Recommendations
|
| 553 |
+
|
| 554 |
+
| Strategy | Threshold | Scope | Trade-off |
|
| 555 |
+
|----------|-----------|-------|-----------|
|
| 556 |
+
| Conservative | 60 min | Connect | ~45% solved, low impact |
|
| 557 |
+
| Balanced | 120 min | Connect | ~70% solved, moderate |
|
| 558 |
+
| Aggressive | 180 min | All | ~85% solved, high impact |
|
| 559 |
+
|
| 560 |
+
### Key Takeaways
|
| 561 |
+
|
| 562 |
+
1. **Connect-only scope is safer**: Affects only 20% of rentals while addressing
|
| 563 |
+
higher problematic rate (12% vs 8%)
|
| 564 |
+
|
| 565 |
+
2. **60-minute threshold is a good starting point**: Minimal revenue impact with
|
| 566 |
+
meaningful improvement in driver experience
|
| 567 |
+
|
| 568 |
+
3. **Problematic cases are relatively rare**: Only ~1.5% of all rentals are affected
|
| 569 |
+
by previous rental delays
|
| 570 |
+
|
| 571 |
+
4. **Mobile rentals have more volume but lower problematic rate**: Consider phased
|
| 572 |
+
rollout starting with Connect
|
| 573 |
+
""".format(
|
| 574 |
+
len(df),
|
| 575 |
+
len(df[df["checkin_type"] == "mobile"]) / len(df) * 100,
|
| 576 |
+
len(df[df["checkin_type"] == "connect"]) / len(df) * 100,
|
| 577 |
+
(ended["is_late"].sum() / len(ended) * 100),
|
| 578 |
+
len(consecutive) / len(df) * 100,
|
| 579 |
+
len(problematic_cases) / len(consecutive) * 100 if len(consecutive) > 0 else 0,
|
| 580 |
+
)
|
| 581 |
+
)
|
| 582 |
+
|
| 583 |
+
# =============================================================================
|
| 584 |
+
# Section 6: Pricing Prediction
|
| 585 |
+
# =============================================================================
|
| 586 |
+
st.header("6. Pricing Prediction")
|
| 587 |
+
st.markdown(
|
| 588 |
+
"Use the form below to estimate the daily rental price "
|
| 589 |
+
"for a car based on its features."
|
| 590 |
+
)
|
| 591 |
+
|
| 592 |
+
settings = get_settings()
|
| 593 |
+
api_url = settings.api_url
|
| 594 |
+
|
| 595 |
+
st.markdown(f"API Documentation: [{api_url}/docs]({api_url}/docs)")
|
| 596 |
+
|
| 597 |
+
with st.form("prediction_form"):
|
| 598 |
+
# Row 1 - Main characteristics
|
| 599 |
+
r1c1, r1c2, r1c3 = st.columns(3)
|
| 600 |
+
|
| 601 |
+
with r1c1:
|
| 602 |
+
model_key = st.selectbox(
|
| 603 |
+
"Brand",
|
| 604 |
+
options=[
|
| 605 |
+
"Alfa Romeo",
|
| 606 |
+
"Audi",
|
| 607 |
+
"BMW",
|
| 608 |
+
"Citroen",
|
| 609 |
+
"Fiat",
|
| 610 |
+
"Ford",
|
| 611 |
+
"KIA",
|
| 612 |
+
"Lamborghini",
|
| 613 |
+
"Lexus",
|
| 614 |
+
"Maserati",
|
| 615 |
+
"Mercedes",
|
| 616 |
+
"Mini",
|
| 617 |
+
"Mitsubishi",
|
| 618 |
+
"Nissan",
|
| 619 |
+
"Opel",
|
| 620 |
+
"PGO",
|
| 621 |
+
"Peugeot",
|
| 622 |
+
"Porsche",
|
| 623 |
+
"Renault",
|
| 624 |
+
"SEAT",
|
| 625 |
+
"Subaru",
|
| 626 |
+
"Suzuki",
|
| 627 |
+
"Toyota",
|
| 628 |
+
"Volkswagen",
|
| 629 |
+
"Yamaha",
|
| 630 |
+
],
|
| 631 |
+
index=16, # Peugeot
|
| 632 |
+
)
|
| 633 |
+
|
| 634 |
+
with r1c2:
|
| 635 |
+
fuel = st.selectbox(
|
| 636 |
+
"Fuel",
|
| 637 |
+
options=[
|
| 638 |
+
"diesel",
|
| 639 |
+
"petrol",
|
| 640 |
+
"hybrid_petrol",
|
| 641 |
+
"electro",
|
| 642 |
+
],
|
| 643 |
+
index=0,
|
| 644 |
+
)
|
| 645 |
+
|
| 646 |
+
with r1c3:
|
| 647 |
+
car_type = st.selectbox(
|
| 648 |
+
"Car Type",
|
| 649 |
+
options=[
|
| 650 |
+
"sedan",
|
| 651 |
+
"hatchback",
|
| 652 |
+
"suv",
|
| 653 |
+
"van",
|
| 654 |
+
"estate",
|
| 655 |
+
"convertible",
|
| 656 |
+
"coupe",
|
| 657 |
+
"subcompact",
|
| 658 |
+
],
|
| 659 |
+
index=0,
|
| 660 |
+
)
|
| 661 |
+
|
| 662 |
+
# Row 2 - Specifications
|
| 663 |
+
r2c1, r2c2, r2c3 = st.columns(3)
|
| 664 |
+
|
| 665 |
+
with r2c1:
|
| 666 |
+
mileage = st.number_input(
|
| 667 |
+
"Mileage (km)",
|
| 668 |
+
min_value=0,
|
| 669 |
+
max_value=500_000,
|
| 670 |
+
value=100_000,
|
| 671 |
+
step=5_000,
|
| 672 |
+
)
|
| 673 |
+
|
| 674 |
+
with r2c2:
|
| 675 |
+
engine_power = st.number_input(
|
| 676 |
+
"Engine Power (hp)",
|
| 677 |
+
min_value=10,
|
| 678 |
+
max_value=500,
|
| 679 |
+
value=120,
|
| 680 |
+
step=10,
|
| 681 |
+
)
|
| 682 |
+
|
| 683 |
+
with r2c3:
|
| 684 |
+
paint_color = st.selectbox(
|
| 685 |
+
"Paint Color",
|
| 686 |
+
options=[
|
| 687 |
+
"black",
|
| 688 |
+
"white",
|
| 689 |
+
"grey",
|
| 690 |
+
"silver",
|
| 691 |
+
"blue",
|
| 692 |
+
"red",
|
| 693 |
+
"beige",
|
| 694 |
+
"brown",
|
| 695 |
+
"green",
|
| 696 |
+
"orange",
|
| 697 |
+
],
|
| 698 |
+
index=0,
|
| 699 |
+
)
|
| 700 |
+
|
| 701 |
+
# Row 3 - Equipment
|
| 702 |
+
st.markdown("**Equipment**")
|
| 703 |
+
r3c1, r3c2, r3c3, r3c4 = st.columns(4)
|
| 704 |
+
|
| 705 |
+
with r3c1:
|
| 706 |
+
private_parking = st.checkbox("Private parking", value=False)
|
| 707 |
+
has_gps = st.checkbox("GPS", value=False)
|
| 708 |
+
|
| 709 |
+
with r3c2:
|
| 710 |
+
has_ac = st.checkbox("Air conditioning", value=True)
|
| 711 |
+
automatic = st.checkbox("Automatic transmission", value=False)
|
| 712 |
+
|
| 713 |
+
with r3c3:
|
| 714 |
+
has_connect = st.checkbox("Getaround Connect", value=False)
|
| 715 |
+
has_regulator = st.checkbox("Speed regulator", value=False)
|
| 716 |
+
|
| 717 |
+
with r3c4:
|
| 718 |
+
winter_tires = st.checkbox("Winter tires", value=False)
|
| 719 |
+
|
| 720 |
+
submitted = st.form_submit_button("Predict Price")
|
| 721 |
+
|
| 722 |
+
if submitted:
|
| 723 |
+
payload = {
|
| 724 |
+
"cars": [
|
| 725 |
+
{
|
| 726 |
+
"model_key": model_key,
|
| 727 |
+
"mileage": mileage,
|
| 728 |
+
"engine_power": engine_power,
|
| 729 |
+
"fuel": fuel,
|
| 730 |
+
"paint_color": paint_color,
|
| 731 |
+
"car_type": car_type,
|
| 732 |
+
"private_parking_available": private_parking,
|
| 733 |
+
"has_gps": has_gps,
|
| 734 |
+
"has_air_conditioning": has_ac,
|
| 735 |
+
"automatic_car": automatic,
|
| 736 |
+
"has_getaround_connect": has_connect,
|
| 737 |
+
"has_speed_regulator": has_regulator,
|
| 738 |
+
"winter_tires": winter_tires,
|
| 739 |
+
}
|
| 740 |
+
]
|
| 741 |
+
}
|
| 742 |
+
|
| 743 |
+
try:
|
| 744 |
+
response = httpx.post(
|
| 745 |
+
f"{api_url}/predict",
|
| 746 |
+
json=payload,
|
| 747 |
+
timeout=10.0,
|
| 748 |
+
)
|
| 749 |
+
response.raise_for_status()
|
| 750 |
+
result = response.json()
|
| 751 |
+
price = result["prediction"][0]
|
| 752 |
+
st.success(f"Estimated daily rental price: {price} EUR/day")
|
| 753 |
+
except httpx.ConnectError:
|
| 754 |
+
st.error(
|
| 755 |
+
f"Cannot connect to API at {api_url}. Ensure the API server is running."
|
| 756 |
+
)
|
| 757 |
+
except httpx.HTTPStatusError as exc:
|
| 758 |
+
st.error(f"API error (HTTP {exc.response.status_code}): {exc.response.text}")
|
| 759 |
+
except Exception as exc:
|
| 760 |
+
logger.exception("Prediction request failed")
|
| 761 |
+
st.error(f"Prediction failed: {exc}")
|
| 762 |
+
|
| 763 |
+
# Footer
|
| 764 |
+
st.markdown("---")
|
| 765 |
+
st.caption("Dashboard built for Getaround PM team")
|
src/ml/__init__.py
ADDED
|
File without changes
|
src/ml/predict.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model inference for pricing predictions."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Union
|
| 6 |
+
|
| 7 |
+
import joblib
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)

DEFAULT_MODEL_PATH = (
    Path(__file__).parent.parent.parent / "models" / "best_model.joblib"
)

# Lazily-created module-level singleton, managed by get_predictor().
_predictor_instance: "PricingPredictor | None" = None


class ModelNotFoundError(Exception):
    """Raised when the model file cannot be found."""


class PricingPredictor:
    """Predictor for daily car rental pricing backed by a joblib model."""

    def __init__(self, model_path: Union[str, Path] = DEFAULT_MODEL_PATH) -> None:
        """Initialize predictor with trained model.

        Args:
            model_path: Path to the trained model file.

        Raises:
            ModelNotFoundError: If the model file does not exist.
        """
        self.model_path = Path(model_path)
        self.model = None
        self._load_model()

    def _load_model(self) -> None:
        """Load the model from disk into ``self.model``.

        Raises:
            ModelNotFoundError: If the model file does not exist.
        """
        if not self.model_path.exists():
            logger.error("Model file not found: %s", self.model_path)
            raise ModelNotFoundError(f"Model not found at {self.model_path}")

        logger.info("Loading model from %s", self.model_path)
        self.model = joblib.load(self.model_path)
        logger.info("Model loaded successfully")

    def _predict_frame(self, df: pd.DataFrame) -> list[int]:
        """Run the model on a feature frame and round prices to integers.

        Shared by both public prediction entry points so the
        predict-and-round logic lives in exactly one place.
        """
        predictions = self.model.predict(df)
        return [int(round(p)) for p in predictions]

    def predict_from_dict(self, data: dict) -> list[int]:
        """Make predictions from column-oriented dictionary input.

        Args:
            data: Dictionary with feature names as keys and list values.

        Returns:
            List of predicted prices (rounded to int).
        """
        return self._predict_frame(pd.DataFrame(data))

    def predict_from_features(self, cars: list[dict]) -> list[int]:
        """Make predictions from a list of per-car feature dictionaries.

        Args:
            cars: List of dictionaries with car features.

        Returns:
            List of predicted prices (rounded to int).
        """
        logger.debug("Predicting for %d cars", len(cars))
        return self._predict_frame(pd.DataFrame(cars))


def get_predictor(
    model_path: Union[str, Path] = DEFAULT_MODEL_PATH,
) -> PricingPredictor:
    """Get the singleton predictor instance, creating it on first call.

    Args:
        model_path: Path to the trained model file (only honored on the
            first call; subsequent calls return the cached instance).

    Returns:
        Singleton PricingPredictor instance.

    Raises:
        ModelNotFoundError: If the model file does not exist on first load.
    """
    global _predictor_instance
    if _predictor_instance is None:
        _predictor_instance = PricingPredictor(model_path)
    return _predictor_instance
|
src/ml/preprocessing.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Feature engineering and preprocessing for pricing model."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from typing import Tuple
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from sklearn.compose import ColumnTransformer
|
| 8 |
+
from sklearn.preprocessing import OneHotEncoder, StandardScaler
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Feature definitions based on EDA
|
| 14 |
+
CATEGORICAL_FEATURES = [
|
| 15 |
+
"model_key",
|
| 16 |
+
"fuel",
|
| 17 |
+
"paint_color",
|
| 18 |
+
"car_type",
|
| 19 |
+
]
|
| 20 |
+
|
| 21 |
+
BOOLEAN_FEATURES = [
|
| 22 |
+
"private_parking_available",
|
| 23 |
+
"has_gps",
|
| 24 |
+
"has_air_conditioning",
|
| 25 |
+
"automatic_car",
|
| 26 |
+
"has_getaround_connect",
|
| 27 |
+
"has_speed_regulator",
|
| 28 |
+
"winter_tires",
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
NUMERICAL_FEATURES = [
|
| 32 |
+
"mileage",
|
| 33 |
+
"engine_power",
|
| 34 |
+
]
|
| 35 |
+
|
| 36 |
+
TARGET = "rental_price_per_day"
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def load_data(filepath: str) -> pd.DataFrame:
    """Read the pricing dataset from a CSV file.

    Args:
        filepath: Path to the CSV file.

    Returns:
        DataFrame with the loaded data (first CSV column used as index).

    Raises:
        FileNotFoundError: If file does not exist.
        pd.errors.ParserError: If CSV parsing fails.
    """
    logger.info("Loading data from %s", filepath)
    frame = pd.read_csv(filepath, index_col=0)
    rows, cols = frame.shape
    logger.info("Loaded %d rows, %d columns", rows, cols)
    return frame
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def create_preprocessor() -> ColumnTransformer:
    """Build the sklearn preprocessing stage for the pricing features.

    The returned transformer:
    - standard-scales the numerical columns (mileage, engine_power),
    - passes the boolean flags through untouched (already 0/1),
    - one-hot encodes the categorical columns, ignoring unseen levels,
    - drops any other column.

    Returns:
        ColumnTransformer configured for all feature types.
    """
    transformer_specs = [
        ("num", StandardScaler(), NUMERICAL_FEATURES),
        ("bool", "passthrough", BOOLEAN_FEATURES),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            CATEGORICAL_FEATURES,
        ),
    ]
    preprocessor = ColumnTransformer(
        transformers=transformer_specs,
        remainder="drop",
    )
    logger.debug(
        "Created preprocessor with %d transformers", len(preprocessor.transformers)
    )
    return preprocessor
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def prepare_features(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """Split dataframe into features X and target y.

    Converts boolean columns to int (0/1) for sklearn compatibility.

    Args:
        df: DataFrame with all columns including target.

    Returns:
        Tuple of (X, y) where X is features DataFrame and y is target Series.

    Raises:
        KeyError: If required columns are missing.
    """
    required_cols = (
        NUMERICAL_FEATURES + BOOLEAN_FEATURES + CATEGORICAL_FEATURES + [TARGET]
    )
    missing_cols = set(required_cols) - set(df.columns)
    if missing_cols:
        # Sort so the error message is deterministic (set iteration order
        # would otherwise vary between runs).
        raise KeyError(f"Missing columns: {sorted(missing_cols)}")

    feature_cols = NUMERICAL_FEATURES + BOOLEAN_FEATURES + CATEGORICAL_FEATURES
    X = df[feature_cols].copy()

    # Coerce boolean flags to 0/1 ints for sklearn compatibility.
    for col in BOOLEAN_FEATURES:
        X[col] = X[col].astype(int)

    y = df[TARGET].copy()

    logger.info("Prepared features: X shape %s, y shape %s", X.shape, y.shape)
    return X, y
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def get_feature_names() -> list[str]:
    """Return every feature name used by the model.

    Returns:
        Feature names ordered as numerical, then boolean, then categorical.
    """
    return [*NUMERICAL_FEATURES, *BOOLEAN_FEATURES, *CATEGORICAL_FEATURES]
|
src/ml/train.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model training with MLflow tracking."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import joblib
|
| 7 |
+
import mlflow
|
| 8 |
+
import mlflow.sklearn
|
| 9 |
+
import numpy as np
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
|
| 12 |
+
from sklearn.linear_model import LinearRegression
|
| 13 |
+
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
| 14 |
+
from sklearn.model_selection import train_test_split
|
| 15 |
+
from sklearn.pipeline import Pipeline
|
| 16 |
+
|
| 17 |
+
from src.ml.preprocessing import (
|
| 18 |
+
create_preprocessor,
|
| 19 |
+
load_data,
|
| 20 |
+
prepare_features,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
MODELS = {
|
| 26 |
+
"linear_regression": LinearRegression(),
|
| 27 |
+
"random_forest": RandomForestRegressor(n_estimators=100, random_state=42),
|
| 28 |
+
"gradient_boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
MODELS_DIR = Path(__file__).parent.parent.parent / "models"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def train_and_evaluate(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    model_name: str,
    model,
    preprocessor,
) -> dict:
    """Fit one candidate model and score it on the held-out split.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.
        model_name: Name of the model for logging.
        model: Sklearn estimator instance.
        preprocessor: Sklearn preprocessor (ColumnTransformer).

    Returns:
        Dictionary with model name, pipeline, and metrics (rmse, mae, r2).
    """
    full_pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

    logger.info("Training %s...", model_name)
    full_pipeline.fit(X_train, y_train)

    predictions = full_pipeline.predict(X_test)

    metrics = {
        "rmse": float(np.sqrt(mean_squared_error(y_test, predictions))),
        "mae": float(mean_absolute_error(y_test, predictions)),
        "r2": float(r2_score(y_test, predictions)),
    }

    logger.info(
        "%s - RMSE: %.2f, MAE: %.2f, R2: %.3f",
        model_name,
        metrics["rmse"],
        metrics["mae"],
        metrics["r2"],
    )

    return {"model_name": model_name, "pipeline": full_pipeline, **metrics}
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def run_experiment(data_path: str, experiment_name: str = "getaround_pricing") -> str:
    """Run full training experiment, return best model path.

    Trains every candidate in MODELS, logs params/metrics/artifacts to
    MLflow, then persists the lowest-RMSE pipeline to disk.

    Args:
        data_path: Path to the CSV data file.
        experiment_name: MLflow experiment name.

    Returns:
        Path to the saved best model.
    """
    mlflow.set_experiment(experiment_name)

    logger.info("Loading data from %s", data_path)
    df = load_data(data_path)

    logger.info("Preparing features...")
    X, y = prepare_features(df)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    logger.info(
        "Train/test split: %d train samples, %d test samples",
        len(X_train),
        len(X_test),
    )

    results = []

    for model_name, model in MODELS.items():
        with mlflow.start_run(run_name=model_name):
            # Build a fresh preprocessor for each model: sklearn Pipelines do
            # not clone their steps, so sharing one ColumnTransformer instance
            # would make every result pipeline alias the same fitted object.
            result = train_and_evaluate(
                X_train,
                X_test,
                y_train,
                y_test,
                model_name,
                model,
                create_preprocessor(),
            )
            results.append(result)

            mlflow.log_param("model_name", model_name)
            mlflow.log_metric("rmse", result["rmse"])
            mlflow.log_metric("mae", result["mae"])
            mlflow.log_metric("r2", result["r2"])

            mlflow.sklearn.log_model(result["pipeline"], "model")

    # Select the winner on lowest test RMSE.
    best_result = min(results, key=lambda x: x["rmse"])
    logger.info(
        "Best model: %s with RMSE=%.2f",
        best_result["model_name"],
        best_result["rmse"],
    )

    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    model_path = MODELS_DIR / "best_model.joblib"
    joblib.dump(best_result["pipeline"], model_path)
    logger.info("Best model saved to %s", model_path)

    return str(model_path)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.INFO)
    # Optional CLI argument: path to the pricing CSV; falls back to the
    # default dataset location when not provided.
    data_path = (
        sys.argv[1] if len(sys.argv) > 1 else "data/get_around_pricing_project.csv"
    )
    run_experiment(data_path)
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|