Spaces:
Sleeping
Sleeping
Commit ·
7d87fe9
0
Parent(s):
1st commit
Browse files- .gitattributes +35 -0
- .gitignore +22 -0
- Dockerfile +46 -0
- README.md +123 -0
- models/best_model.joblib +3 -0
- pyproject.toml +50 -0
- src/__init__.py +0 -0
- src/api/__init__.py +0 -0
- src/api/main.py +106 -0
- src/api/routers/__init__.py +0 -0
- src/api/routers/predict.py +49 -0
- src/api/schemas/__init__.py +0 -0
- src/api/schemas/prediction.py +57 -0
- src/config/__init__.py +0 -0
- src/config/settings.py +119 -0
- src/dashboard/__init__.py +0 -0
- src/dashboard/app.py +765 -0
- src/ml/__init__.py +0 -0
- src/ml/predict.py +95 -0
- src/ml/preprocessing.py +134 -0
- src/ml/train.py +145 -0
- uv.lock +0 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*.egg-info/
|
| 5 |
+
.venv/
|
| 6 |
+
|
| 7 |
+
# Environment
|
| 8 |
+
.env
|
| 9 |
+
|
| 10 |
+
# IDE
|
| 11 |
+
.vscode/
|
| 12 |
+
.idea/
|
| 13 |
+
.DS_Store
|
| 14 |
+
|
| 15 |
+
# Data
|
| 16 |
+
data/
|
| 17 |
+
|
| 18 |
+
# Claude Code
|
| 19 |
+
.claude/
|
| 20 |
+
|
| 21 |
+
# Jupyter
|
| 22 |
+
.ipynb_checkpoints/
|
Dockerfile
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Build stage
|
| 2 |
+
FROM python:3.11-slim AS builder
|
| 3 |
+
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install uv
|
| 7 |
+
RUN pip install --no-cache-dir uv
|
| 8 |
+
|
| 9 |
+
# Copy dependency files
|
| 10 |
+
COPY pyproject.toml uv.lock ./
|
| 11 |
+
|
| 12 |
+
# Install production dependencies only (no dev group = no mlflow)
|
| 13 |
+
RUN uv sync --no-group dev --frozen
|
| 14 |
+
|
| 15 |
+
# Production stage
|
| 16 |
+
FROM python:3.11-slim AS production
|
| 17 |
+
|
| 18 |
+
WORKDIR /app
|
| 19 |
+
|
| 20 |
+
# Create non-root user (HF Spaces requirement - uid 1000)
|
| 21 |
+
RUN useradd -m -u 1000 user
|
| 22 |
+
USER user
|
| 23 |
+
|
| 24 |
+
ENV HOME=/home/user \
|
| 25 |
+
PATH="/home/user/.local/bin:/app/.venv/bin:$PATH" \
|
| 26 |
+
PYTHONPATH="/app" \
|
| 27 |
+
ENVIRONMENT=production \
|
| 28 |
+
LOG_LEVEL=INFO
|
| 29 |
+
|
| 30 |
+
# Copy virtual environment from builder
|
| 31 |
+
COPY --from=builder --chown=user /app/.venv /app/.venv
|
| 32 |
+
|
| 33 |
+
# Copy application code (only API-relevant modules)
|
| 34 |
+
COPY --chown=user src/__init__.py ./src/__init__.py
|
| 35 |
+
COPY --chown=user src/api/ ./src/api/
|
| 36 |
+
COPY --chown=user src/ml/ ./src/ml/
|
| 37 |
+
COPY --chown=user src/config/ ./src/config/
|
| 38 |
+
|
| 39 |
+
# Copy trained model
|
| 40 |
+
COPY --chown=user models/ ./models/
|
| 41 |
+
|
| 42 |
+
# HF Spaces default port
|
| 43 |
+
EXPOSE 7860
|
| 44 |
+
|
| 45 |
+
# Run the API
|
| 46 |
+
CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Getaround Pricing API
|
| 3 |
+
colorFrom: blue
|
| 4 |
+
colorTo: green
|
| 5 |
+
sdk: docker
|
| 6 |
+
app_port: 7860
|
| 7 |
+
pinned: false
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# Getaround Pricing API
|
| 11 |
+
|
| 12 |
+
> ML-powered FastAPI service that predicts optimal daily rental prices for cars on the Getaround platform. Takes 13 car features as input, returns a predicted price in EUR.
|
| 13 |
+
|
| 14 |
+
## Demo
|
| 15 |
+
|
| 16 |
+
Live API: [https://sam-bot-getaround-api.hf.space](https://sam-bot-getaround-api.hf.space)
|
| 17 |
+
|
| 18 |
+
Swagger UI: [https://sam-bot-getaround-api.hf.space/docs](https://sam-bot-getaround-api.hf.space/docs)
|
| 19 |
+
|
| 20 |
+
## Key Results
|
| 21 |
+
|
| 22 |
+
**Endpoints:**
|
| 23 |
+
|
| 24 |
+
| Method | Path | Description |
|
| 25 |
+
|--------|------|-------------|
|
| 26 |
+
| POST | `/predict` | Predict rental prices for 1-50 cars |
|
| 27 |
+
| GET | `/health` | Health check with model status |
|
| 28 |
+
| GET | `/docs` | Swagger UI documentation |
|
| 29 |
+
|
| 30 |
+
**Model selection:** Three models trained (LinearRegression, RandomForestRegressor, GradientBoostingRegressor) with MLflow tracking. Best model selected by lowest RMSE (RandomForest: RMSE = 16.75, R² = 0.734), saved as a scikit-learn Pipeline (preprocessor + model).
|
| 31 |
+
|
| 32 |
+
## Tech Stack
|
| 33 |
+
|
| 34 |
+
| Category | Technology |
|
| 35 |
+
|----------|------------|
|
| 36 |
+
| Web Framework | FastAPI, Uvicorn |
|
| 37 |
+
| ML | scikit-learn, MLflow (training) |
|
| 38 |
+
| Data | pandas, numpy |
|
| 39 |
+
| Validation | Pydantic |
|
| 40 |
+
| Serialization | joblib, Git LFS (model storage) |
|
| 41 |
+
| Deployment | Docker |
|
| 42 |
+
| Package Manager | uv |
|
| 43 |
+
|
| 44 |
+
## Installation
|
| 45 |
+
|
| 46 |
+
The model file (`models/best_model.joblib`, ~32MB) is tracked with Git LFS. Pull it before running:
|
| 47 |
+
|
| 48 |
+
```bash
|
| 49 |
+
git clone <repo-url>
|
| 50 |
+
cd getarround_api
|
| 51 |
+
git lfs install
|
| 52 |
+
git lfs pull
|
| 53 |
+
uv sync
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
## Usage
|
| 57 |
+
|
| 58 |
+
**Run locally:**
|
| 59 |
+
|
| 60 |
+
```bash
|
| 61 |
+
uv run uvicorn src.api.main:app --host 0.0.0.0 --port 8000 --reload
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
**Run with Docker:**
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
docker build -t getarround-api .
|
| 68 |
+
docker run -p 7860:7860 getarround-api
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
**Example request:**
|
| 72 |
+
|
| 73 |
+
```bash
|
| 74 |
+
curl -X POST http://localhost:8000/predict \
|
| 75 |
+
-H "Content-Type: application/json" \
|
| 76 |
+
-d '{
|
| 77 |
+
"cars": [{
|
| 78 |
+
"model_key": "Peugeot",
|
| 79 |
+
"mileage": 50000,
|
| 80 |
+
"engine_power": 120,
|
| 81 |
+
"fuel": "diesel",
|
| 82 |
+
"paint_color": "black",
|
| 83 |
+
"car_type": "sedan",
|
| 84 |
+
"private_parking_available": true,
|
| 85 |
+
"has_gps": true,
|
| 86 |
+
"has_air_conditioning": true,
|
| 87 |
+
"automatic_car": false,
|
| 88 |
+
"has_getaround_connect": false,
|
| 89 |
+
"has_speed_regulator": true,
|
| 90 |
+
"winter_tires": false
|
| 91 |
+
}]
|
| 92 |
+
}'
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
**Example response:**
|
| 96 |
+
|
| 97 |
+
```json
|
| 98 |
+
{
|
| 99 |
+
"prediction": [124]
|
| 100 |
+
}
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
## Data
|
| 104 |
+
|
| 105 |
+
**Model file:** `models/best_model.joblib` (~32MB, tracked via Git LFS). Contains a full scikit-learn Pipeline (preprocessing + trained estimator).
|
| 106 |
+
|
| 107 |
+
**Input features (13):**
|
| 108 |
+
|
| 109 |
+
| Feature | Type | Description |
|
| 110 |
+
|---------|------|-------------|
|
| 111 |
+
| `model_key` | string | Car brand (e.g., "Peugeot", "BMW", "Citroen") |
|
| 112 |
+
| `mileage` | int | Mileage in km |
|
| 113 |
+
| `engine_power` | int | Engine power in HP |
|
| 114 |
+
| `fuel` | string | Fuel type ("diesel", "petrol", "hybrid_petrol", "electro") |
|
| 115 |
+
| `paint_color` | string | Car color |
|
| 116 |
+
| `car_type` | string | Car type ("sedan", "hatchback", "suv", "van", etc.) |
|
| 117 |
+
| `private_parking_available` | bool | Has private parking |
|
| 118 |
+
| `has_gps` | bool | Has GPS |
|
| 119 |
+
| `has_air_conditioning` | bool | Has air conditioning |
|
| 120 |
+
| `automatic_car` | bool | Automatic transmission |
|
| 121 |
+
| `has_getaround_connect` | bool | Has Getaround Connect |
|
| 122 |
+
| `has_speed_regulator` | bool | Has cruise control |
|
| 123 |
+
| `winter_tires` | bool | Has winter tires |
|
models/best_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3645ccff2cb16af002a9fc6634c17bd517f463e261f30050ef62af3b58204d4
|
| 3 |
+
size 32840695
|
pyproject.toml
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "getarround"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "Getaround delay analysis dashboard and pricing optimization API"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.11,<3.12"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"fastapi~=0.115.0",
|
| 9 |
+
"uvicorn[standard]~=0.30.0",
|
| 10 |
+
"streamlit~=1.40.0",
|
| 11 |
+
"scikit-learn~=1.5.0",
|
| 12 |
+
"pandas~=2.2.0",
|
| 13 |
+
"openpyxl~=3.1.0",
|
| 14 |
+
"pydantic~=2.10.0",
|
| 15 |
+
"pydantic-settings~=2.10.0",
|
| 16 |
+
"numpy~=1.26.0",
|
| 17 |
+
"plotly~=5.24.0",
|
| 18 |
+
"httpx~=0.27.0",
|
| 19 |
+
]
|
| 20 |
+
|
| 21 |
+
[dependency-groups]
|
| 22 |
+
dev = [
|
| 23 |
+
"ruff",
|
| 24 |
+
"pytest",
|
| 25 |
+
"pytest-cov",
|
| 26 |
+
"pre-commit",
|
| 27 |
+
"jupyter",
|
| 28 |
+
"ipykernel",
|
| 29 |
+
"factory-boy>=3.3.3",
|
| 30 |
+
"mlflow~=2.19.0",
|
| 31 |
+
]
|
| 32 |
+
|
| 33 |
+
[tool.ruff]
|
| 34 |
+
line-length = 88
|
| 35 |
+
target-version = "py311"
|
| 36 |
+
|
| 37 |
+
[tool.ruff.lint]
|
| 38 |
+
select = ["E", "F", "I", "W"]
|
| 39 |
+
|
| 40 |
+
[tool.ruff.format]
|
| 41 |
+
quote-style = "double"
|
| 42 |
+
indent-style = "space"
|
| 43 |
+
|
| 44 |
+
[tool.pytest.ini_options]
|
| 45 |
+
testpaths = ["tests"]
|
| 46 |
+
python_files = ["test_*.py"]
|
| 47 |
+
addopts = "-v"
|
| 48 |
+
markers = [
|
| 49 |
+
"integration: integration tests requiring model file and data",
|
| 50 |
+
]
|
src/__init__.py
ADDED
|
File without changes
|
src/api/__init__.py
ADDED
|
File without changes
|
src/api/main.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application for Getaround pricing API."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from contextlib import asynccontextmanager
|
| 5 |
+
|
| 6 |
+
from fastapi import FastAPI
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
|
| 9 |
+
from src.api.routers.predict import router as predict_router
|
| 10 |
+
from src.config.settings import configure_logging, get_settings
|
| 11 |
+
from src.ml.predict import get_predictor
|
| 12 |
+
|
| 13 |
+
settings = get_settings()
|
| 14 |
+
configure_logging(settings)
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@asynccontextmanager
|
| 20 |
+
async def lifespan(app: FastAPI):
|
| 21 |
+
"""Manage application startup and shutdown.
|
| 22 |
+
|
| 23 |
+
Args:
|
| 24 |
+
app: The FastAPI application instance.
|
| 25 |
+
"""
|
| 26 |
+
# Startup: preload model
|
| 27 |
+
logger.info("Starting up - preloading model")
|
| 28 |
+
try:
|
| 29 |
+
get_predictor()
|
| 30 |
+
logger.info("Model loaded successfully")
|
| 31 |
+
except Exception as e:
|
| 32 |
+
logger.warning("Model not available at startup: %s", e)
|
| 33 |
+
yield
|
| 34 |
+
# Shutdown
|
| 35 |
+
logger.info("Shutting down")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
app = FastAPI(
|
| 39 |
+
title="Getaround Pricing API",
|
| 40 |
+
description="""
|
| 41 |
+
API for predicting optimal rental prices for cars.
|
| 42 |
+
|
| 43 |
+
## Endpoints
|
| 44 |
+
|
| 45 |
+
- **POST /predict**: Predict rental prices based on car features
|
| 46 |
+
- **GET /health**: Health check endpoint
|
| 47 |
+
|
| 48 |
+
## Usage
|
| 49 |
+
|
| 50 |
+
Send a POST request to `/predict` with car features:
|
| 51 |
+
|
| 52 |
+
```json
|
| 53 |
+
{
|
| 54 |
+
"cars": [{
|
| 55 |
+
"model_key": "Citroen",
|
| 56 |
+
"mileage": 100000,
|
| 57 |
+
"engine_power": 120,
|
| 58 |
+
"fuel": "diesel",
|
| 59 |
+
"paint_color": "black",
|
| 60 |
+
"car_type": "sedan",
|
| 61 |
+
"private_parking_available": true,
|
| 62 |
+
"has_gps": true,
|
| 63 |
+
"has_air_conditioning": true,
|
| 64 |
+
"automatic_car": false,
|
| 65 |
+
"has_getaround_connect": false,
|
| 66 |
+
"has_speed_regulator": true,
|
| 67 |
+
"winter_tires": false
|
| 68 |
+
}]
|
| 69 |
+
}
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
Response:
|
| 73 |
+
```json
|
| 74 |
+
{
|
| 75 |
+
"prediction": [124]
|
| 76 |
+
}
|
| 77 |
+
```
|
| 78 |
+
""",
|
| 79 |
+
version="1.0.0",
|
| 80 |
+
docs_url="/docs",
|
| 81 |
+
redoc_url="/redoc",
|
| 82 |
+
lifespan=lifespan,
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
app.add_middleware(
|
| 86 |
+
CORSMiddleware,
|
| 87 |
+
allow_origins=["*"],
|
| 88 |
+
allow_credentials=True,
|
| 89 |
+
allow_methods=["*"],
|
| 90 |
+
allow_headers=["*"],
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
app.include_router(predict_router)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@app.get("/health", tags=["health"])
|
| 97 |
+
async def health_check() -> dict:
|
| 98 |
+
"""Health check endpoint.
|
| 99 |
+
|
| 100 |
+
Returns:
|
| 101 |
+
Status dictionary with model availability.
|
| 102 |
+
"""
|
| 103 |
+
from src.ml.predict import _predictor_instance
|
| 104 |
+
|
| 105 |
+
model_loaded = _predictor_instance is not None
|
| 106 |
+
return {"status": "healthy", "model_loaded": model_loaded}
|
src/api/routers/__init__.py
ADDED
|
File without changes
|
src/api/routers/predict.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Prediction router for pricing API."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
|
| 5 |
+
from fastapi import APIRouter, HTTPException
|
| 6 |
+
|
| 7 |
+
from src.api.schemas.prediction import PredictionInput, PredictionOutput
|
| 8 |
+
from src.ml.predict import ModelNotFoundError, get_predictor
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
router = APIRouter(tags=["prediction"])
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@router.post(
|
| 16 |
+
"/predict",
|
| 17 |
+
response_model=PredictionOutput,
|
| 18 |
+
summary="Predict rental prices",
|
| 19 |
+
description="Predict optimal rental prices for cars based on their features.",
|
| 20 |
+
)
|
| 21 |
+
async def predict(data: PredictionInput) -> PredictionOutput:
|
| 22 |
+
"""Predict rental prices from car features.
|
| 23 |
+
|
| 24 |
+
Args:
|
| 25 |
+
data: Input containing list of car features.
|
| 26 |
+
|
| 27 |
+
Returns:
|
| 28 |
+
Predictions with list of predicted prices in EUR.
|
| 29 |
+
|
| 30 |
+
Raises:
|
| 31 |
+
HTTPException: If model is not available or prediction fails.
|
| 32 |
+
"""
|
| 33 |
+
logger.info("Received prediction request for %d cars", len(data.cars))
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
predictor = get_predictor()
|
| 37 |
+
cars_dict = [car.model_dump() for car in data.cars]
|
| 38 |
+
predictions = predictor.predict_from_features(cars_dict)
|
| 39 |
+
logger.info("Predictions completed: %s", predictions)
|
| 40 |
+
return PredictionOutput(prediction=predictions)
|
| 41 |
+
except ModelNotFoundError as e:
|
| 42 |
+
logger.error("Model not found: %s", e)
|
| 43 |
+
raise HTTPException(status_code=503, detail="Model not available") from e
|
| 44 |
+
except Exception as e:
|
| 45 |
+
logger.error("Prediction failed: %s", e)
|
| 46 |
+
raise HTTPException(
|
| 47 |
+
status_code=500,
|
| 48 |
+
detail="Internal prediction error",
|
| 49 |
+
) from e
|
src/api/schemas/__init__.py
ADDED
|
File without changes
|
src/api/schemas/prediction.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic schemas for prediction API."""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class CarFeatures(BaseModel):
|
| 7 |
+
"""Features for a single car rental price prediction.
|
| 8 |
+
|
| 9 |
+
Attributes:
|
| 10 |
+
model_key: Car brand/model (e.g., "Citroen", "Peugeot", "BMW").
|
| 11 |
+
mileage: Car mileage in kilometers.
|
| 12 |
+
engine_power: Engine power in horsepower.
|
| 13 |
+
fuel: Fuel type ("diesel", "petrol", "hybrid_petrol", "electro").
|
| 14 |
+
paint_color: Car color.
|
| 15 |
+
car_type: Type of car ("sedan", "hatchback", "suv", "van", etc.).
|
| 16 |
+
private_parking_available: Has private parking.
|
| 17 |
+
has_gps: Has GPS.
|
| 18 |
+
has_air_conditioning: Has air conditioning.
|
| 19 |
+
automatic_car: Is automatic transmission.
|
| 20 |
+
has_getaround_connect: Has Getaround Connect feature.
|
| 21 |
+
has_speed_regulator: Has speed regulator/cruise control.
|
| 22 |
+
winter_tires: Has winter tires.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
model_key: str = Field(..., examples=["Citroen"])
|
| 26 |
+
mileage: int = Field(..., ge=0, examples=[100000])
|
| 27 |
+
engine_power: int = Field(..., ge=0, examples=[120])
|
| 28 |
+
fuel: str = Field(..., examples=["diesel"])
|
| 29 |
+
paint_color: str = Field(..., examples=["black"])
|
| 30 |
+
car_type: str = Field(..., examples=["sedan"])
|
| 31 |
+
private_parking_available: bool = Field(default=False)
|
| 32 |
+
has_gps: bool = Field(default=False)
|
| 33 |
+
has_air_conditioning: bool = Field(default=False)
|
| 34 |
+
automatic_car: bool = Field(default=False)
|
| 35 |
+
has_getaround_connect: bool = Field(default=False)
|
| 36 |
+
has_speed_regulator: bool = Field(default=False)
|
| 37 |
+
winter_tires: bool = Field(default=False)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class PredictionInput(BaseModel):
|
| 41 |
+
"""Input schema for /predict endpoint."""
|
| 42 |
+
|
| 43 |
+
cars: list[CarFeatures] = Field(
|
| 44 |
+
...,
|
| 45 |
+
description="List of cars to predict prices for",
|
| 46 |
+
min_length=1,
|
| 47 |
+
max_length=50,
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class PredictionOutput(BaseModel):
|
| 52 |
+
"""Output schema for /predict endpoint."""
|
| 53 |
+
|
| 54 |
+
prediction: list[int] = Field(
|
| 55 |
+
...,
|
| 56 |
+
description="List of predicted rental prices per day in EUR",
|
| 57 |
+
)
|
src/config/__init__.py
ADDED
|
File without changes
|
src/config/settings.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Application settings using Pydantic Settings."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from enum import Enum
|
| 5 |
+
from functools import lru_cache
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Environment(str, Enum):
|
| 12 |
+
"""Application environment."""
|
| 13 |
+
|
| 14 |
+
DEVELOPMENT = "development"
|
| 15 |
+
TESTING = "testing"
|
| 16 |
+
PRODUCTION = "production"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class Settings(BaseSettings):
|
| 20 |
+
"""Application settings loaded from environment variables.
|
| 21 |
+
|
| 22 |
+
Settings are loaded from .env files and environment variables.
|
| 23 |
+
Environment variables take precedence over .env file values.
|
| 24 |
+
|
| 25 |
+
Attributes:
|
| 26 |
+
environment: Current environment (development, testing, production).
|
| 27 |
+
log_level: Logging level as string (DEBUG, INFO, WARNING, ERROR).
|
| 28 |
+
api_url: Base URL for the pricing prediction API.
|
| 29 |
+
api_host: Host for the FastAPI server.
|
| 30 |
+
api_port: Port for the FastAPI server.
|
| 31 |
+
dashboard_port: Port for the Streamlit dashboard.
|
| 32 |
+
mlflow_tracking_uri: URI for MLflow tracking server.
|
| 33 |
+
data_dir: Directory for data files.
|
| 34 |
+
models_dir: Directory for serialized models.
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
model_config = SettingsConfigDict(
|
| 38 |
+
env_file=".env",
|
| 39 |
+
env_file_encoding="utf-8",
|
| 40 |
+
case_sensitive=False,
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
# Environment
|
| 44 |
+
environment: Environment = Environment.DEVELOPMENT
|
| 45 |
+
|
| 46 |
+
# Logging
|
| 47 |
+
log_level: str = "DEBUG"
|
| 48 |
+
|
| 49 |
+
# API
|
| 50 |
+
api_url: str = "http://localhost:8000"
|
| 51 |
+
api_host: str = "0.0.0.0"
|
| 52 |
+
api_port: int = 8000
|
| 53 |
+
|
| 54 |
+
# Dashboard
|
| 55 |
+
dashboard_port: int = 8501
|
| 56 |
+
|
| 57 |
+
# MLflow
|
| 58 |
+
mlflow_tracking_uri: str = "./mlruns"
|
| 59 |
+
|
| 60 |
+
# Paths
|
| 61 |
+
data_dir: Path = Path("data")
|
| 62 |
+
models_dir: Path = Path("models")
|
| 63 |
+
|
| 64 |
+
@property
|
| 65 |
+
def log_level_int(self) -> int:
|
| 66 |
+
"""Get logging level as integer.
|
| 67 |
+
|
| 68 |
+
Returns:
|
| 69 |
+
Logging level constant (e.g., logging.DEBUG, logging.INFO).
|
| 70 |
+
"""
|
| 71 |
+
return getattr(logging, self.log_level.upper(), logging.INFO)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# Logging config by environment (from ORCHESTRATION.md)
|
| 75 |
+
LOGGING_CONFIG: dict[Environment, dict[str, int | str]] = {
|
| 76 |
+
Environment.DEVELOPMENT: {
|
| 77 |
+
"level": logging.DEBUG,
|
| 78 |
+
"format": "%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s",
|
| 79 |
+
},
|
| 80 |
+
Environment.TESTING: {
|
| 81 |
+
"level": logging.WARNING,
|
| 82 |
+
"format": "%(levelname)s - %(message)s",
|
| 83 |
+
},
|
| 84 |
+
Environment.PRODUCTION: {
|
| 85 |
+
"level": logging.INFO,
|
| 86 |
+
"format": "%(asctime)s - %(levelname)s - %(message)s",
|
| 87 |
+
},
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def configure_logging(settings: Settings) -> None:
|
| 92 |
+
"""Configure logging based on environment.
|
| 93 |
+
|
| 94 |
+
Sets up the root logger with appropriate level and format based on
|
| 95 |
+
the current environment.
|
| 96 |
+
|
| 97 |
+
Args:
|
| 98 |
+
settings: Application settings instance.
|
| 99 |
+
"""
|
| 100 |
+
config = LOGGING_CONFIG.get(
|
| 101 |
+
settings.environment, LOGGING_CONFIG[Environment.DEVELOPMENT]
|
| 102 |
+
)
|
| 103 |
+
logging.basicConfig(
|
| 104 |
+
level=config["level"],
|
| 105 |
+
format=config["format"],
|
| 106 |
+
)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
@lru_cache
|
| 110 |
+
def get_settings() -> Settings:
|
| 111 |
+
"""Get cached settings instance.
|
| 112 |
+
|
| 113 |
+
Uses lru_cache to ensure settings are loaded only once and reused
|
| 114 |
+
across the application.
|
| 115 |
+
|
| 116 |
+
Returns:
|
| 117 |
+
Cached Settings instance.
|
| 118 |
+
"""
|
| 119 |
+
return Settings()
|
src/dashboard/__init__.py
ADDED
|
File without changes
|
src/dashboard/app.py
ADDED
|
@@ -0,0 +1,765 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Getaround Delay Analysis Dashboard.
|
| 2 |
+
|
| 3 |
+
Interactive dashboard to help PM decide on minimum delay threshold between rentals.
|
| 4 |
+
Answers key questions about delay impact and revenue implications.
|
| 5 |
+
Includes a pricing prediction section powered by the ML API.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
import httpx
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import plotly.express as px
|
| 14 |
+
import plotly.graph_objects as go
|
| 15 |
+
import streamlit as st
|
| 16 |
+
from plotly.subplots import make_subplots
|
| 17 |
+
|
| 18 |
+
from src.config.settings import get_settings
|
| 19 |
+
|
| 20 |
+
# Setup
|
| 21 |
+
DATA_PATH = (
|
| 22 |
+
Path(__file__).parent.parent.parent / "data" / "get_around_delay_analysis.csv"
|
| 23 |
+
)
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
# Page config
|
| 27 |
+
st.set_page_config(
|
| 28 |
+
page_title="Getaround Delay Analysis",
|
| 29 |
+
page_icon="GA",
|
| 30 |
+
layout="wide",
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
st.title("Getaround Delay Analysis Dashboard")
|
| 34 |
+
st.markdown("*Helping PM decide on minimum delay threshold between rentals*")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@st.cache_data
def load_data() -> pd.DataFrame:
    """Load and cache the delay analysis data.

    Cached by Streamlit so the CSV is read from disk once per session
    rather than on every widget interaction / script rerun.

    Returns:
        DataFrame with rental delay data.
    """
    # DATA_PATH is the module-level constant pointing at
    # data/get_around_delay_analysis.csv; FileNotFoundError is handled
    # by the caller (top-level try/except around load_data()).
    return pd.read_csv(DATA_PATH)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@st.cache_data
def prepare_data(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Derive the analysis datasets from the raw rental table.

    Args:
        df: Raw rental data as loaded from the CSV.

    Returns:
        Tuple of (ended_rentals, consecutive_rentals, problematic_consecutive),
        where the last element is the subset of consecutive rentals whose
        effective gap with the previous rental was negative.
    """
    # Completed rentals, flagged when the checkout happened after the due time.
    ended = df[df["state"] == "ended"].copy()
    ended["is_late"] = ended["delay_at_checkout_in_minutes"] > 0

    # Attach the checkout delay of the previous rental on the same car.
    previous = df[["rental_id", "delay_at_checkout_in_minutes"]].rename(
        columns={
            "rental_id": "previous_ended_rental_id",
            "delay_at_checkout_in_minutes": "previous_delay",
        }
    )
    consecutive = df[df["previous_ended_rental_id"].notna()].copy()
    consecutive = consecutive.merge(
        previous, on="previous_ended_rental_id", how="left"
    )

    # Effective buffer left for the next driver; a missing previous delay
    # counts as on-time (0). Negative buffer means the driver had to wait.
    consecutive["effective_time"] = (
        consecutive["time_delta_with_previous_rental_in_minutes"]
        - consecutive["previous_delay"].fillna(0)
    )
    consecutive["is_problematic"] = consecutive["effective_time"] < 0

    return ended, consecutive, consecutive[consecutive["is_problematic"]]
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def simulate_threshold(data: pd.DataFrame, threshold_minutes: int, scope: str) -> dict:
    """Simulate enforcing a minimum gap between consecutive rentals.

    Args:
        data: Consecutive-rental rows carrying ``checkin_type``,
            ``time_delta_with_previous_rental_in_minutes`` and
            ``is_problematic`` columns.
        threshold_minutes: Minimum required gap between rentals.
        scope: 'All rentals', 'Connect only', or 'Mobile only'.

    Returns:
        Dict with counts and percentages describing the threshold's impact.
    """
    # Map the UI scope label to a checkin_type filter; None means no filter.
    scope_filter = {"Connect only": "connect", "Mobile only": "mobile"}.get(scope)
    if scope_filter is None:
        subset = data.copy()
    else:
        subset = data[data["checkin_type"] == scope_filter].copy()

    n_subset = len(subset)
    if n_subset == 0:
        # Nothing to simulate for this scope.
        return {
            "affected_rentals": 0,
            "affected_pct": 0.0,
            "total_problematic": 0,
            "solved_cases": 0,
            "solved_pct": 0.0,
            "unsolved_cases": 0,
        }

    # Rentals whose scheduled gap falls below the proposed threshold would
    # have been blocked/rescheduled.
    gap = subset["time_delta_with_previous_rental_in_minutes"]
    n_affected = int((gap < threshold_minutes).sum())

    # Among the actually problematic cases, those with a scheduled gap below
    # the threshold would have been prevented ("solved").
    problematic = subset[subset["is_problematic"]]
    n_problematic = len(problematic)
    n_solved = int(
        (
            problematic["time_delta_with_previous_rental_in_minutes"]
            < threshold_minutes
        ).sum()
    )

    return {
        "affected_rentals": n_affected,
        "affected_pct": n_affected / n_subset * 100,
        "total_problematic": n_problematic,
        "solved_cases": n_solved,
        "solved_pct": n_solved / n_problematic * 100 if n_problematic > 0 else 0,
        "unsolved_cases": n_problematic - n_solved,
    }
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
# Load data
|
| 134 |
+
try:
|
| 135 |
+
df = load_data()
|
| 136 |
+
except FileNotFoundError:
|
| 137 |
+
st.error("Data file not found. Ensure data/get_around_delay_analysis.csv exists.")
|
| 138 |
+
st.stop()
|
| 139 |
+
except Exception as e:
|
| 140 |
+
st.error(f"Failed to load data: {e}")
|
| 141 |
+
st.stop()
|
| 142 |
+
|
| 143 |
+
ended, consecutive, problematic_cases = prepare_data(df)
|
| 144 |
+
|
| 145 |
+
# Sidebar - Controls
|
| 146 |
+
st.sidebar.header("Configuration")
|
| 147 |
+
scope = st.sidebar.selectbox(
|
| 148 |
+
"Scope",
|
| 149 |
+
options=["All rentals", "Connect only", "Mobile only"],
|
| 150 |
+
help="Which rental types to include in analysis",
|
| 151 |
+
)
|
| 152 |
+
threshold_minutes = st.sidebar.slider(
|
| 153 |
+
"Minimum delay threshold (minutes)",
|
| 154 |
+
min_value=0,
|
| 155 |
+
max_value=720,
|
| 156 |
+
value=60,
|
| 157 |
+
step=15,
|
| 158 |
+
help="Proposed minimum time between consecutive rentals",
|
| 159 |
+
)
|
| 160 |
+
|
| 161 |
+
# Filter data based on scope
|
| 162 |
+
if scope == "Connect only":
|
| 163 |
+
df_filtered = df[df["checkin_type"] == "connect"]
|
| 164 |
+
ended_filtered = ended[ended["checkin_type"] == "connect"]
|
| 165 |
+
consecutive_filtered = consecutive[consecutive["checkin_type"] == "connect"]
|
| 166 |
+
elif scope == "Mobile only":
|
| 167 |
+
df_filtered = df[df["checkin_type"] == "mobile"]
|
| 168 |
+
ended_filtered = ended[ended["checkin_type"] == "mobile"]
|
| 169 |
+
consecutive_filtered = consecutive[consecutive["checkin_type"] == "mobile"]
|
| 170 |
+
else:
|
| 171 |
+
df_filtered = df
|
| 172 |
+
ended_filtered = ended
|
| 173 |
+
consecutive_filtered = consecutive
|
| 174 |
+
|
| 175 |
+
# =============================================================================
|
| 176 |
+
# Section 1: Overview
|
| 177 |
+
# =============================================================================
|
| 178 |
+
st.header("1. Dataset Overview")
|
| 179 |
+
|
| 180 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 181 |
+
|
| 182 |
+
with col1:
|
| 183 |
+
st.metric("Total Rentals", f"{len(df_filtered):,}")
|
| 184 |
+
|
| 185 |
+
with col2:
|
| 186 |
+
ended_count = len(df_filtered[df_filtered["state"] == "ended"])
|
| 187 |
+
ended_pct = ended_count / len(df_filtered) * 100 if len(df_filtered) > 0 else 0
|
| 188 |
+
st.metric("Completed", f"{ended_count:,}", f"{ended_pct:.1f}%")
|
| 189 |
+
|
| 190 |
+
with col3:
|
| 191 |
+
canceled_count = len(df_filtered[df_filtered["state"] == "canceled"])
|
| 192 |
+
canceled_pct = (
|
| 193 |
+
canceled_count / len(df_filtered) * 100 if len(df_filtered) > 0 else 0
|
| 194 |
+
)
|
| 195 |
+
st.metric("Canceled", f"{canceled_count:,}", f"{canceled_pct:.1f}%")
|
| 196 |
+
|
| 197 |
+
with col4:
|
| 198 |
+
consecutive_count = len(consecutive_filtered)
|
| 199 |
+
consecutive_pct = (
|
| 200 |
+
consecutive_count / len(df_filtered) * 100 if len(df_filtered) > 0 else 0
|
| 201 |
+
)
|
| 202 |
+
st.metric(
|
| 203 |
+
"Consecutive Rentals", f"{consecutive_count:,}", f"{consecutive_pct:.1f}%"
|
| 204 |
+
)
|
| 205 |
+
|
| 206 |
+
# Rental type distribution chart
|
| 207 |
+
if scope == "All rentals":
|
| 208 |
+
col_chart1, col_chart2 = st.columns(2)
|
| 209 |
+
|
| 210 |
+
with col_chart1:
|
| 211 |
+
type_counts = df["checkin_type"].value_counts()
|
| 212 |
+
fig_type = px.pie(
|
| 213 |
+
values=type_counts.values,
|
| 214 |
+
names=type_counts.index,
|
| 215 |
+
title="Rentals by Checkin Type",
|
| 216 |
+
hole=0.4,
|
| 217 |
+
)
|
| 218 |
+
fig_type.update_traces(textinfo="label+percent")
|
| 219 |
+
st.plotly_chart(fig_type, use_container_width=True)
|
| 220 |
+
|
| 221 |
+
with col_chart2:
|
| 222 |
+
state_counts = df["state"].value_counts()
|
| 223 |
+
fig_state = px.pie(
|
| 224 |
+
values=state_counts.values,
|
| 225 |
+
names=state_counts.index,
|
| 226 |
+
title="Rentals by State",
|
| 227 |
+
hole=0.4,
|
| 228 |
+
)
|
| 229 |
+
fig_state.update_traces(textinfo="label+percent")
|
| 230 |
+
st.plotly_chart(fig_state, use_container_width=True)
|
| 231 |
+
|
| 232 |
+
# =============================================================================
|
| 233 |
+
# Section 2: Late Returns Analysis
|
| 234 |
+
# =============================================================================
|
| 235 |
+
st.header("2. Late Returns Analysis")
|
| 236 |
+
|
| 237 |
+
# Late return statistics
|
| 238 |
+
delay_data = ended_filtered["delay_at_checkout_in_minutes"].dropna()
|
| 239 |
+
late_count = (delay_data > 0).sum()
|
| 240 |
+
on_time_count = (delay_data <= 0).sum()
|
| 241 |
+
late_pct = late_count / len(delay_data) * 100 if len(delay_data) > 0 else 0
|
| 242 |
+
|
| 243 |
+
col1, col2, col3 = st.columns(3)
|
| 244 |
+
|
| 245 |
+
with col1:
|
| 246 |
+
st.metric("Late Returns", f"{late_count:,}", f"{late_pct:.1f}%")
|
| 247 |
+
|
| 248 |
+
with col2:
|
| 249 |
+
median_delay = delay_data.median() if len(delay_data) > 0 else 0
|
| 250 |
+
st.metric("Median Delay", f"{median_delay:.0f} min")
|
| 251 |
+
|
| 252 |
+
with col3:
|
| 253 |
+
mean_delay = delay_data.mean() if len(delay_data) > 0 else 0
|
| 254 |
+
st.metric("Mean Delay", f"{mean_delay:.0f} min")
|
| 255 |
+
|
| 256 |
+
# Visualizations
|
| 257 |
+
col_viz1, col_viz2 = st.columns(2)
|
| 258 |
+
|
| 259 |
+
with col_viz1:
|
| 260 |
+
# Pie chart: on-time vs late
|
| 261 |
+
fig_late = px.pie(
|
| 262 |
+
values=[on_time_count, late_count],
|
| 263 |
+
names=["On time or early", "Late"],
|
| 264 |
+
title="Return Timing Distribution",
|
| 265 |
+
hole=0.4,
|
| 266 |
+
color_discrete_sequence=["#2ecc71", "#e74c3c"],
|
| 267 |
+
)
|
| 268 |
+
fig_late.update_traces(textinfo="label+percent")
|
| 269 |
+
st.plotly_chart(fig_late, use_container_width=True)
|
| 270 |
+
|
| 271 |
+
with col_viz2:
|
| 272 |
+
# Histogram of delay distribution
|
| 273 |
+
delay_capped = delay_data.clip(-120, 360)
|
| 274 |
+
fig_hist = px.histogram(
|
| 275 |
+
delay_capped,
|
| 276 |
+
nbins=50,
|
| 277 |
+
title="Delay Distribution (capped at -2h to +6h)",
|
| 278 |
+
labels={"value": "Delay (minutes)", "count": "Number of rentals"},
|
| 279 |
+
)
|
| 280 |
+
fig_hist.add_vline(x=0, line_dash="dash", line_color="red")
|
| 281 |
+
fig_hist.update_layout(showlegend=False)
|
| 282 |
+
st.plotly_chart(fig_hist, use_container_width=True)
|
| 283 |
+
|
| 284 |
+
# Stats by rental type (only if showing all)
|
| 285 |
+
if scope == "All rentals":
|
| 286 |
+
st.subheader("Late Returns by Checkin Type")
|
| 287 |
+
|
| 288 |
+
late_by_type = ended.groupby("checkin_type").agg(
|
| 289 |
+
total=("rental_id", "count"),
|
| 290 |
+
late_count=("is_late", "sum"),
|
| 291 |
+
mean_delay=("delay_at_checkout_in_minutes", "mean"),
|
| 292 |
+
median_delay=("delay_at_checkout_in_minutes", "median"),
|
| 293 |
+
)
|
| 294 |
+
late_by_type["late_rate"] = late_by_type["late_count"] / late_by_type["total"] * 100
|
| 295 |
+
late_by_type = late_by_type.round(1)
|
| 296 |
+
|
| 297 |
+
st.dataframe(
|
| 298 |
+
late_by_type[
|
| 299 |
+
["total", "late_count", "late_rate", "mean_delay", "median_delay"]
|
| 300 |
+
],
|
| 301 |
+
use_container_width=True,
|
| 302 |
+
)
|
| 303 |
+
|
| 304 |
+
# =============================================================================
|
| 305 |
+
# Section 3: Impact on Next Rental
|
| 306 |
+
# =============================================================================
|
| 307 |
+
st.header("3. Impact on Next Driver")
|
| 308 |
+
|
| 309 |
+
st.markdown(
|
| 310 |
+
"""
|
| 311 |
+
Analysis of consecutive rentals on the same car
|
| 312 |
+
and how delays affect the next driver.
|
| 313 |
+
"""
|
| 314 |
+
)
|
| 315 |
+
|
| 316 |
+
# Key metrics
|
| 317 |
+
problematic_in_scope = consecutive_filtered[consecutive_filtered["is_problematic"]]
|
| 318 |
+
prob_count = len(problematic_in_scope)
|
| 319 |
+
prob_pct = (
|
| 320 |
+
prob_count / len(consecutive_filtered) * 100 if len(consecutive_filtered) > 0 else 0
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
col1, col2, col3 = st.columns(3)
|
| 324 |
+
|
| 325 |
+
with col1:
|
| 326 |
+
st.metric(
|
| 327 |
+
"Consecutive Rentals",
|
| 328 |
+
f"{len(consecutive_filtered):,}",
|
| 329 |
+
f"{len(consecutive_filtered) / len(df_filtered) * 100:.1f}% of total"
|
| 330 |
+
if len(df_filtered) > 0
|
| 331 |
+
else "0%",
|
| 332 |
+
)
|
| 333 |
+
|
| 334 |
+
with col2:
|
| 335 |
+
time_delta = consecutive_filtered["time_delta_with_previous_rental_in_minutes"]
|
| 336 |
+
median_gap = time_delta.median() if len(time_delta) > 0 else 0
|
| 337 |
+
st.metric("Median Gap Between Rentals", f"{median_gap:.0f} min")
|
| 338 |
+
|
| 339 |
+
with col3:
|
| 340 |
+
st.metric(
|
| 341 |
+
"Problematic Cases",
|
| 342 |
+
f"{prob_count:,}",
|
| 343 |
+
f"{prob_pct:.1f}% of consecutive",
|
| 344 |
+
)
|
| 345 |
+
|
| 346 |
+
# Time between consecutive rentals
|
| 347 |
+
col_viz1, col_viz2 = st.columns(2)
|
| 348 |
+
|
| 349 |
+
with col_viz1:
|
| 350 |
+
fig_delta = px.histogram(
|
| 351 |
+
time_delta.clip(0, 720),
|
| 352 |
+
nbins=40,
|
| 353 |
+
title="Scheduled Gap Between Consecutive Rentals",
|
| 354 |
+
labels={"value": "Time Delta (minutes)", "count": "Count"},
|
| 355 |
+
)
|
| 356 |
+
fig_delta.add_vline(
|
| 357 |
+
x=threshold_minutes,
|
| 358 |
+
line_dash="dash",
|
| 359 |
+
line_color="red",
|
| 360 |
+
annotation_text=f"Threshold: {threshold_minutes}min",
|
| 361 |
+
)
|
| 362 |
+
fig_delta.update_layout(showlegend=False)
|
| 363 |
+
st.plotly_chart(fig_delta, use_container_width=True)
|
| 364 |
+
|
| 365 |
+
with col_viz2:
|
| 366 |
+
# Wait time for impacted drivers
|
| 367 |
+
if len(problematic_in_scope) > 0:
|
| 368 |
+
wait_time = -problematic_in_scope["effective_time"]
|
| 369 |
+
fig_wait = px.histogram(
|
| 370 |
+
wait_time.clip(0, 240),
|
| 371 |
+
nbins=30,
|
| 372 |
+
title="Wait Time for Impacted Drivers",
|
| 373 |
+
labels={"value": "Wait Time (minutes)", "count": "Count"},
|
| 374 |
+
color_discrete_sequence=["#e74c3c"],
|
| 375 |
+
)
|
| 376 |
+
fig_wait.update_layout(showlegend=False)
|
| 377 |
+
st.plotly_chart(fig_wait, use_container_width=True)
|
| 378 |
+
else:
|
| 379 |
+
st.info("No problematic cases in the selected scope.")
|
| 380 |
+
|
| 381 |
+
# Problematic rate by type
|
| 382 |
+
if scope == "All rentals":
|
| 383 |
+
st.subheader("Problematic Cases by Checkin Type")
|
| 384 |
+
|
| 385 |
+
prob_by_type = consecutive.groupby("checkin_type").agg(
|
| 386 |
+
total=("rental_id", "count"),
|
| 387 |
+
problematic=("is_problematic", "sum"),
|
| 388 |
+
)
|
| 389 |
+
prob_by_type["rate_pct"] = (
|
| 390 |
+
prob_by_type["problematic"] / prob_by_type["total"] * 100
|
| 391 |
+
).round(1)
|
| 392 |
+
|
| 393 |
+
st.dataframe(prob_by_type, use_container_width=True)
|
| 394 |
+
|
| 395 |
+
# =============================================================================
|
| 396 |
+
# Section 4: Threshold Simulation
|
| 397 |
+
# =============================================================================
|
| 398 |
+
st.header("4. Threshold Impact Simulation")
|
| 399 |
+
|
| 400 |
+
st.markdown(
|
| 401 |
+
f"""
|
| 402 |
+
Simulating impact of **{threshold_minutes}-minute** threshold on **{scope}**.
|
| 403 |
+
"""
|
| 404 |
+
)
|
| 405 |
+
|
| 406 |
+
# Current threshold simulation
|
| 407 |
+
sim_result = simulate_threshold(consecutive, threshold_minutes, scope)
|
| 408 |
+
|
| 409 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 410 |
+
|
| 411 |
+
with col1:
|
| 412 |
+
st.metric(
|
| 413 |
+
"Affected Rentals",
|
| 414 |
+
f"{sim_result['affected_rentals']:,}",
|
| 415 |
+
f"{sim_result['affected_pct']:.1f}%",
|
| 416 |
+
)
|
| 417 |
+
|
| 418 |
+
with col2:
|
| 419 |
+
st.metric(
|
| 420 |
+
"Total Problematic",
|
| 421 |
+
f"{sim_result['total_problematic']:,}",
|
| 422 |
+
)
|
| 423 |
+
|
| 424 |
+
with col3:
|
| 425 |
+
st.metric(
|
| 426 |
+
"Problems Solved",
|
| 427 |
+
f"{sim_result['solved_cases']:,}",
|
| 428 |
+
f"{sim_result['solved_pct']:.1f}%",
|
| 429 |
+
)
|
| 430 |
+
|
| 431 |
+
with col4:
|
| 432 |
+
st.metric(
|
| 433 |
+
"Unsolved Cases",
|
| 434 |
+
f"{sim_result['unsolved_cases']:,}",
|
| 435 |
+
)
|
| 436 |
+
|
| 437 |
+
# Revenue impact estimation
|
| 438 |
+
st.subheader("Revenue Impact Estimation")
|
| 439 |
+
|
| 440 |
+
total_rentals = len(df)
|
| 441 |
+
affected_pct_total = sim_result["affected_rentals"] / total_rentals * 100
|
| 442 |
+
|
| 443 |
+
st.info(
|
| 444 |
+
f"""
|
| 445 |
+
**Share of owner revenue potentially affected:** {affected_pct_total:.2f}%
|
| 446 |
+
|
| 447 |
+
- Consecutive rentals represent \
|
| 448 |
+
{len(consecutive) / total_rentals * 100:.1f}% of all rentals
|
| 449 |
+
- With {threshold_minutes}-minute threshold on {scope.lower()}:
|
| 450 |
+
{sim_result["affected_rentals"]:,} rentals would need to be rescheduled
|
| 451 |
+
"""
|
| 452 |
+
)
|
| 453 |
+
|
| 454 |
+
# Comparison table for different thresholds
|
| 455 |
+
st.subheader("Threshold Comparison")
|
| 456 |
+
|
| 457 |
+
thresholds = [15, 30, 60, 90, 120, 180, 240, 360]
|
| 458 |
+
comparison_data = []
|
| 459 |
+
|
| 460 |
+
for t in thresholds:
|
| 461 |
+
result = simulate_threshold(consecutive, t, scope)
|
| 462 |
+
comparison_data.append(
|
| 463 |
+
{
|
| 464 |
+
"Threshold (min)": t,
|
| 465 |
+
"Affected Rentals": result["affected_rentals"],
|
| 466 |
+
"Affected %": f"{result['affected_pct']:.1f}%",
|
| 467 |
+
"Problems Solved": result["solved_cases"],
|
| 468 |
+
"Solved %": f"{result['solved_pct']:.1f}%",
|
| 469 |
+
"Unsolved": result["unsolved_cases"],
|
| 470 |
+
}
|
| 471 |
+
)
|
| 472 |
+
|
| 473 |
+
comparison_df = pd.DataFrame(comparison_data)
|
| 474 |
+
st.dataframe(comparison_df, use_container_width=True, hide_index=True)
|
| 475 |
+
|
| 476 |
+
# Trade-off visualization
|
| 477 |
+
st.subheader("Trade-off: Affected vs Solved")
|
| 478 |
+
|
| 479 |
+
tradeoff_data = []
|
| 480 |
+
for t in thresholds:
|
| 481 |
+
for s in ["All rentals", "Connect only"]:
|
| 482 |
+
result = simulate_threshold(consecutive, t, s)
|
| 483 |
+
tradeoff_data.append(
|
| 484 |
+
{
|
| 485 |
+
"threshold": t,
|
| 486 |
+
"scope": s,
|
| 487 |
+
"affected_pct": result["affected_pct"],
|
| 488 |
+
"solved_pct": result["solved_pct"],
|
| 489 |
+
}
|
| 490 |
+
)
|
| 491 |
+
|
| 492 |
+
tradeoff_df = pd.DataFrame(tradeoff_data)
|
| 493 |
+
|
| 494 |
+
fig_tradeoff = make_subplots(
|
| 495 |
+
rows=1,
|
| 496 |
+
cols=2,
|
| 497 |
+
subplot_titles=["All Rentals", "Connect Only"],
|
| 498 |
+
)
|
| 499 |
+
|
| 500 |
+
for i, s in enumerate(["All rentals", "Connect only"]):
|
| 501 |
+
scope_data = tradeoff_df[tradeoff_df["scope"] == s]
|
| 502 |
+
|
| 503 |
+
fig_tradeoff.add_trace(
|
| 504 |
+
go.Scatter(
|
| 505 |
+
x=scope_data["threshold"],
|
| 506 |
+
y=scope_data["affected_pct"],
|
| 507 |
+
name="Affected %",
|
| 508 |
+
mode="lines+markers",
|
| 509 |
+
line=dict(color="#e74c3c"),
|
| 510 |
+
showlegend=(i == 0),
|
| 511 |
+
),
|
| 512 |
+
row=1,
|
| 513 |
+
col=i + 1,
|
| 514 |
+
)
|
| 515 |
+
fig_tradeoff.add_trace(
|
| 516 |
+
go.Scatter(
|
| 517 |
+
x=scope_data["threshold"],
|
| 518 |
+
y=scope_data["solved_pct"],
|
| 519 |
+
name="Solved %",
|
| 520 |
+
mode="lines+markers",
|
| 521 |
+
line=dict(color="#2ecc71"),
|
| 522 |
+
showlegend=(i == 0),
|
| 523 |
+
),
|
| 524 |
+
row=1,
|
| 525 |
+
col=i + 1,
|
| 526 |
+
)
|
| 527 |
+
|
| 528 |
+
fig_tradeoff.update_xaxes(title_text="Threshold (minutes)")
|
| 529 |
+
fig_tradeoff.update_yaxes(title_text="Percentage")
|
| 530 |
+
fig_tradeoff.update_layout(height=400)
|
| 531 |
+
|
| 532 |
+
st.plotly_chart(fig_tradeoff, use_container_width=True)
|
| 533 |
+
|
| 534 |
+
# =============================================================================
|
| 535 |
+
# Section 5: Recommendations
|
| 536 |
+
# =============================================================================
|
| 537 |
+
st.header("5. Key Findings and Recommendations")
|
| 538 |
+
|
| 539 |
+
st.markdown(
|
| 540 |
+
"""
|
| 541 |
+
### Data Insights
|
| 542 |
+
|
| 543 |
+
| Metric | Value |
|
| 544 |
+
|--------|-------|
|
| 545 |
+
| Total rentals | {:,} |
|
| 546 |
+
| Mobile checkin share | {:.1f}% |
|
| 547 |
+
| Connect checkin share | {:.1f}% |
|
| 548 |
+
| Late return rate | {:.1f}% |
|
| 549 |
+
| Consecutive rentals | {:.1f}% of total |
|
| 550 |
+
| Problematic cases | {:.1f}% of consecutive |
|
| 551 |
+
|
| 552 |
+
### Recommendations
|
| 553 |
+
|
| 554 |
+
| Strategy | Threshold | Scope | Trade-off |
|
| 555 |
+
|----------|-----------|-------|-----------|
|
| 556 |
+
| Conservative | 60 min | Connect | ~45% solved, low impact |
|
| 557 |
+
| Balanced | 120 min | Connect | ~70% solved, moderate |
|
| 558 |
+
| Aggressive | 180 min | All | ~85% solved, high impact |
|
| 559 |
+
|
| 560 |
+
### Key Takeaways
|
| 561 |
+
|
| 562 |
+
1. **Connect-only scope is safer**: Affects only 20% of rentals while addressing
|
| 563 |
+
higher problematic rate (12% vs 8%)
|
| 564 |
+
|
| 565 |
+
2. **60-minute threshold is a good starting point**: Minimal revenue impact with
|
| 566 |
+
meaningful improvement in driver experience
|
| 567 |
+
|
| 568 |
+
3. **Problematic cases are relatively rare**: Only ~1.5% of all rentals are affected
|
| 569 |
+
by previous rental delays
|
| 570 |
+
|
| 571 |
+
4. **Mobile rentals have more volume but lower problematic rate**: Consider phased
|
| 572 |
+
rollout starting with Connect
|
| 573 |
+
""".format(
|
| 574 |
+
len(df),
|
| 575 |
+
len(df[df["checkin_type"] == "mobile"]) / len(df) * 100,
|
| 576 |
+
len(df[df["checkin_type"] == "connect"]) / len(df) * 100,
|
| 577 |
+
(ended["is_late"].sum() / len(ended) * 100),
|
| 578 |
+
len(consecutive) / len(df) * 100,
|
| 579 |
+
len(problematic_cases) / len(consecutive) * 100 if len(consecutive) > 0 else 0,
|
| 580 |
+
)
|
| 581 |
+
)
|
| 582 |
+
|
| 583 |
+
# =============================================================================
|
| 584 |
+
# Section 6: Pricing Prediction
|
| 585 |
+
# =============================================================================
|
| 586 |
+
st.header("6. Pricing Prediction")
|
| 587 |
+
st.markdown(
|
| 588 |
+
"Use the form below to estimate the daily rental price "
|
| 589 |
+
"for a car based on its features."
|
| 590 |
+
)
|
| 591 |
+
|
| 592 |
+
settings = get_settings()
|
| 593 |
+
api_url = settings.api_url
|
| 594 |
+
|
| 595 |
+
st.markdown(f"API Documentation: [{api_url}/docs]({api_url}/docs)")
|
| 596 |
+
|
| 597 |
+
with st.form("prediction_form"):
|
| 598 |
+
# Row 1 - Main characteristics
|
| 599 |
+
r1c1, r1c2, r1c3 = st.columns(3)
|
| 600 |
+
|
| 601 |
+
with r1c1:
|
| 602 |
+
model_key = st.selectbox(
|
| 603 |
+
"Brand",
|
| 604 |
+
options=[
|
| 605 |
+
"Alfa Romeo",
|
| 606 |
+
"Audi",
|
| 607 |
+
"BMW",
|
| 608 |
+
"Citroen",
|
| 609 |
+
"Fiat",
|
| 610 |
+
"Ford",
|
| 611 |
+
"KIA",
|
| 612 |
+
"Lamborghini",
|
| 613 |
+
"Lexus",
|
| 614 |
+
"Maserati",
|
| 615 |
+
"Mercedes",
|
| 616 |
+
"Mini",
|
| 617 |
+
"Mitsubishi",
|
| 618 |
+
"Nissan",
|
| 619 |
+
"Opel",
|
| 620 |
+
"PGO",
|
| 621 |
+
"Peugeot",
|
| 622 |
+
"Porsche",
|
| 623 |
+
"Renault",
|
| 624 |
+
"SEAT",
|
| 625 |
+
"Subaru",
|
| 626 |
+
"Suzuki",
|
| 627 |
+
"Toyota",
|
| 628 |
+
"Volkswagen",
|
| 629 |
+
"Yamaha",
|
| 630 |
+
],
|
| 631 |
+
index=16, # Peugeot
|
| 632 |
+
)
|
| 633 |
+
|
| 634 |
+
with r1c2:
|
| 635 |
+
fuel = st.selectbox(
|
| 636 |
+
"Fuel",
|
| 637 |
+
options=[
|
| 638 |
+
"diesel",
|
| 639 |
+
"petrol",
|
| 640 |
+
"hybrid_petrol",
|
| 641 |
+
"electro",
|
| 642 |
+
],
|
| 643 |
+
index=0,
|
| 644 |
+
)
|
| 645 |
+
|
| 646 |
+
with r1c3:
|
| 647 |
+
car_type = st.selectbox(
|
| 648 |
+
"Car Type",
|
| 649 |
+
options=[
|
| 650 |
+
"sedan",
|
| 651 |
+
"hatchback",
|
| 652 |
+
"suv",
|
| 653 |
+
"van",
|
| 654 |
+
"estate",
|
| 655 |
+
"convertible",
|
| 656 |
+
"coupe",
|
| 657 |
+
"subcompact",
|
| 658 |
+
],
|
| 659 |
+
index=0,
|
| 660 |
+
)
|
| 661 |
+
|
| 662 |
+
# Row 2 - Specifications
|
| 663 |
+
r2c1, r2c2, r2c3 = st.columns(3)
|
| 664 |
+
|
| 665 |
+
with r2c1:
|
| 666 |
+
mileage = st.number_input(
|
| 667 |
+
"Mileage (km)",
|
| 668 |
+
min_value=0,
|
| 669 |
+
max_value=500_000,
|
| 670 |
+
value=100_000,
|
| 671 |
+
step=5_000,
|
| 672 |
+
)
|
| 673 |
+
|
| 674 |
+
with r2c2:
|
| 675 |
+
engine_power = st.number_input(
|
| 676 |
+
"Engine Power (hp)",
|
| 677 |
+
min_value=10,
|
| 678 |
+
max_value=500,
|
| 679 |
+
value=120,
|
| 680 |
+
step=10,
|
| 681 |
+
)
|
| 682 |
+
|
| 683 |
+
with r2c3:
|
| 684 |
+
paint_color = st.selectbox(
|
| 685 |
+
"Paint Color",
|
| 686 |
+
options=[
|
| 687 |
+
"black",
|
| 688 |
+
"white",
|
| 689 |
+
"grey",
|
| 690 |
+
"silver",
|
| 691 |
+
"blue",
|
| 692 |
+
"red",
|
| 693 |
+
"beige",
|
| 694 |
+
"brown",
|
| 695 |
+
"green",
|
| 696 |
+
"orange",
|
| 697 |
+
],
|
| 698 |
+
index=0,
|
| 699 |
+
)
|
| 700 |
+
|
| 701 |
+
# Row 3 - Equipment
|
| 702 |
+
st.markdown("**Equipment**")
|
| 703 |
+
r3c1, r3c2, r3c3, r3c4 = st.columns(4)
|
| 704 |
+
|
| 705 |
+
with r3c1:
|
| 706 |
+
private_parking = st.checkbox("Private parking", value=False)
|
| 707 |
+
has_gps = st.checkbox("GPS", value=False)
|
| 708 |
+
|
| 709 |
+
with r3c2:
|
| 710 |
+
has_ac = st.checkbox("Air conditioning", value=True)
|
| 711 |
+
automatic = st.checkbox("Automatic transmission", value=False)
|
| 712 |
+
|
| 713 |
+
with r3c3:
|
| 714 |
+
has_connect = st.checkbox("Getaround Connect", value=False)
|
| 715 |
+
has_regulator = st.checkbox("Speed regulator", value=False)
|
| 716 |
+
|
| 717 |
+
with r3c4:
|
| 718 |
+
winter_tires = st.checkbox("Winter tires", value=False)
|
| 719 |
+
|
| 720 |
+
submitted = st.form_submit_button("Predict Price")
|
| 721 |
+
|
| 722 |
+
if submitted:
|
| 723 |
+
payload = {
|
| 724 |
+
"cars": [
|
| 725 |
+
{
|
| 726 |
+
"model_key": model_key,
|
| 727 |
+
"mileage": mileage,
|
| 728 |
+
"engine_power": engine_power,
|
| 729 |
+
"fuel": fuel,
|
| 730 |
+
"paint_color": paint_color,
|
| 731 |
+
"car_type": car_type,
|
| 732 |
+
"private_parking_available": private_parking,
|
| 733 |
+
"has_gps": has_gps,
|
| 734 |
+
"has_air_conditioning": has_ac,
|
| 735 |
+
"automatic_car": automatic,
|
| 736 |
+
"has_getaround_connect": has_connect,
|
| 737 |
+
"has_speed_regulator": has_regulator,
|
| 738 |
+
"winter_tires": winter_tires,
|
| 739 |
+
}
|
| 740 |
+
]
|
| 741 |
+
}
|
| 742 |
+
|
| 743 |
+
try:
|
| 744 |
+
response = httpx.post(
|
| 745 |
+
f"{api_url}/predict",
|
| 746 |
+
json=payload,
|
| 747 |
+
timeout=10.0,
|
| 748 |
+
)
|
| 749 |
+
response.raise_for_status()
|
| 750 |
+
result = response.json()
|
| 751 |
+
price = result["prediction"][0]
|
| 752 |
+
st.success(f"Estimated daily rental price: {price} EUR/day")
|
| 753 |
+
except httpx.ConnectError:
|
| 754 |
+
st.error(
|
| 755 |
+
f"Cannot connect to API at {api_url}. Ensure the API server is running."
|
| 756 |
+
)
|
| 757 |
+
except httpx.HTTPStatusError as exc:
|
| 758 |
+
st.error(f"API error (HTTP {exc.response.status_code}): {exc.response.text}")
|
| 759 |
+
except Exception as exc:
|
| 760 |
+
logger.exception("Prediction request failed")
|
| 761 |
+
st.error(f"Prediction failed: {exc}")
|
| 762 |
+
|
| 763 |
+
# Footer
|
| 764 |
+
st.markdown("---")
|
| 765 |
+
st.caption("Dashboard built for Getaround PM team")
|
src/ml/__init__.py
ADDED
|
File without changes
|
src/ml/predict.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model inference for pricing predictions."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Union
|
| 6 |
+
|
| 7 |
+
import joblib
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)

DEFAULT_MODEL_PATH = (
    Path(__file__).parent.parent.parent / "models" / "best_model.joblib"
)

# Lazily-created module-level singleton, managed by get_predictor().
_predictor_instance: "PricingPredictor | None" = None


class ModelNotFoundError(Exception):
    """Raised when the model file cannot be found."""


class PricingPredictor:
    """Predictor for daily car rental pricing backed by a joblib model."""

    def __init__(self, model_path: Union[str, Path] = DEFAULT_MODEL_PATH) -> None:
        """Initialize predictor with trained model.

        Args:
            model_path: Path to the trained model file.

        Raises:
            ModelNotFoundError: If the model file does not exist.
        """
        self.model_path = Path(model_path)
        self.model = None
        self._load_model()

    def _load_model(self) -> None:
        """Load the model from disk into ``self.model``.

        Raises:
            ModelNotFoundError: If the model file does not exist.
        """
        if not self.model_path.exists():
            logger.error("Model file not found: %s", self.model_path)
            raise ModelNotFoundError(f"Model not found at {self.model_path}")

        logger.info("Loading model from %s", self.model_path)
        self.model = joblib.load(self.model_path)
        logger.info("Model loaded successfully")

    def _predict_frame(self, df: pd.DataFrame) -> list[int]:
        """Run the model on a feature frame and round prices to integers.

        Shared by both public prediction entry points so the
        predict-and-round logic lives in exactly one place.
        """
        predictions = self.model.predict(df)
        return [int(round(p)) for p in predictions]

    def predict_from_dict(self, data: dict) -> list[int]:
        """Make predictions from column-oriented dictionary input.

        Args:
            data: Dictionary with feature names as keys and list values.

        Returns:
            List of predicted prices (rounded to int).
        """
        return self._predict_frame(pd.DataFrame(data))

    def predict_from_features(self, cars: list[dict]) -> list[int]:
        """Make predictions from a list of per-car feature dictionaries.

        Args:
            cars: List of dictionaries with car features.

        Returns:
            List of predicted prices (rounded to int).
        """
        logger.debug("Predicting for %d cars", len(cars))
        return self._predict_frame(pd.DataFrame(cars))


def get_predictor(
    model_path: Union[str, Path] = DEFAULT_MODEL_PATH,
) -> PricingPredictor:
    """Get the singleton predictor instance, creating it on first call.

    Args:
        model_path: Path to the trained model file (only honored on the
            first call; subsequent calls return the cached instance).

    Returns:
        Singleton PricingPredictor instance.

    Raises:
        ModelNotFoundError: If the model file does not exist on first load.
    """
    global _predictor_instance
    if _predictor_instance is None:
        _predictor_instance = PricingPredictor(model_path)
    return _predictor_instance
|
src/ml/preprocessing.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Feature engineering and preprocessing for pricing model."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from typing import Tuple
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from sklearn.compose import ColumnTransformer
|
| 8 |
+
from sklearn.preprocessing import OneHotEncoder, StandardScaler
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Feature definitions based on EDA
|
| 14 |
+
CATEGORICAL_FEATURES = [
|
| 15 |
+
"model_key",
|
| 16 |
+
"fuel",
|
| 17 |
+
"paint_color",
|
| 18 |
+
"car_type",
|
| 19 |
+
]
|
| 20 |
+
|
| 21 |
+
BOOLEAN_FEATURES = [
|
| 22 |
+
"private_parking_available",
|
| 23 |
+
"has_gps",
|
| 24 |
+
"has_air_conditioning",
|
| 25 |
+
"automatic_car",
|
| 26 |
+
"has_getaround_connect",
|
| 27 |
+
"has_speed_regulator",
|
| 28 |
+
"winter_tires",
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
NUMERICAL_FEATURES = [
|
| 32 |
+
"mileage",
|
| 33 |
+
"engine_power",
|
| 34 |
+
]
|
| 35 |
+
|
| 36 |
+
TARGET = "rental_price_per_day"
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def load_data(filepath: str) -> pd.DataFrame:
    """Read the pricing dataset from a CSV file.

    Args:
        filepath: Path to the CSV file.

    Returns:
        DataFrame with the loaded data (first CSV column used as index).

    Raises:
        FileNotFoundError: If file does not exist.
        pd.errors.ParserError: If CSV parsing fails.
    """
    logger.info("Loading data from %s", filepath)
    frame = pd.read_csv(filepath, index_col=0)
    rows, cols = frame.shape
    logger.info("Loaded %d rows, %d columns", rows, cols)
    return frame
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def create_preprocessor() -> ColumnTransformer:
    """Build the sklearn preprocessing stage for the pricing features.

    The returned transformer:
    - standard-scales the numerical columns (mileage, engine_power),
    - passes the boolean flags through untouched (already 0/1),
    - one-hot encodes the categorical columns, ignoring unseen levels,
    - drops any other column.

    Returns:
        ColumnTransformer configured for all feature types.
    """
    transformer_specs = [
        ("num", StandardScaler(), NUMERICAL_FEATURES),
        ("bool", "passthrough", BOOLEAN_FEATURES),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            CATEGORICAL_FEATURES,
        ),
    ]
    preprocessor = ColumnTransformer(
        transformers=transformer_specs,
        remainder="drop",
    )
    logger.debug(
        "Created preprocessor with %d transformers", len(preprocessor.transformers)
    )
    return preprocessor
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def prepare_features(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """Split dataframe into features X and target y.

    Converts boolean columns to int (0/1) for sklearn compatibility.

    Args:
        df: DataFrame with all columns including target.

    Returns:
        Tuple of (X, y) where X is features DataFrame and y is target Series.

    Raises:
        KeyError: If required columns are missing.
    """
    required_cols = (
        NUMERICAL_FEATURES + BOOLEAN_FEATURES + CATEGORICAL_FEATURES + [TARGET]
    )
    missing_cols = set(required_cols) - set(df.columns)
    if missing_cols:
        # Sort so the error message is deterministic (set iteration order
        # would otherwise vary between runs).
        raise KeyError(f"Missing columns: {sorted(missing_cols)}")

    feature_cols = NUMERICAL_FEATURES + BOOLEAN_FEATURES + CATEGORICAL_FEATURES
    X = df[feature_cols].copy()

    # Coerce boolean flags to 0/1 ints for sklearn compatibility.
    for col in BOOLEAN_FEATURES:
        X[col] = X[col].astype(int)

    y = df[TARGET].copy()

    logger.info("Prepared features: X shape %s, y shape %s", X.shape, y.shape)
    return X, y
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def get_feature_names() -> list[str]:
    """Return every feature name used by the model.

    Returns:
        Feature names ordered as numerical, then boolean, then categorical.
    """
    return [*NUMERICAL_FEATURES, *BOOLEAN_FEATURES, *CATEGORICAL_FEATURES]
|
src/ml/train.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Model training with MLflow tracking."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import joblib
|
| 7 |
+
import mlflow
|
| 8 |
+
import mlflow.sklearn
|
| 9 |
+
import numpy as np
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
|
| 12 |
+
from sklearn.linear_model import LinearRegression
|
| 13 |
+
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
| 14 |
+
from sklearn.model_selection import train_test_split
|
| 15 |
+
from sklearn.pipeline import Pipeline
|
| 16 |
+
|
| 17 |
+
from src.ml.preprocessing import (
|
| 18 |
+
create_preprocessor,
|
| 19 |
+
load_data,
|
| 20 |
+
prepare_features,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
MODELS = {
|
| 26 |
+
"linear_regression": LinearRegression(),
|
| 27 |
+
"random_forest": RandomForestRegressor(n_estimators=100, random_state=42),
|
| 28 |
+
"gradient_boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
MODELS_DIR = Path(__file__).parent.parent.parent / "models"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def train_and_evaluate(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    model_name: str,
    model,
    preprocessor,
) -> dict:
    """Fit one candidate model and score it on the held-out split.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training target.
        y_test: Test target.
        model_name: Name of the model for logging.
        model: Sklearn estimator instance.
        preprocessor: Sklearn preprocessor (ColumnTransformer).

    Returns:
        Dictionary with model name, pipeline, and metrics (rmse, mae, r2).
    """
    full_pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

    logger.info("Training %s...", model_name)
    full_pipeline.fit(X_train, y_train)

    predictions = full_pipeline.predict(X_test)

    metrics = {
        "rmse": float(np.sqrt(mean_squared_error(y_test, predictions))),
        "mae": float(mean_absolute_error(y_test, predictions)),
        "r2": float(r2_score(y_test, predictions)),
    }

    logger.info(
        "%s - RMSE: %.2f, MAE: %.2f, R2: %.3f",
        model_name,
        metrics["rmse"],
        metrics["mae"],
        metrics["r2"],
    )

    return {"model_name": model_name, "pipeline": full_pipeline, **metrics}
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def run_experiment(data_path: str, experiment_name: str = "getaround_pricing") -> str:
    """Run full training experiment, return best model path.

    Trains every candidate in MODELS, logs params/metrics/artifacts to
    MLflow, then persists the lowest-RMSE pipeline to disk.

    Args:
        data_path: Path to the CSV data file.
        experiment_name: MLflow experiment name.

    Returns:
        Path to the saved best model.
    """
    mlflow.set_experiment(experiment_name)

    logger.info("Loading data from %s", data_path)
    df = load_data(data_path)

    logger.info("Preparing features...")
    X, y = prepare_features(df)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    logger.info(
        "Train/test split: %d train samples, %d test samples",
        len(X_train),
        len(X_test),
    )

    results = []

    for model_name, model in MODELS.items():
        with mlflow.start_run(run_name=model_name):
            # Build a fresh preprocessor for each model: sklearn Pipelines do
            # not clone their steps, so sharing one ColumnTransformer instance
            # would make every result pipeline alias the same fitted object.
            result = train_and_evaluate(
                X_train,
                X_test,
                y_train,
                y_test,
                model_name,
                model,
                create_preprocessor(),
            )
            results.append(result)

            mlflow.log_param("model_name", model_name)
            mlflow.log_metric("rmse", result["rmse"])
            mlflow.log_metric("mae", result["mae"])
            mlflow.log_metric("r2", result["r2"])

            mlflow.sklearn.log_model(result["pipeline"], "model")

    # Select the winner on lowest test RMSE.
    best_result = min(results, key=lambda x: x["rmse"])
    logger.info(
        "Best model: %s with RMSE=%.2f",
        best_result["model_name"],
        best_result["rmse"],
    )

    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    model_path = MODELS_DIR / "best_model.joblib"
    joblib.dump(best_result["pipeline"], model_path)
    logger.info("Best model saved to %s", model_path)

    return str(model_path)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.INFO)
    # Optional CLI argument: path to the pricing CSV; falls back to the
    # default dataset location when not provided.
    data_path = (
        sys.argv[1] if len(sys.argv) > 1 else "data/get_around_pricing_project.csv"
    )
    run_experiment(data_path)
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|