maribakulj commited on
Commit
0209857
·
unverified ·
2 Parent(s): 7a648f535a94af

Merge pull request #30 from maribakulj/claude/code-review-analysis-qDhlH

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. Dockerfile +1 -1
  2. backend/app/api/v1/corpora.py +12 -8
  3. backend/app/api/v1/export.py +10 -4
  4. backend/app/api/v1/ingest.py +80 -11
  5. backend/app/api/v1/jobs.py +6 -2
  6. backend/app/api/v1/models_api.py +10 -9
  7. backend/app/api/v1/pages.py +6 -6
  8. backend/app/api/v1/profiles.py +25 -8
  9. backend/app/api/v1/search.py +29 -24
  10. backend/app/config.py +10 -24
  11. backend/app/main.py +5 -4
  12. backend/app/models/corpus.py +7 -2
  13. backend/app/models/job.py +5 -2
  14. backend/app/schemas/page_master.py +24 -5
  15. backend/app/services/ai/__init__.py +23 -11
  16. backend/app/services/ai/analyzer.py +22 -15
  17. backend/app/services/ai/master_writer.py +24 -24
  18. backend/app/services/ai/model_registry.py +18 -5
  19. backend/app/services/ai/prompt_loader.py +6 -0
  20. backend/app/services/ai/provider_google_ai.py +11 -4
  21. backend/app/services/ai/provider_mistral.py +16 -8
  22. backend/app/services/ai/provider_vertex_key.py +0 -3
  23. backend/app/services/ai/provider_vertex_sa.py +11 -4
  24. backend/app/services/corpus_runner.py +2 -1
  25. backend/app/services/export/alto.py +5 -9
  26. backend/app/services/export/iiif.py +5 -5
  27. backend/app/services/export/mets.py +8 -3
  28. backend/app/services/ingest/iiif_fetcher.py +0 -1
  29. backend/app/services/job_runner.py +4 -2
  30. backend/pyproject.toml +1 -0
  31. backend/tests/conftest_api.py +5 -4
  32. backend/tests/test_ai_analyzer.py +26 -25
  33. backend/tests/test_api_corrections.py +10 -3
  34. backend/tests/test_api_export.py +1 -1
  35. backend/tests/test_api_ingest.py +9 -0
  36. backend/tests/test_api_models.py +4 -4
  37. backend/tests/test_api_pages.py +1 -1
  38. backend/tests/test_api_providers.py +11 -11
  39. backend/tests/test_api_search.py +1 -1
  40. backend/tests/test_export_alto.py +3 -2
  41. backend/tests/test_export_iiif.py +13 -10
  42. backend/tests/test_export_mets.py +6 -3
  43. backend/tests/test_image_pipeline.py +0 -1
  44. backend/tests/test_job_runner.py +28 -16
  45. backend/tests/test_security.py +215 -0
  46. frontend/src/App.tsx +1 -0
  47. frontend/src/lib/api.ts +8 -0
  48. frontend/src/pages/Editor.tsx +1 -1
  49. frontend/src/pages/Home.tsx +3 -2
  50. infra/Dockerfile +0 -71
Dockerfile CHANGED
@@ -1,6 +1,6 @@
1
  # Scriptorium AI — image de production (multi-stage)
2
  # Ce fichier est utilisé par HuggingFace Spaces (SDK docker, détection automatique).
3
- # Il doit rester synchronisé avec infra/Dockerfile.
4
  #
5
  # Build depuis la racine du dépôt :
6
  # docker build -t scriptorium-ai .
 
1
  # Scriptorium AI — image de production (multi-stage)
2
  # Ce fichier est utilisé par HuggingFace Spaces (SDK docker, détection automatique).
3
+ # Source unique — le fichier infra/Dockerfile a été supprimé pour éviter la divergence.
4
  #
5
  # Build depuis la racine du dépôt :
6
  # docker build -t scriptorium-ai .
backend/app/api/v1/corpora.py CHANGED
@@ -14,8 +14,8 @@ import uuid
14
  from datetime import datetime, timezone
15
 
16
  # 2. third-party
17
- from fastapi import APIRouter, Depends, HTTPException
18
- from pydantic import BaseModel, ConfigDict
19
  from sqlalchemy import select
20
  from sqlalchemy.ext.asyncio import AsyncSession
21
 
@@ -29,9 +29,9 @@ router = APIRouter(prefix="/corpora", tags=["corpora"])
29
  # ── Schémas de requête / réponse ─────────────────────────────────────────────
30
 
31
  class CorpusCreate(BaseModel):
32
- slug: str
33
- title: str
34
- profile_id: str
35
 
36
 
37
  class CorpusResponse(BaseModel):
@@ -59,9 +59,13 @@ class ManuscriptResponse(BaseModel):
59
  # ── Endpoints ────────────────────────────────────────────────────────────────
60
 
61
  @router.get("", response_model=list[CorpusResponse])
62
- async def list_corpora(db: AsyncSession = Depends(get_db)) -> list[CorpusModel]:
63
- """Retourne tous les corpus enregistrés."""
64
- result = await db.execute(select(CorpusModel))
 
 
 
 
65
  return list(result.scalars().all())
66
 
67
 
 
14
  from datetime import datetime, timezone
15
 
16
  # 2. third-party
17
+ from fastapi import APIRouter, Depends, HTTPException, Query
18
+ from pydantic import BaseModel, ConfigDict, Field
19
  from sqlalchemy import select
20
  from sqlalchemy.ext.asyncio import AsyncSession
21
 
 
29
  # ── Schémas de requête / réponse ─────────────────────────────────────────────
30
 
31
  class CorpusCreate(BaseModel):
32
+ slug: str = Field(..., pattern=r"^[a-z0-9][a-z0-9_-]{0,63}$")
33
+ title: str = Field(..., min_length=1, max_length=256)
34
+ profile_id: str = Field(..., pattern=r"^[a-z0-9][a-z0-9_-]*$")
35
 
36
 
37
  class CorpusResponse(BaseModel):
 
59
  # ── Endpoints ────────────────────────────────────────────────────────────────
60
 
61
  @router.get("", response_model=list[CorpusResponse])
62
+ async def list_corpora(
63
+ db: AsyncSession = Depends(get_db),
64
+ skip: int = Query(0, ge=0, description="Nombre d'éléments à sauter"),
65
+ limit: int = Query(100, ge=1, le=1000, description="Nombre maximum d'éléments"),
66
+ ) -> list[CorpusModel]:
67
+ """Retourne les corpus enregistrés (paginé)."""
68
+ result = await db.execute(select(CorpusModel).offset(skip).limit(limit))
69
  return list(result.scalars().all())
70
 
71
 
backend/app/api/v1/export.py CHANGED
@@ -10,6 +10,7 @@ Règle (R02) : toutes les sorties sont générées depuis les PageMasters
10
  (master.json), jamais depuis les réponses brutes de l'IA.
11
  """
12
  # 1. stdlib
 
13
  import io
14
  import json
15
  import logging
@@ -66,7 +67,7 @@ async def _load_manuscript_with_masters(
66
 
67
  masters: list[PageMaster] = []
68
  for page in pages:
69
- master = _read_master_json(corpus.slug, page.id)
70
  if master is not None:
71
  masters.append(master)
72
 
@@ -79,8 +80,8 @@ async def _load_manuscript_with_masters(
79
  return manuscript, corpus, masters
80
 
81
 
82
- def _read_master_json(corpus_slug: str, page_id: str) -> PageMaster | None:
83
- """Lit le master.json d'une page depuis data/. Retourne None si absent."""
84
  path = (
85
  _config_module.settings.data_dir
86
  / "corpora"
@@ -95,6 +96,11 @@ def _read_master_json(corpus_slug: str, page_id: str) -> PageMaster | None:
95
  return PageMaster.model_validate(raw)
96
 
97
 
 
 
 
 
 
98
  def _build_manuscript_meta(
99
  manuscript: ManuscriptModel, corpus: CorpusModel
100
  ) -> dict:
@@ -154,7 +160,7 @@ async def get_alto(page_id: str, db: AsyncSession = Depends(get_db)) -> Response
154
  manuscript = await db.get(ManuscriptModel, page.manuscript_id)
155
  corpus = await db.get(CorpusModel, manuscript.corpus_id)
156
 
157
- master = _read_master_json(corpus.slug, page_id)
158
  if master is None:
159
  raise HTTPException(
160
  status_code=404,
 
10
  (master.json), jamais depuis les réponses brutes de l'IA.
11
  """
12
  # 1. stdlib
13
+ import asyncio
14
  import io
15
  import json
16
  import logging
 
67
 
68
  masters: list[PageMaster] = []
69
  for page in pages:
70
+ master = await _read_master_json(corpus.slug, page.id)
71
  if master is not None:
72
  masters.append(master)
73
 
 
80
  return manuscript, corpus, masters
81
 
82
 
83
+ def _read_master_json_sync(corpus_slug: str, page_id: str) -> PageMaster | None:
84
+ """Lit le master.json d'une page depuis data/. Retourne None si absent (bloquant)."""
85
  path = (
86
  _config_module.settings.data_dir
87
  / "corpora"
 
96
  return PageMaster.model_validate(raw)
97
 
98
 
99
+ async def _read_master_json(corpus_slug: str, page_id: str) -> PageMaster | None:
100
+ """Version async — délègue la lecture au threadpool."""
101
+ return await asyncio.to_thread(_read_master_json_sync, corpus_slug, page_id)
102
+
103
+
104
  def _build_manuscript_meta(
105
  manuscript: ManuscriptModel, corpus: CorpusModel
106
  ) -> dict:
 
160
  manuscript = await db.get(ManuscriptModel, page.manuscript_id)
161
  corpus = await db.get(CorpusModel, manuscript.corpus_id)
162
 
163
+ master = await _read_master_json(corpus.slug, page_id)
164
  if master is None:
165
  raise HTTPException(
166
  status_code=404,
backend/app/api/v1/ingest.py CHANGED
@@ -11,13 +11,14 @@ Règle : ingestion = création des PageModel en BDD uniquement.
11
  """
12
  # 1. stdlib
13
  import logging
 
14
  import uuid
15
  from pathlib import Path
16
 
17
  # 2. third-party
18
  import httpx
19
  from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
20
- from pydantic import BaseModel
21
  from sqlalchemy import func, select
22
  from sqlalchemy.ext.asyncio import AsyncSession
23
 
@@ -30,6 +31,28 @@ logger = logging.getLogger(__name__)
30
 
31
  router = APIRouter(tags=["ingestion"])
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  # ── Schémas ───────────────────────────────────────────────────────────────────
35
 
@@ -38,8 +61,8 @@ class IIIFManifestRequest(BaseModel):
38
 
39
 
40
  class IIIFImagesRequest(BaseModel):
41
- urls: list[str]
42
- folio_labels: list[str]
43
 
44
 
45
  class IngestResponse(BaseModel):
@@ -144,11 +167,31 @@ _MANIFEST_HEADERS = {
144
  }
145
 
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  async def _fetch_json_manifest(url: str) -> dict:
148
- """Télécharge un manifest IIIF. Fonction isolée pour faciliter les tests."""
 
149
  async with httpx.AsyncClient() as client:
150
  resp = await client.get(url, headers=_MANIFEST_HEADERS, follow_redirects=True, timeout=30.0)
151
  resp.raise_for_status()
 
 
152
  return resp.json()
153
 
154
 
@@ -202,16 +245,33 @@ async def ingest_files(
202
  seq = await _next_sequence(db, ms.id)
203
 
204
  # Collect labels and detect duplicates
205
- labels = [Path(f.filename or f"file_{i}").stem for i, f in enumerate(files)]
206
  dupes = _find_duplicate_labels(labels)
207
 
208
  created: list[PageModel] = []
 
209
  skipped = 0
210
  for i, upload in enumerate(files):
211
- filename = Path(upload.filename or f"file_{i}").name
 
 
 
 
 
 
 
 
212
  folio_label = labels[i]
213
  page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
214
 
 
 
 
 
 
 
 
 
215
  master_dir = (
216
  _config_module.settings.data_dir
217
  / "corpora"
@@ -221,8 +281,8 @@ async def ingest_files(
221
  )
222
  master_dir.mkdir(parents=True, exist_ok=True)
223
  master_path = master_dir / filename
224
- content = await upload.read()
225
  master_path.write_bytes(content)
 
226
 
227
  page = await _create_page(
228
  db, ms.id, page_id, folio_label, seq + i,
@@ -234,7 +294,13 @@ async def ingest_files(
234
  created.append(page)
235
 
236
  ms.total_pages = (ms.total_pages or 0) + len(created)
237
- await db.commit()
 
 
 
 
 
 
238
 
239
  logger.info(
240
  "Fichiers ingérés",
@@ -260,6 +326,8 @@ async def ingest_iiif_manifest(
260
 
261
  try:
262
  manifest = await _fetch_json_manifest(body.manifest_url)
 
 
263
  except httpx.HTTPStatusError as exc:
264
  raise HTTPException(
265
  status_code=502,
@@ -302,7 +370,7 @@ async def ingest_iiif_manifest(
302
  seq = await _next_sequence(db, ms.id)
303
 
304
  # Collect labels and detect duplicates
305
- labels = [_extract_canvas_label(canvas, i) for i, canvas in enumerate(canvases)]
306
  dupes = _find_duplicate_labels(labels)
307
 
308
  created: list[PageModel] = []
@@ -358,11 +426,12 @@ async def ingest_iiif_images(
358
  ms = await _get_or_create_manuscript(db, corpus_id)
359
  seq = await _next_sequence(db, ms.id)
360
 
361
- dupes = _find_duplicate_labels(body.folio_labels)
 
362
 
363
  created: list[PageModel] = []
364
  skipped = 0
365
- for i, (url, folio_label) in enumerate(zip(body.urls, body.folio_labels)):
366
  page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
367
  page = await _create_page(
368
  db, ms.id, page_id, folio_label, seq + i,
 
11
  """
12
  # 1. stdlib
13
  import logging
14
+ import re
15
  import uuid
16
  from pathlib import Path
17
 
18
  # 2. third-party
19
  import httpx
20
  from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
21
+ from pydantic import BaseModel, Field
22
  from sqlalchemy import func, select
23
  from sqlalchemy.ext.asyncio import AsyncSession
24
 
 
31
 
32
  router = APIRouter(tags=["ingestion"])
33
 
34
+ # ── Constantes de sécurité ────────────────────────────────────────────────────
35
+
36
+ _SAFE_LABEL_RE = re.compile(r"^[\w\-\.]+$")
37
+ _MAX_UPLOAD_BYTES = 100 * 1024 * 1024 # 100 Mo par fichier
38
+ _ALLOWED_MIME_PREFIXES = ("image/",)
39
+
40
+
41
+ def _sanitize_label(label: str) -> str:
42
+ """Nettoie un folio_label : garde uniquement alphanum, -, _, ."""
43
+ clean = Path(label).name # retire tout chemin
44
+ if not _SAFE_LABEL_RE.match(clean) or not clean:
45
+ clean = re.sub(r"[^\w\-\.]", "_", clean) or "page"
46
+ return clean
47
+
48
+
49
+ def _sanitize_filename(name: str) -> str:
50
+ """Nettoie un nom de fichier uploadé : garde uniquement le basename sûr."""
51
+ clean = Path(name).name
52
+ if not _SAFE_LABEL_RE.match(clean) or not clean:
53
+ clean = f"{uuid.uuid4().hex[:12]}.bin"
54
+ return clean
55
+
56
 
57
  # ── Schémas ───────────────────────────────────────────────────────────────────
58
 
 
61
 
62
 
63
  class IIIFImagesRequest(BaseModel):
64
+ urls: list[str] = Field(..., max_length=5000)
65
+ folio_labels: list[str] = Field(..., max_length=5000)
66
 
67
 
68
  class IngestResponse(BaseModel):
 
167
  }
168
 
169
 
170
+ _MAX_MANIFEST_BYTES = 10 * 1024 * 1024 # 10 Mo max pour un manifest JSON
171
+
172
+
173
+ def _validate_url(url: str) -> None:
174
+ """Rejette les URLs non-HTTP et les cibles réseau privé (SSRF)."""
175
+ from urllib.parse import urlparse
176
+
177
+ parsed = urlparse(url)
178
+ if parsed.scheme not in ("http", "https"):
179
+ raise ValueError(f"Schéma non autorisé : {parsed.scheme!r}")
180
+ host = (parsed.hostname or "").lower()
181
+ # Bloquer les adresses privées / locales
182
+ blocked = ("localhost", "127.0.0.1", "0.0.0.0", "[::1]", "metadata.google.internal")
183
+ if host in blocked or host.startswith("169.254.") or host.startswith("10.") or host.startswith("192.168."):
184
+ raise ValueError(f"Hôte interdit : {host}")
185
+
186
+
187
  async def _fetch_json_manifest(url: str) -> dict:
188
+ """Télécharge un manifest IIIF avec protections SSRF + taille max."""
189
+ _validate_url(url)
190
  async with httpx.AsyncClient() as client:
191
  resp = await client.get(url, headers=_MANIFEST_HEADERS, follow_redirects=True, timeout=30.0)
192
  resp.raise_for_status()
193
+ if len(resp.content) > _MAX_MANIFEST_BYTES:
194
+ raise ValueError(f"Manifest trop volumineux ({len(resp.content)} octets)")
195
  return resp.json()
196
 
197
 
 
245
  seq = await _next_sequence(db, ms.id)
246
 
247
  # Collect labels and detect duplicates
248
+ labels = [_sanitize_label(Path(f.filename or f"file_{i}").stem) for i, f in enumerate(files)]
249
  dupes = _find_duplicate_labels(labels)
250
 
251
  created: list[PageModel] = []
252
+ written_files: list[Path] = []
253
  skipped = 0
254
  for i, upload in enumerate(files):
255
+ # Validation MIME type
256
+ ctype = upload.content_type or ""
257
+ if not any(ctype.startswith(p) for p in _ALLOWED_MIME_PREFIXES):
258
+ raise HTTPException(
259
+ status_code=422,
260
+ detail=f"Type MIME non autorisé : {ctype!r}. Seules les images sont acceptées.",
261
+ )
262
+
263
+ filename = _sanitize_filename(upload.filename or f"file_{i}.bin")
264
  folio_label = labels[i]
265
  page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
266
 
267
+ content = await upload.read()
268
+ # Validation taille
269
+ if len(content) > _MAX_UPLOAD_BYTES:
270
+ raise HTTPException(
271
+ status_code=413,
272
+ detail=f"Fichier trop volumineux ({len(content)} octets). Maximum : {_MAX_UPLOAD_BYTES}.",
273
+ )
274
+
275
  master_dir = (
276
  _config_module.settings.data_dir
277
  / "corpora"
 
281
  )
282
  master_dir.mkdir(parents=True, exist_ok=True)
283
  master_path = master_dir / filename
 
284
  master_path.write_bytes(content)
285
+ written_files.append(master_path)
286
 
287
  page = await _create_page(
288
  db, ms.id, page_id, folio_label, seq + i,
 
294
  created.append(page)
295
 
296
  ms.total_pages = (ms.total_pages or 0) + len(created)
297
+ try:
298
+ await db.commit()
299
+ except Exception:
300
+ # Nettoyage des fichiers orphelins si le commit BDD échoue
301
+ for f in written_files:
302
+ f.unlink(missing_ok=True)
303
+ raise
304
 
305
  logger.info(
306
  "Fichiers ingérés",
 
326
 
327
  try:
328
  manifest = await _fetch_json_manifest(body.manifest_url)
329
+ except ValueError as exc:
330
+ raise HTTPException(status_code=400, detail=str(exc))
331
  except httpx.HTTPStatusError as exc:
332
  raise HTTPException(
333
  status_code=502,
 
370
  seq = await _next_sequence(db, ms.id)
371
 
372
  # Collect labels and detect duplicates
373
+ labels = [_sanitize_label(_extract_canvas_label(canvas, i)) for i, canvas in enumerate(canvases)]
374
  dupes = _find_duplicate_labels(labels)
375
 
376
  created: list[PageModel] = []
 
426
  ms = await _get_or_create_manuscript(db, corpus_id)
427
  seq = await _next_sequence(db, ms.id)
428
 
429
+ sanitized_labels = [_sanitize_label(lbl) for lbl in body.folio_labels]
430
+ dupes = _find_duplicate_labels(sanitized_labels)
431
 
432
  created: list[PageModel] = []
433
  skipped = 0
434
+ for i, (url, folio_label) in enumerate(zip(body.urls, sanitized_labels)):
435
  page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
436
  page = await _create_page(
437
  db, ms.id, page_id, folio_label, seq + i,
backend/app/api/v1/jobs.py CHANGED
@@ -22,8 +22,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
22
  from app.models.corpus import CorpusModel, ManuscriptModel, PageModel
23
  from app.models.database import get_db
24
  from app.models.job import JobModel
25
- from app.services.corpus_runner import execute_corpus_job
26
- from app.services.job_runner import execute_page_job
27
 
28
  router = APIRouter(tags=["jobs"])
29
 
@@ -101,6 +99,8 @@ async def run_corpus(
101
  await db.commit()
102
 
103
  # Lancer le pipeline en arrière-plan (après envoi de la réponse)
 
 
104
  background_tasks.add_task(execute_corpus_job, corpus_id)
105
 
106
  return CorpusRunResponse(
@@ -135,6 +135,8 @@ async def run_page(
135
  await db.refresh(job)
136
 
137
  # Lancer le pipeline en arrière-plan (après envoi de la réponse)
 
 
138
  background_tasks.add_task(execute_page_job, job.id)
139
 
140
  return job
@@ -175,6 +177,8 @@ async def retry_job(
175
  await db.refresh(job)
176
 
177
  # Relancer le pipeline
 
 
178
  background_tasks.add_task(execute_page_job, job.id)
179
 
180
  return job
 
22
  from app.models.corpus import CorpusModel, ManuscriptModel, PageModel
23
  from app.models.database import get_db
24
  from app.models.job import JobModel
 
 
25
 
26
  router = APIRouter(tags=["jobs"])
27
 
 
99
  await db.commit()
100
 
101
  # Lancer le pipeline en arrière-plan (après envoi de la réponse)
102
+ from app.services.corpus_runner import execute_corpus_job
103
+
104
  background_tasks.add_task(execute_corpus_job, corpus_id)
105
 
106
  return CorpusRunResponse(
 
135
  await db.refresh(job)
136
 
137
  # Lancer le pipeline en arrière-plan (après envoi de la réponse)
138
+ from app.services.job_runner import execute_page_job
139
+
140
  background_tasks.add_task(execute_page_job, job.id)
141
 
142
  return job
 
177
  await db.refresh(job)
178
 
179
  # Relancer le pipeline
180
+ from app.services.job_runner import execute_page_job
181
+
182
  background_tasks.add_task(execute_page_job, job.id)
183
 
184
  return job
backend/app/api/v1/models_api.py CHANGED
@@ -17,7 +17,7 @@ from datetime import datetime, timezone
17
 
18
  # 2. third-party
19
  from fastapi import APIRouter, Depends, HTTPException
20
- from pydantic import BaseModel, ConfigDict
21
  from sqlalchemy.ext.asyncio import AsyncSession
22
 
23
  # 3. local
@@ -25,11 +25,6 @@ from app.models.corpus import CorpusModel
25
  from app.models.database import get_db
26
  from app.models.model_config_db import ModelConfigDB
27
  from app.schemas.model_config import ProviderType
28
- from app.services.ai.model_registry import (
29
- get_available_providers,
30
- list_all_models,
31
- list_models_for_provider,
32
- )
33
 
34
  logger = logging.getLogger(__name__)
35
 
@@ -47,9 +42,9 @@ class ProviderInfo(BaseModel):
47
 
48
 
49
  class ModelSelectRequest(BaseModel):
50
- model_id: str
51
- provider_type: str
52
- display_name: str = ""
53
 
54
 
55
  class ModelConfigResponse(BaseModel):
@@ -77,6 +72,8 @@ async def list_providers() -> list[dict]:
77
  Un provider est disponible si la variable d'environnement correspondante
78
  est présente dans les secrets HuggingFace. Aucune clé n'est exposée.
79
  """
 
 
80
  return get_available_providers()
81
 
82
 
@@ -91,6 +88,8 @@ async def get_provider_models(provider_type: str) -> list[dict]:
91
  detail=f"Provider inconnu : {provider_type}. "
92
  f"Valeurs acceptées : {[p.value for p in ProviderType]}",
93
  )
 
 
94
  try:
95
  models = list_models_for_provider(ptype)
96
  except RuntimeError as exc:
@@ -104,6 +103,8 @@ async def get_provider_models(provider_type: str) -> list[dict]:
104
  @router.post("/models/refresh", response_model=ModelsRefreshResponse)
105
  async def refresh_models() -> ModelsRefreshResponse:
106
  """Force la mise à jour de la liste agrégée de tous les modèles disponibles."""
 
 
107
  models = list_all_models()
108
  return ModelsRefreshResponse(
109
  models=[m.model_dump() for m in models],
 
17
 
18
  # 2. third-party
19
  from fastapi import APIRouter, Depends, HTTPException
20
+ from pydantic import BaseModel, ConfigDict, Field
21
  from sqlalchemy.ext.asyncio import AsyncSession
22
 
23
  # 3. local
 
25
  from app.models.database import get_db
26
  from app.models.model_config_db import ModelConfigDB
27
  from app.schemas.model_config import ProviderType
 
 
 
 
 
28
 
29
  logger = logging.getLogger(__name__)
30
 
 
42
 
43
 
44
  class ModelSelectRequest(BaseModel):
45
+ model_id: str = Field(..., min_length=1, max_length=256)
46
+ provider_type: str = Field(..., min_length=1, max_length=64)
47
+ display_name: str = Field("", max_length=256)
48
 
49
 
50
  class ModelConfigResponse(BaseModel):
 
72
  Un provider est disponible si la variable d'environnement correspondante
73
  est prΓ©sente dans les secrets HuggingFace. Aucune clΓ© n'est exposΓ©e.
74
  """
75
+ from app.services.ai.model_registry import get_available_providers
76
+
77
  return get_available_providers()
78
 
79
 
 
88
  detail=f"Provider inconnu : {provider_type}. "
89
  f"Valeurs acceptées : {[p.value for p in ProviderType]}",
90
  )
91
+ from app.services.ai.model_registry import list_models_for_provider
92
+
93
  try:
94
  models = list_models_for_provider(ptype)
95
  except RuntimeError as exc:
 
103
  @router.post("/models/refresh", response_model=ModelsRefreshResponse)
104
  async def refresh_models() -> ModelsRefreshResponse:
105
  """Force la mise à jour de la liste agrégée de tous les modèles disponibles."""
106
+ from app.services.ai.model_registry import list_all_models
107
+
108
  models = list_all_models()
109
  return ModelsRefreshResponse(
110
  models=[m.model_dump() for m in models],
backend/app/api/v1/pages.py CHANGED
@@ -18,7 +18,7 @@ from typing import Any
18
 
19
  # 2. third-party
20
  from fastapi import APIRouter, Depends, HTTPException
21
- from pydantic import BaseModel, ConfigDict, ValidationError
22
  from sqlalchemy.ext.asyncio import AsyncSession
23
 
24
  # 3. local
@@ -42,12 +42,12 @@ class CorrectionsRequest(BaseModel):
42
  indiquée est restaurée (avec incrémentation de editorial.version).
43
  """
44
 
45
- ocr_diplomatic_text: str | None = None
46
- editorial_status: str | None = None
47
- commentary_public: str | None = None
48
- commentary_scholarly: str | None = None
49
  region_validations: dict[str, str] | None = None
50
- restore_to_version: int | None = None
51
 
52
 
53
  class VersionInfo(BaseModel):
 
18
 
19
  # 2. third-party
20
  from fastapi import APIRouter, Depends, HTTPException
21
+ from pydantic import BaseModel, ConfigDict, Field, ValidationError
22
  from sqlalchemy.ext.asyncio import AsyncSession
23
 
24
  # 3. local
 
42
  indiquée est restaurée (avec incrémentation de editorial.version).
43
  """
44
 
45
+ ocr_diplomatic_text: str | None = Field(None, max_length=500_000)
46
+ editorial_status: str | None = Field(None, max_length=50)
47
+ commentary_public: str | None = Field(None, max_length=100_000)
48
+ commentary_scholarly: str | None = Field(None, max_length=100_000)
49
  region_validations: dict[str, str] | None = None
50
+ restore_to_version: int | None = Field(None, ge=1)
51
 
52
 
53
  class VersionInfo(BaseModel):
backend/app/api/v1/profiles.py CHANGED
@@ -8,8 +8,10 @@ Les profils sont des fichiers JSON dans profiles/ (racine du dépôt).
8
  Ils sont validés par CorpusProfile avant d'être retournés.
9
  """
10
  # 1. stdlib
 
11
  import json
12
  import logging
 
13
  from pathlib import Path
14
 
15
  # 2. third-party
@@ -49,21 +51,36 @@ async def list_profiles() -> list[dict]:
49
  if not settings.profiles_dir.is_dir():
50
  logger.warning("profiles_dir introuvable : %s", settings.profiles_dir)
51
  return []
52
- profiles = []
53
- for path in sorted(settings.profiles_dir.glob("*.json")):
54
- profile = _load_profile(path)
55
- if profile is not None:
56
- profiles.append(profile.model_dump())
57
- return profiles
 
 
 
 
 
 
 
58
 
59
 
60
  @router.get("/{profile_id}", response_model=dict)
61
  async def get_profile(profile_id: str) -> dict:
62
  """Retourne un profil par son id (nom du fichier sans extension)."""
 
 
63
  path = settings.profiles_dir / f"{profile_id}.json"
64
- if not path.exists():
 
 
 
 
 
 
 
65
  raise HTTPException(status_code=404, detail="Profil introuvable")
66
- profile = _load_profile(path)
67
  if profile is None:
68
  raise HTTPException(status_code=422, detail="Profil invalide")
69
  return profile.model_dump()
 
8
  Ils sont validés par CorpusProfile avant d'être retournés.
9
  """
10
  # 1. stdlib
11
+ import asyncio
12
  import json
13
  import logging
14
+ import re
15
  from pathlib import Path
16
 
17
  # 2. third-party
 
51
  if not settings.profiles_dir.is_dir():
52
  logger.warning("profiles_dir introuvable : %s", settings.profiles_dir)
53
  return []
54
+
55
+ def _scan_profiles() -> list[dict]:
56
+ result = []
57
+ for path in sorted(settings.profiles_dir.glob("*.json")):
58
+ profile = _load_profile(path)
59
+ if profile is not None:
60
+ result.append(profile.model_dump())
61
+ return result
62
+
63
+ return await asyncio.to_thread(_scan_profiles)
64
+
65
+
66
+ _SAFE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9_-]*$")
67
 
68
 
69
  @router.get("/{profile_id}", response_model=dict)
70
  async def get_profile(profile_id: str) -> dict:
71
  """Retourne un profil par son id (nom du fichier sans extension)."""
72
+ if not _SAFE_ID_RE.match(profile_id):
73
+ raise HTTPException(status_code=400, detail="profile_id invalide")
74
  path = settings.profiles_dir / f"{profile_id}.json"
75
+
76
+ def _read() -> CorpusProfile | None:
77
+ if not path.exists():
78
+ return None
79
+ return _load_profile(path)
80
+
81
+ profile = await asyncio.to_thread(_read)
82
+ if profile is None and not path.exists():
83
  raise HTTPException(status_code=404, detail="Profil introuvable")
 
84
  if profile is None:
85
  raise HTTPException(status_code=422, detail="Profil invalide")
86
  return profile.model_dump()
backend/app/api/v1/search.py CHANGED
@@ -7,6 +7,7 @@ ImplΓ©mentation MVP : scan des fichiers master.json (pas d'index externe).
7
  Insensible à la casse et aux accents (unicodedata NFD + ASCII).
8
  """
9
  # 1. stdlib
 
10
  import json
11
  import logging
12
  import unicodedata
@@ -95,7 +96,8 @@ def _score_master(data: dict, query_normalized: str) -> tuple[int, str]:
95
 
96
  @router.get("/search", response_model=list[SearchResult])
97
  async def search_pages(
98
- q: str = Query(..., min_length=2, description="Requête de recherche (min. 2 caractères)"),
 
99
  ) -> list[SearchResult]:
100
  """Recherche plein texte dans les master.json de tous les corpus.
101
 
@@ -106,29 +108,32 @@ async def search_pages(
106
  query_normalized = _normalize(q.strip())
107
  data_dir = _config_module.settings.data_dir
108
 
109
- results: list[SearchResult] = []
110
-
111
- for master_path in data_dir.glob("corpora/*/pages/*/master.json"):
112
- try:
113
- raw: dict = json.loads(master_path.read_text(encoding="utf-8"))
114
- except (json.JSONDecodeError, OSError):
115
- continue
116
-
117
- score, excerpt = _score_master(raw, query_normalized)
118
- if score == 0:
119
- continue
120
-
121
- results.append(
122
- SearchResult(
123
- page_id=raw.get("page_id", ""),
124
- folio_label=raw.get("folio_label", ""),
125
- manuscript_id=raw.get("manuscript_id", ""),
126
- excerpt=excerpt,
127
- score=score,
128
- corpus_profile=raw.get("corpus_profile", ""),
 
 
129
  )
130
- )
 
131
 
132
- results.sort(key=lambda r: r.score, reverse=True)
133
  logger.info("Recherche exécutée", extra={"q": q, "results": len(results)})
134
- return results
 
7
  Insensible à la casse et aux accents (unicodedata NFD + ASCII).
8
  """
9
  # 1. stdlib
10
+ import asyncio
11
  import json
12
  import logging
13
  import unicodedata
 
96
 
97
  @router.get("/search", response_model=list[SearchResult])
98
  async def search_pages(
99
+ q: str = Query(..., min_length=2, max_length=500, description="Requête de recherche (2–500 caractères)"),
100
+ limit: int = Query(200, ge=1, le=2000, description="Nombre maximum de résultats"),
101
  ) -> list[SearchResult]:
102
  """Recherche plein texte dans les master.json de tous les corpus.
103
 
 
108
  query_normalized = _normalize(q.strip())
109
  data_dir = _config_module.settings.data_dir
110
 
111
+ def _scan() -> list[SearchResult]:
112
+ """Scan bloquant exécuté dans un thread dédié."""
113
+ hits: list[SearchResult] = []
114
+ for master_path in data_dir.glob("corpora/*/pages/*/master.json"):
115
+ try:
116
+ raw: dict = json.loads(master_path.read_text(encoding="utf-8"))
117
+ except (json.JSONDecodeError, OSError):
118
+ continue
119
+
120
+ score, excerpt = _score_master(raw, query_normalized)
121
+ if score == 0:
122
+ continue
123
+
124
+ hits.append(
125
+ SearchResult(
126
+ page_id=raw.get("page_id", ""),
127
+ folio_label=raw.get("folio_label", ""),
128
+ manuscript_id=raw.get("manuscript_id", ""),
129
+ excerpt=excerpt,
130
+ score=score,
131
+ corpus_profile=raw.get("corpus_profile", ""),
132
+ )
133
  )
134
+ hits.sort(key=lambda r: r.score, reverse=True)
135
+ return hits
136
 
137
+ results = await asyncio.to_thread(_scan)
138
  logger.info("Recherche exécutée", extra={"q": q, "results": len(results)})
139
+ return results[:limit]
backend/app/config.py CHANGED
@@ -1,17 +1,17 @@
1
  """
2
  Configuration globale de la plateforme, chargée depuis les variables d'environnement.
3
 
4
- Équivalent fonctionnel de pydantic-settings sans dépendance externe :
5
- - les valeurs sont lues depuis os.environ au moment de l'instanciation
6
  - l'objet `settings` est importé partout dans l'application
7
  - dans les tests : monkeypatch.setattr(config, "settings", ...) pour surcharger
8
  """
9
  # 1. stdlib
10
- import os
11
  from pathlib import Path
12
 
13
  # 2. third-party
14
- from pydantic import BaseModel, ConfigDict
 
15
 
16
  # Racine du dépôt — résolue depuis l'emplacement absolu de ce fichier.
17
  # config.py se trouve dans backend/app/ ; 3 parents remontent à la racine.
@@ -19,14 +19,17 @@ from pydantic import BaseModel, ConfigDict
19
  _REPO_ROOT = Path(__file__).resolve().parent.parent.parent
20
 
21
 
22
- class Settings(BaseModel):
23
  """ParamΓ¨tres d'application lus depuis les variables d'environnement.
24
 
25
  Toutes les clΓ©s API sont optionnelles (None si non configurΓ©es).
26
  Elles ne sont jamais loguΓ©es ni exportΓ©es (R06).
27
  """
28
 
29
- model_config = ConfigDict(frozen=False)
 
 
 
30
 
31
  # ── Serveur ──────────────────────────────────────────────────────────────
32
  base_url: str = "http://localhost:8000"
@@ -50,21 +53,4 @@ class Settings(BaseModel):
50
  mistral_api_key: str | None = None
51
 
52
 
53
- def _load_settings() -> Settings:
54
- """Lit les variables d'environnement et construit l'objet Settings."""
55
- return Settings(
56
- base_url=os.getenv("BASE_URL", "http://localhost:8000"),
57
- data_dir=Path(os.getenv("DATA_DIR", "data")),
58
- profiles_dir=Path(os.getenv("PROFILES_DIR", str(_REPO_ROOT / "profiles"))),
59
- prompts_dir=Path(os.getenv("PROMPTS_DIR", str(_REPO_ROOT / "prompts"))),
60
- database_url=os.getenv(
61
- "DATABASE_URL", "sqlite+aiosqlite:///./scriptorium.db"
62
- ),
63
- google_ai_studio_api_key=os.getenv("GOOGLE_AI_STUDIO_API_KEY"),
64
- vertex_api_key=os.getenv("VERTEX_API_KEY"),
65
- vertex_service_account_json=os.getenv("VERTEX_SERVICE_ACCOUNT_JSON"),
66
- mistral_api_key=os.getenv("MISTRAL_API_KEY"),
67
- )
68
-
69
-
70
- settings: Settings = _load_settings()
 
1
  """
2
  Configuration globale de la plateforme, chargΓ©e depuis les variables d'environnement.
3
 
4
+ Utilise pydantic-settings (CLAUDE.md Β§2, Β§7) :
5
+ - les valeurs sont lues depuis os.environ / fichier .env au moment de l'instanciation
6
  - l'objet `settings` est importΓ© partout dans l'application
7
  - dans les tests : monkeypatch.setattr(config, "settings", ...) pour surcharger
8
  """
9
  # 1. stdlib
 
10
  from pathlib import Path
11
 
12
  # 2. third-party
13
+ from pydantic import ConfigDict
14
+ from pydantic_settings import BaseSettings
15
 
16
  # Racine du dΓ©pΓ΄t β€” rΓ©solue depuis l'emplacement absolu de ce fichier.
17
  # config.py se trouve dans backend/app/ ; 3 parents remontent Γ  la racine.
 
19
  _REPO_ROOT = Path(__file__).resolve().parent.parent.parent
20
 
21
 
22
+ class Settings(BaseSettings):
23
  """ParamΓ¨tres d'application lus depuis les variables d'environnement.
24
 
25
  Toutes les clΓ©s API sont optionnelles (None si non configurΓ©es).
26
  Elles ne sont jamais loguΓ©es ni exportΓ©es (R06).
27
  """
28
 
29
+ model_config = ConfigDict(
30
+ env_file=".env",
31
+ extra="ignore",
32
+ )
33
 
34
  # ── Serveur ──────────────────────────────────────────────────────────────
35
  base_url: str = "http://localhost:8000"
 
53
  mistral_api_key: str | None = None
54
 
55
 
56
+ settings: Settings = Settings()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/main.py CHANGED
@@ -65,11 +65,11 @@ app = FastAPI(
65
  lifespan=lifespan,
66
  )
67
 
68
- # ── CORS (dev : tous les origines autorisΓ©s) ──────────────────────────────────
69
  app.add_middleware(
70
  CORSMiddleware,
71
  allow_origins=["*"],
72
- allow_credentials=True,
73
  allow_methods=["*"],
74
  allow_headers=["*"],
75
  )
@@ -97,8 +97,9 @@ async def serve_frontend(full_path: str) -> FileResponse | RedirectResponse:
97
  if full_path.startswith("api/"):
98
  raise HTTPException(status_code=404, detail=f"Endpoint not found: /{full_path}")
99
  if _STATIC_DIR.is_dir():
100
- candidate = _STATIC_DIR / full_path
101
- if candidate.is_file():
 
102
  return FileResponse(candidate)
103
  index = _STATIC_DIR / "index.html"
104
  if index.exists():
 
65
  lifespan=lifespan,
66
  )
67
 
68
+ # ── CORS (dev : toutes les origines autorisΓ©es, sans credentials) ──────────────
69
  app.add_middleware(
70
  CORSMiddleware,
71
  allow_origins=["*"],
72
+ allow_credentials=False,
73
  allow_methods=["*"],
74
  allow_headers=["*"],
75
  )
 
97
  if full_path.startswith("api/"):
98
  raise HTTPException(status_code=404, detail=f"Endpoint not found: /{full_path}")
99
  if _STATIC_DIR.is_dir():
100
+ candidate = (_STATIC_DIR / full_path).resolve()
101
+ # EmpΓͺcher le path traversal : le fichier rΓ©solu doit Γͺtre sous _STATIC_DIR
102
+ if candidate.is_file() and str(candidate).startswith(str(_STATIC_DIR.resolve())):
103
  return FileResponse(candidate)
104
  index = _STATIC_DIR / "index.html"
105
  if index.exists():
backend/app/models/corpus.py CHANGED
@@ -6,6 +6,7 @@ Ils NE se substituent PAS aux schΓ©mas Pydantic (source canonique des types).
6
  """
7
  # 1. stdlib
8
  from datetime import datetime, timezone
 
9
 
10
  # 2. third-party
11
  from sqlalchemy import DateTime, Float, ForeignKey, Integer, String, Text
@@ -24,8 +25,12 @@ class CorpusModel(Base):
24
  slug: Mapped[str] = mapped_column(String, unique=True, nullable=False, index=True)
25
  title: Mapped[str] = mapped_column(String, nullable=False)
26
  profile_id: Mapped[str] = mapped_column(String, nullable=False)
27
- created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
28
- updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
 
 
 
 
29
 
30
  manuscripts: Mapped[list["ManuscriptModel"]] = relationship(
31
  back_populates="corpus", cascade="all, delete-orphan"
 
6
  """
7
  # 1. stdlib
8
  from datetime import datetime, timezone
9
+ from functools import partial
10
 
11
  # 2. third-party
12
  from sqlalchemy import DateTime, Float, ForeignKey, Integer, String, Text
 
25
  slug: Mapped[str] = mapped_column(String, unique=True, nullable=False, index=True)
26
  title: Mapped[str] = mapped_column(String, nullable=False)
27
  profile_id: Mapped[str] = mapped_column(String, nullable=False)
28
+ created_at: Mapped[datetime] = mapped_column(
29
+ DateTime, nullable=False, default=partial(datetime.now, tz=timezone.utc)
30
+ )
31
+ updated_at: Mapped[datetime] = mapped_column(
32
+ DateTime, nullable=False, default=partial(datetime.now, tz=timezone.utc)
33
+ )
34
 
35
  manuscripts: Mapped[list["ManuscriptModel"]] = relationship(
36
  back_populates="corpus", cascade="all, delete-orphan"
backend/app/models/job.py CHANGED
@@ -10,7 +10,8 @@ Cycle de vie :
10
  β†˜ failed
11
  """
12
  # 1. stdlib
13
- from datetime import datetime
 
14
 
15
  # 2. third-party
16
  from sqlalchemy import DateTime, ForeignKey, String, Text
@@ -37,4 +38,6 @@ class JobModel(Base):
37
  started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
38
  finished_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
39
  error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
40
- created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
 
 
 
10
  β†˜ failed
11
  """
12
  # 1. stdlib
13
+ from datetime import datetime, timezone
14
+ from functools import partial
15
 
16
  # 2. third-party
17
  from sqlalchemy import DateTime, ForeignKey, String, Text
 
38
  started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
39
  finished_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
40
  error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
41
+ created_at: Mapped[datetime] = mapped_column(
42
+ DateTime, nullable=False, default=partial(datetime.now, tz=timezone.utc)
43
+ )
backend/app/schemas/page_master.py CHANGED
@@ -29,14 +29,25 @@ class Region(BaseModel):
29
 
30
  @field_validator("bbox")
31
  @classmethod
32
- def bbox_must_be_positive(cls, v: list[int]) -> list[int]:
33
  if any(x < 0 for x in v):
34
- raise ValueError("bbox values must be >= 0")
35
  if v[2] <= 0 or v[3] <= 0:
36
- raise ValueError("bbox width and height must be > 0")
37
  return v
38
 
39
 
 
 
 
 
 
 
 
 
 
 
 
40
  class OCRResult(BaseModel):
41
  diplomatic_text: str = ""
42
  blocks: list[dict] = []
@@ -51,6 +62,13 @@ class Translation(BaseModel):
51
  en: str = ""
52
 
53
 
 
 
 
 
 
 
 
54
  class CommentaryClaim(BaseModel):
55
  claim: str
56
  evidence_region_ids: list[str] = []
@@ -64,6 +82,7 @@ class Commentary(BaseModel):
64
 
65
 
66
  class ProcessingInfo(BaseModel):
 
67
  model_id: str
68
  model_display_name: str
69
  prompt_version: str
@@ -96,11 +115,11 @@ class PageMaster(BaseModel):
96
  folio_label: str
97
  sequence: int
98
 
99
- image: dict
100
  layout: dict
101
  ocr: OCRResult | None = None
102
  translation: Translation | None = None
103
- summary: dict | None = None
104
  commentary: Commentary | None = None
105
  extensions: dict[str, Any] = {}
106
 
 
29
 
30
  @field_validator("bbox")
31
  @classmethod
32
+ def bbox_must_be_valid(cls, v: list[int]) -> list[int]:
33
  if any(x < 0 for x in v):
34
+ raise ValueError("bbox: toutes les valeurs doivent Γͺtre >= 0")
35
  if v[2] <= 0 or v[3] <= 0:
36
+ raise ValueError("bbox: width et height doivent Γͺtre > 0")
37
  return v
38
 
39
 
40
+ class ImageInfo(BaseModel):
41
+ """MΓ©tadonnΓ©es image β€” CLAUDE.md Β§4.2."""
42
+
43
+ master: str
44
+ derivative_web: str | None = None
45
+ thumbnail: str | None = None
46
+ iiif_base: str | None = None
47
+ width: int
48
+ height: int
49
+
50
+
51
  class OCRResult(BaseModel):
52
  diplomatic_text: str = ""
53
  blocks: list[dict] = []
 
62
  en: str = ""
63
 
64
 
65
+ class Summary(BaseModel):
66
+ """RΓ©sumΓ© β€” CLAUDE.md Β§4.2."""
67
+
68
+ short: str = ""
69
+ detailed: str = ""
70
+
71
+
72
  class CommentaryClaim(BaseModel):
73
  claim: str
74
  evidence_region_ids: list[str] = []
 
82
 
83
 
84
  class ProcessingInfo(BaseModel):
85
+ provider: str
86
  model_id: str
87
  model_display_name: str
88
  prompt_version: str
 
115
  folio_label: str
116
  sequence: int
117
 
118
+ image: ImageInfo
119
  layout: dict
120
  ocr: OCRResult | None = None
121
  translation: Translation | None = None
122
+ summary: Summary | None = None
123
  commentary: Commentary | None = None
124
  extensions: dict[str, Any] = {}
125
 
backend/app/services/ai/__init__.py CHANGED
@@ -1,19 +1,31 @@
1
  """
2
  Services AI β€” providers Google AI, registre de modΓ¨les, et analyse IA.
 
 
 
 
3
  """
4
- from app.services.ai.analyzer import run_primary_analysis
5
- from app.services.ai.client_factory import build_client
6
- from app.services.ai.model_registry import build_model_config, list_all_models
7
- from app.services.ai.prompt_loader import load_and_render_prompt
8
- from app.services.ai.provider_google_ai import GoogleAIProvider
9
- from app.services.ai.provider_vertex_key import VertexAPIKeyProvider
10
- from app.services.ai.provider_vertex_sa import VertexServiceAccountProvider
11
- from app.services.ai.response_parser import ParseError, parse_ai_response
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  __all__ = [
14
- "GoogleAIProvider",
15
- "VertexAPIKeyProvider",
16
- "VertexServiceAccountProvider",
17
  "list_all_models",
18
  "build_model_config",
19
  "build_client",
 
1
  """
2
  Services AI β€” providers Google AI, registre de modΓ¨les, et analyse IA.
3
+
4
+ Les imports de providers sont diffΓ©rΓ©s (lazy) pour Γ©viter de charger les SDK
5
+ tiers (google-genai, mistralai) au dΓ©marrage. Cela permet Γ  l'application
6
+ de fonctionner mΓͺme si un SDK n'est pas installΓ©.
7
  """
8
+
9
+
10
+ def __getattr__(name: str):
11
+ """Import paresseux — les symboles sont résolus au premier accès."""
12
+ _lazy_map = {
13
+ "run_primary_analysis": "app.services.ai.analyzer",
14
+ "build_client": "app.services.ai.client_factory",
15
+ "build_model_config": "app.services.ai.model_registry",
16
+ "list_all_models": "app.services.ai.model_registry",
17
+ "load_and_render_prompt": "app.services.ai.prompt_loader",
18
+ "parse_ai_response": "app.services.ai.response_parser",
19
+ "ParseError": "app.services.ai.response_parser",
20
+ }
21
+ if name in _lazy_map:
22
+ import importlib
23
+ module = importlib.import_module(_lazy_map[name])
24
+ return getattr(module, name)
25
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
26
+
27
 
28
  __all__ = [
 
 
 
29
  "list_all_models",
30
  "build_model_config",
31
  "build_client",
backend/app/services/ai/analyzer.py CHANGED
@@ -13,8 +13,8 @@ from pathlib import Path
13
  from app.schemas.corpus_profile import CorpusProfile
14
  from app.schemas.image import ImageDerivativeInfo
15
  from app.schemas.model_config import ModelConfig
16
- from app.schemas.page_master import EditorialInfo, EditorialStatus, PageMaster, ProcessingInfo
17
- from app.services.ai.master_writer import write_gemini_raw, write_master_json
18
  from app.services.ai.model_registry import get_provider
19
  from app.services.ai.prompt_loader import load_and_render_prompt
20
  from app.services.ai.response_parser import ParseError, parse_ai_response # noqa: F401
@@ -37,7 +37,7 @@ def run_primary_analysis(
37
  ) -> PageMaster:
38
  """Analyse primaire d'un folio : charge le prompt, appelle l'IA, Γ©crit les fichiers.
39
 
40
- Respecte R05 : gemini_raw.json est toujours Γ©crit en premier, mΓͺme en cas
41
  d'erreur de parsing. master.json n'est Γ©crit QUE si le parsing a rΓ©ussi.
42
 
43
  Le provider est sΓ©lectionnΓ© dynamiquement depuis model_config.provider ;
@@ -57,7 +57,7 @@ def run_primary_analysis(
57
  project_root: racine du projet (pour rΓ©soudre les chemins des prompts).
58
 
59
  Returns:
60
- PageMaster validΓ© (gemini_raw.json et master.json Γ©crits sur disque).
61
 
62
  Raises:
63
  ParseError: si la rΓ©ponse IA n'est pas un JSON valide.
@@ -66,7 +66,7 @@ def run_primary_analysis(
66
  """
67
  # ── Chemins de sortie ───────────────────────────────────────────────────
68
  page_dir = base_data_dir / "corpora" / corpus_slug / "pages" / folio_label
69
- raw_path = page_dir / "gemini_raw.json"
70
  master_path = page_dir / "master.json"
71
 
72
  # ── 1. Chargement et rendu du prompt (R04) ──────────────────────────────
@@ -76,6 +76,7 @@ def run_primary_analysis(
76
  context = {
77
  "profile_label": corpus_profile.label,
78
  "language_hints": ", ".join(corpus_profile.language_hints),
 
79
  "script_type": corpus_profile.script_type.value,
80
  }
81
  prompt_text = load_and_render_prompt(prompt_abs_path, context)
@@ -85,7 +86,12 @@ def run_primary_analysis(
85
  )
86
 
87
  # ── 2. Chargement de l'image dΓ©rivΓ©e ────────────────────────────────────
88
- jpeg_bytes = derivative_image_path.read_bytes()
 
 
 
 
 
89
 
90
  # ── 3. Appel IA via le provider sΓ©lectionnΓ© ─────────────────────────────
91
  provider = get_provider(model_config.provider)
@@ -104,8 +110,8 @@ def run_primary_analysis(
104
  model_id=model_config.selected_model_id,
105
  )
106
 
107
- # ── 4. Γ‰criture gemini_raw.json TOUJOURS EN PREMIER (R05) ───────────────
108
- write_gemini_raw(raw_text, raw_path)
109
 
110
  # ── 5. Parsing + validation (ParseError si JSON invalide) ───────────────
111
  layout, ocr = parse_ai_response(raw_text)
@@ -118,16 +124,17 @@ def run_primary_analysis(
118
  manuscript_id=manuscript_id,
119
  folio_label=folio_label,
120
  sequence=sequence,
121
- image={
122
- "original_url": image_info.original_url,
123
- "derivative_web": image_info.derivative_path,
124
- "thumbnail": image_info.thumbnail_path,
125
- "width": image_info.derivative_width,
126
- "height": image_info.derivative_height,
127
- },
128
  layout=layout,
129
  ocr=ocr,
130
  processing=ProcessingInfo(
 
131
  model_id=model_config.selected_model_id,
132
  model_display_name=model_config.selected_model_display_name,
133
  prompt_version=prompt_rel_path,
 
13
  from app.schemas.corpus_profile import CorpusProfile
14
  from app.schemas.image import ImageDerivativeInfo
15
  from app.schemas.model_config import ModelConfig
16
+ from app.schemas.page_master import EditorialInfo, EditorialStatus, ImageInfo, PageMaster, ProcessingInfo
17
+ from app.services.ai.master_writer import write_ai_raw, write_master_json
18
  from app.services.ai.model_registry import get_provider
19
  from app.services.ai.prompt_loader import load_and_render_prompt
20
  from app.services.ai.response_parser import ParseError, parse_ai_response # noqa: F401
 
37
  ) -> PageMaster:
38
  """Analyse primaire d'un folio : charge le prompt, appelle l'IA, Γ©crit les fichiers.
39
 
40
+ Respecte R05 : ai_raw.json est toujours Γ©crit en premier, mΓͺme en cas
41
  d'erreur de parsing. master.json n'est Γ©crit QUE si le parsing a rΓ©ussi.
42
 
43
  Le provider est sΓ©lectionnΓ© dynamiquement depuis model_config.provider ;
 
57
  project_root: racine du projet (pour rΓ©soudre les chemins des prompts).
58
 
59
  Returns:
60
+ PageMaster validΓ© (ai_raw.json et master.json Γ©crits sur disque).
61
 
62
  Raises:
63
  ParseError: si la rΓ©ponse IA n'est pas un JSON valide.
 
66
  """
67
  # ── Chemins de sortie ───────────────────────────────────────────────────
68
  page_dir = base_data_dir / "corpora" / corpus_slug / "pages" / folio_label
69
+ raw_path = page_dir / "ai_raw.json"
70
  master_path = page_dir / "master.json"
71
 
72
  # ── 1. Chargement et rendu du prompt (R04) ──────────────────────────────
 
76
  context = {
77
  "profile_label": corpus_profile.label,
78
  "language_hints": ", ".join(corpus_profile.language_hints),
79
+ "primary_language": corpus_profile.language_hints[0] if corpus_profile.language_hints else "la",
80
  "script_type": corpus_profile.script_type.value,
81
  }
82
  prompt_text = load_and_render_prompt(prompt_abs_path, context)
 
86
  )
87
 
88
  # ── 2. Chargement de l'image dΓ©rivΓ©e ────────────────────────────────────
89
+ if not derivative_image_path.exists():
90
+ raise FileNotFoundError(f"Image dΓ©rivΓ©e introuvable : {derivative_image_path}")
91
+ try:
92
+ jpeg_bytes = derivative_image_path.read_bytes()
93
+ except OSError as exc:
94
+ raise RuntimeError(f"Erreur lecture image {derivative_image_path} : {exc}") from exc
95
 
96
  # ── 3. Appel IA via le provider sΓ©lectionnΓ© ─────────────────────────────
97
  provider = get_provider(model_config.provider)
 
110
  model_id=model_config.selected_model_id,
111
  )
112
 
113
+ # ── 4. Γ‰criture ai_raw.json TOUJOURS EN PREMIER (R05) ─────────────────
114
+ write_ai_raw(raw_text, raw_path)
115
 
116
  # ── 5. Parsing + validation (ParseError si JSON invalide) ───────────────
117
  layout, ocr = parse_ai_response(raw_text)
 
124
  manuscript_id=manuscript_id,
125
  folio_label=folio_label,
126
  sequence=sequence,
127
+ image=ImageInfo(
128
+ master=image_info.original_url,
129
+ derivative_web=image_info.derivative_path,
130
+ thumbnail=image_info.thumbnail_path,
131
+ width=image_info.derivative_width,
132
+ height=image_info.derivative_height,
133
+ ),
134
  layout=layout,
135
  ocr=ocr,
136
  processing=ProcessingInfo(
137
+ provider=model_config.provider.value if hasattr(model_config.provider, "value") else str(model_config.provider),
138
  model_id=model_config.selected_model_id,
139
  model_display_name=model_config.selected_model_display_name,
140
  prompt_version=prompt_rel_path,
backend/app/services/ai/master_writer.py CHANGED
@@ -1,8 +1,8 @@
1
  """
2
- Γ‰criture des fichiers gemini_raw.json et master.json (R02, R05).
3
 
4
  RΓ¨gle R05 non nΓ©gociable :
5
- 1. gemini_raw.json est TOUJOURS Γ©crit en premier.
6
  2. master.json n'est Γ©crit QUE si le parsing et la validation Pydantic ont rΓ©ussi.
7
  """
8
  # 1. stdlib
@@ -16,24 +16,24 @@ from app.schemas.page_master import PageMaster
16
  logger = logging.getLogger(__name__)
17
 
18
 
19
- def write_gemini_raw(raw_text: str, output_path: Path) -> None:
20
- """Γ‰crit la rΓ©ponse brute de l'IA dans gemini_raw.json (R05).
21
 
22
  Toujours appelΓ© AVANT toute tentative de parsing.
23
  Le contenu est enveloppΓ© dans un objet JSON pour garantir un fichier valide,
24
  mΓͺme si la rΓ©ponse IA n'est pas du JSON.
25
-
26
- Args:
27
- raw_text: texte brut retournΓ© par l'API Google AI.
28
- output_path: chemin complet du fichier de sortie (gemini_raw.json).
29
  """
30
- output_path.parent.mkdir(parents=True, exist_ok=True)
31
- payload = {"response_text": raw_text}
32
- output_path.write_text(
33
- json.dumps(payload, ensure_ascii=False, indent=2),
34
- encoding="utf-8",
35
- )
36
- logger.info("gemini_raw.json Γ©crit", extra={"path": str(output_path)})
 
 
 
 
37
 
38
 
39
  def write_master_json(page_master: PageMaster, output_path: Path) -> None:
@@ -41,14 +41,14 @@ def write_master_json(page_master: PageMaster, output_path: Path) -> None:
41
 
42
  N'est appelΓ© QUE si le parsing et la validation Pydantic ont rΓ©ussi.
43
  CrΓ©e les dossiers parents si nΓ©cessaire.
44
-
45
- Args:
46
- page_master: instance PageMaster validΓ©e par Pydantic.
47
- output_path: chemin complet du fichier de sortie (master.json).
48
  """
49
- output_path.parent.mkdir(parents=True, exist_ok=True)
50
- output_path.write_text(
51
- page_master.model_dump_json(indent=2),
52
- encoding="utf-8",
53
- )
 
 
 
 
54
  logger.info("master.json Γ©crit", extra={"path": str(output_path)})
 
1
  """
2
+ Γ‰criture des fichiers ai_raw.json et master.json (R02, R05).
3
 
4
  RΓ¨gle R05 non nΓ©gociable :
5
+ 1. ai_raw.json est TOUJOURS Γ©crit en premier.
6
  2. master.json n'est Γ©crit QUE si le parsing et la validation Pydantic ont rΓ©ussi.
7
  """
8
  # 1. stdlib
 
16
  logger = logging.getLogger(__name__)
17
 
18
 
19
+ def write_ai_raw(raw_text: str, output_path: Path) -> None:
20
+ """Γ‰crit la rΓ©ponse brute de l'IA dans ai_raw.json (R05).
21
 
22
  Toujours appelΓ© AVANT toute tentative de parsing.
23
  Le contenu est enveloppΓ© dans un objet JSON pour garantir un fichier valide,
24
  mΓͺme si la rΓ©ponse IA n'est pas du JSON.
 
 
 
 
25
  """
26
+ try:
27
+ output_path.parent.mkdir(parents=True, exist_ok=True)
28
+ payload = {"response_text": raw_text}
29
+ output_path.write_text(
30
+ json.dumps(payload, ensure_ascii=False, indent=2),
31
+ encoding="utf-8",
32
+ )
33
+ except OSError as exc:
34
+ logger.error("Γ‰criture ai_raw.json Γ©chouΓ©e", extra={"path": str(output_path), "error": str(exc)})
35
+ raise
36
+ logger.info("ai_raw.json Γ©crit", extra={"path": str(output_path)})
37
 
38
 
39
  def write_master_json(page_master: PageMaster, output_path: Path) -> None:
 
41
 
42
  N'est appelΓ© QUE si le parsing et la validation Pydantic ont rΓ©ussi.
43
  CrΓ©e les dossiers parents si nΓ©cessaire.
 
 
 
 
44
  """
45
+ try:
46
+ output_path.parent.mkdir(parents=True, exist_ok=True)
47
+ output_path.write_text(
48
+ page_master.model_dump_json(indent=2),
49
+ encoding="utf-8",
50
+ )
51
+ except OSError as exc:
52
+ logger.error("Γ‰criture master.json Γ©chouΓ©e", extra={"path": str(output_path), "error": str(exc)})
53
+ raise
54
  logger.info("master.json Γ©crit", extra={"path": str(output_path)})
backend/app/services/ai/model_registry.py CHANGED
@@ -1,5 +1,8 @@
1
  """
2
  Registre agrΓ©gΓ© des modΓ¨les disponibles tous providers confondus.
 
 
 
3
  """
4
  # 1. stdlib
5
  import logging
@@ -8,10 +11,6 @@ from datetime import datetime, timezone
8
  # 2. local
9
  from app.schemas.model_config import ModelConfig, ModelInfo, ProviderType
10
  from app.services.ai.base import AIProvider
11
- from app.services.ai.provider_google_ai import GoogleAIProvider
12
- from app.services.ai.provider_mistral import MistralProvider
13
- from app.services.ai.provider_vertex_key import VertexAPIKeyProvider
14
- from app.services.ai.provider_vertex_sa import VertexServiceAccountProvider
15
 
16
  logger = logging.getLogger(__name__)
17
 
@@ -24,13 +23,27 @@ _PROVIDER_DISPLAY_NAMES: dict[ProviderType, str] = {
24
  }
25
 
26
 
 
 
 
27
  def _build_providers() -> list[AIProvider]:
28
- return [
 
 
 
 
 
 
 
 
 
 
29
  GoogleAIProvider(),
30
  VertexAPIKeyProvider(),
31
  VertexServiceAccountProvider(),
32
  MistralProvider(),
33
  ]
 
34
 
35
 
36
  def get_available_providers() -> list[dict]:
 
1
  """
2
  Registre agrΓ©gΓ© des modΓ¨les disponibles tous providers confondus.
3
+
4
+ Les imports de providers sont diffΓ©rΓ©s dans _build_providers() pour Γ©viter
5
+ de charger les SDK tiers (google-genai, mistralai) au niveau module.
6
  """
7
  # 1. stdlib
8
  import logging
 
11
  # 2. local
12
  from app.schemas.model_config import ModelConfig, ModelInfo, ProviderType
13
  from app.services.ai.base import AIProvider
 
 
 
 
14
 
15
  logger = logging.getLogger(__name__)
16
 
 
23
  }
24
 
25
 
26
+ _cached_providers: list[AIProvider] | None = None
27
+
28
+
29
  def _build_providers() -> list[AIProvider]:
30
+ """Construit la liste des providers β€” imports diffΓ©rΓ©s, rΓ©sultat mis en cache."""
31
+ global _cached_providers
32
+ if _cached_providers is not None:
33
+ return _cached_providers
34
+
35
+ from app.services.ai.provider_google_ai import GoogleAIProvider
36
+ from app.services.ai.provider_mistral import MistralProvider
37
+ from app.services.ai.provider_vertex_key import VertexAPIKeyProvider
38
+ from app.services.ai.provider_vertex_sa import VertexServiceAccountProvider
39
+
40
+ _cached_providers = [
41
  GoogleAIProvider(),
42
  VertexAPIKeyProvider(),
43
  VertexServiceAccountProvider(),
44
  MistralProvider(),
45
  ]
46
+ return _cached_providers
47
 
48
 
49
  def get_available_providers() -> list[dict]:
backend/app/services/ai/prompt_loader.py CHANGED
@@ -6,6 +6,7 @@ Le code charge le fichier, substitue les variables {{nom}}, envoie Γ  l'API.
6
  """
7
  # 1. stdlib
8
  import logging
 
9
  from pathlib import Path
10
 
11
  logger = logging.getLogger(__name__)
@@ -38,6 +39,11 @@ def load_and_render_prompt(template_path: str | Path, context: dict[str, str]) -
38
  for key, value in context.items():
39
  rendered = rendered.replace("{{" + key + "}}", value)
40
 
 
 
 
 
 
41
  logger.debug(
42
  "Prompt chargΓ© et rendu",
43
  extra={"template": str(path), "variables": list(context.keys())},
 
6
  """
7
  # 1. stdlib
8
  import logging
9
+ import re
10
  from pathlib import Path
11
 
12
  logger = logging.getLogger(__name__)
 
39
  for key, value in context.items():
40
  rendered = rendered.replace("{{" + key + "}}", value)
41
 
42
+ # VΓ©rifier qu'il ne reste pas de variables non rΓ©solues (CLAUDE.md Β§8)
43
+ unresolved = re.findall(r"\{\{\w+\}\}", rendered)
44
+ if unresolved:
45
+ raise ValueError(f"Variables non rΓ©solues dans le prompt : {unresolved}")
46
+
47
  logger.debug(
48
  "Prompt chargΓ© et rendu",
49
  extra={"template": str(path), "variables": list(context.keys())},
backend/app/services/ai/provider_google_ai.py CHANGED
@@ -60,8 +60,15 @@ class GoogleAIProvider(AIProvider):
60
  raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
61
  client = genai.Client(api_key=os.environ[_ENV_KEY])
62
  image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
63
- response = client.models.generate_content(
64
- model=model_id,
65
- contents=[image_part, prompt],
66
- )
 
 
 
 
 
 
 
67
  return response.text or ""
 
60
  raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
61
  client = genai.Client(api_key=os.environ[_ENV_KEY])
62
  image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
63
+ try:
64
+ response = client.models.generate_content(
65
+ model=model_id,
66
+ contents=[image_part, prompt],
67
+ )
68
+ except Exception as exc:
69
+ logger.error(
70
+ "Appel API Google AI Studio Γ©chouΓ©",
71
+ extra={"model": model_id, "error": str(exc)},
72
+ )
73
+ raise RuntimeError(f"Erreur API Google AI Studio ({model_id}) : {exc}") from exc
74
  return response.text or ""
backend/app/services/ai/provider_mistral.py CHANGED
@@ -208,10 +208,14 @@ class MistralProvider(AIProvider):
208
  # ── Chemin 1 : OCR dΓ©diΓ© ─────────────────────────────────────────────
209
  if _is_ocr_model(model_id):
210
  logger.info("Mistral OCR : endpoint dΓ©diΓ© client.ocr.process()", extra={"model": model_id})
211
- response = client.ocr.process(
212
- model=model_id,
213
- document={"type": "image_url", "image_url": {"url": data_url}},
214
- )
 
 
 
 
215
  # OCRResponse.pages : list[OCRPageObject], chacun avec .markdown
216
  pages = getattr(response, "pages", []) or []
217
  return "\n\n".join(
@@ -233,10 +237,14 @@ class MistralProvider(AIProvider):
233
  )
234
  content = prompt
235
 
236
- response = client.chat.complete(
237
- model=model_id,
238
- messages=[{"role": "user", "content": content}],
239
- )
 
 
 
 
240
  choices = response.choices or []
241
  if not choices:
242
  return ""
 
208
  # ── Chemin 1 : OCR dΓ©diΓ© ─────────────────────────────────────────────
209
  if _is_ocr_model(model_id):
210
  logger.info("Mistral OCR : endpoint dΓ©diΓ© client.ocr.process()", extra={"model": model_id})
211
+ try:
212
+ response = client.ocr.process(
213
+ model=model_id,
214
+ document={"type": "image_url", "image_url": {"url": data_url}},
215
+ )
216
+ except Exception as exc:
217
+ logger.error("Appel Mistral OCR Γ©chouΓ©", extra={"model": model_id, "error": str(exc)})
218
+ raise RuntimeError(f"Erreur API Mistral OCR ({model_id}) : {exc}") from exc
219
  # OCRResponse.pages : list[OCRPageObject], chacun avec .markdown
220
  pages = getattr(response, "pages", []) or []
221
  return "\n\n".join(
 
237
  )
238
  content = prompt
239
 
240
+ try:
241
+ response = client.chat.complete(
242
+ model=model_id,
243
+ messages=[{"role": "user", "content": content}],
244
+ )
245
+ except Exception as exc:
246
+ logger.error("Appel Mistral chat Γ©chouΓ©", extra={"model": model_id, "error": str(exc)})
247
+ raise RuntimeError(f"Erreur API Mistral ({model_id}) : {exc}") from exc
248
  choices = response.choices or []
249
  if not choices:
250
  return ""
backend/app/services/ai/provider_vertex_key.py CHANGED
@@ -22,9 +22,6 @@ retourne toujours False afin d'Γ©viter des appels rΓ©seau vouΓ©s Γ  l'Γ©chec.
22
  import logging
23
  import os
24
 
25
- # 2. third-party
26
- from google.genai import types # noqa: F401 (conservΓ© pour import cohΓ©rence)
27
-
28
  # 3. local
29
  from app.schemas.model_config import ModelInfo, ProviderType
30
  from app.services.ai.base import AIProvider
 
22
  import logging
23
  import os
24
 
 
 
 
25
  # 3. local
26
  from app.schemas.model_config import ModelInfo, ProviderType
27
  from app.services.ai.base import AIProvider
backend/app/services/ai/provider_vertex_sa.py CHANGED
@@ -90,8 +90,15 @@ class VertexServiceAccountProvider(AIProvider):
90
  raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
91
  client = self._build_client()
92
  image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
93
- response = client.models.generate_content(
94
- model=model_id,
95
- contents=[image_part, prompt],
96
- )
 
 
 
 
 
 
 
97
  return response.text or ""
 
90
  raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
91
  client = self._build_client()
92
  image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
93
+ try:
94
+ response = client.models.generate_content(
95
+ model=model_id,
96
+ contents=[image_part, prompt],
97
+ )
98
+ except Exception as exc:
99
+ logger.error(
100
+ "Appel API Vertex AI Γ©chouΓ©",
101
+ extra={"model": model_id, "error": str(exc)},
102
+ )
103
+ raise RuntimeError(f"Erreur API Vertex AI ({model_id}) : {exc}") from exc
104
  return response.text or ""
backend/app/services/corpus_runner.py CHANGED
@@ -17,7 +17,6 @@ from sqlalchemy import select
17
  # 3. local
18
  from app.models.database import async_session_factory
19
  from app.models.job import JobModel
20
- from app.services.job_runner import execute_page_job
21
 
22
  logger = logging.getLogger(__name__)
23
 
@@ -54,6 +53,8 @@ async def execute_corpus_job(corpus_id: str) -> dict:
54
  )
55
 
56
  # ExΓ©cution sΓ©quentielle β€” chaque job gΓ¨re sa propre session
 
 
57
  for job_id in job_ids:
58
  await execute_page_job(job_id)
59
 
 
17
  # 3. local
18
  from app.models.database import async_session_factory
19
  from app.models.job import JobModel
 
20
 
21
  logger = logging.getLogger(__name__)
22
 
 
53
  )
54
 
55
  # ExΓ©cution sΓ©quentielle β€” chaque job gΓ¨re sa propre session
56
+ from app.services.job_runner import execute_page_job
57
+
58
  for job_id in job_ids:
59
  await execute_page_job(job_id)
60
 
backend/app/services/export/alto.py CHANGED
@@ -1,7 +1,7 @@
1
  """
2
  GΓ©nΓ©rateur ALTO v4 depuis un PageMaster validΓ© (R02).
3
 
4
- Source canonique : PageMaster uniquement β€” jamais la rΓ©ponse brute gemini_raw.json.
5
  bbox [x, y, width, height] β†’ HPOS / VPOS / WIDTH / HEIGHT (correspondance directe).
6
 
7
  Mapping RegionType β†’ Γ©lΓ©ment ALTO :
@@ -82,7 +82,7 @@ def _build_text_block(
82
  text = fallback_text
83
 
84
  if not text:
85
- return # TextBlock vide β€” valide ALTO
86
 
87
  x, y, w, h = region.bbox
88
  line_el = etree.SubElement(
@@ -160,11 +160,7 @@ def generate_alto(master: PageMaster) -> str:
160
  etree.SubElement(desc, _a("MeasurementUnit")).text = "pixel"
161
 
162
  src_info = etree.SubElement(desc, _a("sourceImageInformation"))
163
- file_name = (
164
- master.image.get("original_url")
165
- or master.image.get("derivative_web")
166
- or master.page_id
167
- )
168
  etree.SubElement(src_info, _a("fileName")).text = str(file_name)
169
 
170
  if master.processing:
@@ -185,8 +181,8 @@ def generate_alto(master: PageMaster) -> str:
185
  # ── Layout ─────────────────────────────────────────────────────────────
186
  layout_el = etree.SubElement(root, _a("Layout"))
187
 
188
- width = int(master.image.get("width", 0))
189
- height = int(master.image.get("height", 0))
190
 
191
  page_id_safe = master.page_id.replace(" ", "_")
192
  page_el = etree.SubElement(
 
1
  """
2
  GΓ©nΓ©rateur ALTO v4 depuis un PageMaster validΓ© (R02).
3
 
4
+ Source canonique : PageMaster uniquement β€” jamais la rΓ©ponse brute ai_raw.json.
5
  bbox [x, y, width, height] β†’ HPOS / VPOS / WIDTH / HEIGHT (correspondance directe).
6
 
7
  Mapping RegionType β†’ Γ©lΓ©ment ALTO :
 
82
  text = fallback_text
83
 
84
  if not text:
85
+ return # TextBlock sans TextLine β€” valide ALTO, rΓ©gion visible dans le layout
86
 
87
  x, y, w, h = region.bbox
88
  line_el = etree.SubElement(
 
160
  etree.SubElement(desc, _a("MeasurementUnit")).text = "pixel"
161
 
162
  src_info = etree.SubElement(desc, _a("sourceImageInformation"))
163
+ file_name = master.image.master or master.image.derivative_web or master.page_id
 
 
 
 
164
  etree.SubElement(src_info, _a("fileName")).text = str(file_name)
165
 
166
  if master.processing:
 
181
  # ── Layout ─────────────────────────────────────────────────────────────
182
  layout_el = etree.SubElement(root, _a("Layout"))
183
 
184
+ width = master.image.width
185
+ height = master.image.height
186
 
187
  page_id_safe = master.page_id.replace(" ", "_")
188
  page_el = etree.SubElement(
backend/app/services/export/iiif.py CHANGED
@@ -74,7 +74,7 @@ def generate_manifest(
74
 
75
  manuscript_id = manuscript_meta["manuscript_id"]
76
  label = manuscript_meta["label"]
77
- language = manuscript_meta.get("language") or "none"
78
 
79
  # Pages dans l'ordre de sΓ©quence (rΓ¨gle absolue β€” structMap PHYSICAL)
80
  pages = sorted(masters, key=lambda m: m.sequence)
@@ -102,17 +102,17 @@ def generate_manifest(
102
  canvas_id = (
103
  f"{base_url}/api/v1/manuscripts/{manuscript_id}/canvas/{page.page_id}"
104
  )
105
- width = int(page.image.get("width", 0))
106
- height = int(page.image.get("height", 0))
107
 
108
  annotation_page_id = f"{canvas_id}/annotation-page/1"
109
  annotation_id = f"{canvas_id}/annotation/painting"
110
- image_url = page.image.get("original_url", "")
111
 
112
  canvas: dict = {
113
  "id": canvas_id,
114
  "type": "Canvas",
115
- "label": {"none": [f"Folio {page.folio_label}"]},
116
  "width": width,
117
  "height": height,
118
  "items": [
 
74
 
75
  manuscript_id = manuscript_meta["manuscript_id"]
76
  label = manuscript_meta["label"]
77
+ language = manuscript_meta.get("language") or "en"
78
 
79
  # Pages dans l'ordre de sΓ©quence (rΓ¨gle absolue β€” structMap PHYSICAL)
80
  pages = sorted(masters, key=lambda m: m.sequence)
 
102
  canvas_id = (
103
  f"{base_url}/api/v1/manuscripts/{manuscript_id}/canvas/{page.page_id}"
104
  )
105
+ width = page.image.width
106
+ height = page.image.height
107
 
108
  annotation_page_id = f"{canvas_id}/annotation-page/1"
109
  annotation_id = f"{canvas_id}/annotation/painting"
110
+ image_url = page.image.master or ""
111
 
112
  canvas: dict = {
113
  "id": canvas_id,
114
  "type": "Canvas",
115
+ "label": {language: [f"Folio {page.folio_label}"]},
116
  "width": width,
117
  "height": height,
118
  "items": [
backend/app/services/export/mets.py CHANGED
@@ -182,7 +182,7 @@ def generate_mets(
182
  f_master = _el(grp_master, f"{_M}file", {"ID": f"IMG_MASTER_{sid}", "MIMETYPE": "image/jpeg"})
183
  _el(f_master, f"{_M}FLocat", {
184
  "LOCTYPE": "URL",
185
- f"{_XL}href": page.image.get("original_url", ""),
186
  f"{_XL}type": "simple",
187
  })
188
 
@@ -191,12 +191,17 @@ def generate_mets(
191
  _el(f_deriv, f"{_M}FLocat", {
192
  "LOCTYPE": "OTHER",
193
  "OTHERLOCTYPE": "filepath",
194
- f"{_XL}href": page.image.get("derivative_web", ""),
195
  f"{_XL}type": "simple",
196
  })
197
 
198
- # ALTO
199
  alto_p = _alto_path(corpus_slug, page.folio_label, base_data_dir)
 
 
 
 
 
200
  f_alto = _el(grp_alto, f"{_M}file", {"ID": f"ALTO_{sid}", "MIMETYPE": "text/xml"})
201
  _el(f_alto, f"{_M}FLocat", {
202
  "LOCTYPE": "OTHER",
 
182
  f_master = _el(grp_master, f"{_M}file", {"ID": f"IMG_MASTER_{sid}", "MIMETYPE": "image/jpeg"})
183
  _el(f_master, f"{_M}FLocat", {
184
  "LOCTYPE": "URL",
185
+ f"{_XL}href": page.image.master or "",
186
  f"{_XL}type": "simple",
187
  })
188
 
 
191
  _el(f_deriv, f"{_M}FLocat", {
192
  "LOCTYPE": "OTHER",
193
  "OTHERLOCTYPE": "filepath",
194
+ f"{_XL}href": page.image.derivative_web or "",
195
  f"{_XL}type": "simple",
196
  })
197
 
198
+ # ALTO (rΓ©fΓ©rence conditionnelle β€” warning si le fichier n'existe pas encore)
199
  alto_p = _alto_path(corpus_slug, page.folio_label, base_data_dir)
200
+ if not Path(alto_p).exists():
201
+ logger.warning(
202
+ "Fichier ALTO absent β€” la rΓ©fΓ©rence METS sera cassΓ©e tant que l'ALTO n'est pas gΓ©nΓ©rΓ©",
203
+ extra={"alto_path": alto_p, "page_id": page.page_id},
204
+ )
205
  f_alto = _el(grp_alto, f"{_M}file", {"ID": f"ALTO_{sid}", "MIMETYPE": "text/xml"})
206
  _el(f_alto, f"{_M}FLocat", {
207
  "LOCTYPE": "OTHER",
backend/app/services/ingest/iiif_fetcher.py CHANGED
@@ -17,7 +17,6 @@ _HEADERS = {
17
  "+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
18
  ),
19
  "Accept": "image/jpeg,image/png,image/*,*/*",
20
- "Referer": "https://gallica.bnf.fr/",
21
  }
22
 
23
 
 
17
  "+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
18
  ),
19
  "Accept": "image/jpeg,image/png,image/*,*/*",
 
20
  }
21
 
22
 
backend/app/services/job_runner.py CHANGED
@@ -33,8 +33,6 @@ from app.models.job import JobModel
33
  from app.models.model_config_db import ModelConfigDB
34
  from app.schemas.corpus_profile import CorpusProfile
35
  from app.schemas.model_config import ModelConfig, ProviderType
36
- from app.services.ai.analyzer import run_primary_analysis
37
- from app.services.export.alto import generate_alto, write_alto
38
  from app.services.image.normalizer import create_derivatives, fetch_and_normalize
39
 
40
  logger = logging.getLogger(__name__)
@@ -148,6 +146,8 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
148
  )
149
 
150
  # ── 6. Analyse primaire IA (R05 : double stockage) ───────────────────
 
 
151
  page_master = run_primary_analysis(
152
  derivative_image_path=Path(image_info.derivative_path),
153
  corpus_profile=corpus_profile,
@@ -163,6 +163,8 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
163
  )
164
 
165
  # ── 7. GΓ©nΓ©rer et Γ©crire l'ALTO XML ──────────────────────────────────
 
 
166
  alto_xml = generate_alto(page_master)
167
  alto_path = (
168
  data_dir
 
33
  from app.models.model_config_db import ModelConfigDB
34
  from app.schemas.corpus_profile import CorpusProfile
35
  from app.schemas.model_config import ModelConfig, ProviderType
 
 
36
  from app.services.image.normalizer import create_derivatives, fetch_and_normalize
37
 
38
  logger = logging.getLogger(__name__)
 
146
  )
147
 
148
  # ── 6. Analyse primaire IA (R05 : double stockage) ───────────────────
149
+ from app.services.ai.analyzer import run_primary_analysis
150
+
151
  page_master = run_primary_analysis(
152
  derivative_image_path=Path(image_info.derivative_path),
153
  corpus_profile=corpus_profile,
 
163
  )
164
 
165
  # ── 7. GΓ©nΓ©rer et Γ©crire l'ALTO XML ──────────────────────────────────
166
+ from app.services.export.alto import generate_alto, write_alto
167
+
168
  alto_xml = generate_alto(page_master)
169
  alto_path = (
170
  data_dir
backend/pyproject.toml CHANGED
@@ -11,6 +11,7 @@ dependencies = [
11
  "fastapi>=0.111",
12
  "uvicorn[standard]>=0.29",
13
  "pydantic>=2.7",
 
14
  "sqlalchemy>=2.0",
15
  "aiosqlite>=0.20",
16
  "google-genai>=1.0",
 
11
  "fastapi>=0.111",
12
  "uvicorn[standard]>=0.29",
13
  "pydantic>=2.7",
14
+ "pydantic-settings>=2.0",
15
  "sqlalchemy>=2.0",
16
  "aiosqlite>=0.20",
17
  "google-genai>=1.0",
backend/tests/conftest_api.py CHANGED
@@ -51,10 +51,11 @@ async def async_client(db_session: AsyncSession):
51
 
52
  app.dependency_overrides[get_db] = _override_get_db
53
  # Les background tasks (execute_corpus_job, execute_page_job) crΓ©ent leur
54
- # propre session via async_session_factory. On les neutralise pour Γ©viter
55
- # qu'elles tentent de se connecter Γ  la BDD rΓ©elle pendant les tests d'API.
56
- with patch("app.api.v1.jobs.execute_corpus_job", AsyncMock(return_value=None)), \
57
- patch("app.api.v1.jobs.execute_page_job", AsyncMock(return_value=None)):
 
58
  async with AsyncClient(
59
  transport=ASGITransport(app=app), base_url="http://test"
60
  ) as client:
 
51
 
52
  app.dependency_overrides[get_db] = _override_get_db
53
  # Les background tasks (execute_corpus_job, execute_page_job) crΓ©ent leur
54
+ # propre session via async_session_factory. On les neutralise en mockant
55
+ # les modules sources pour Γ©viter qu'elles tentent de se connecter Γ  la
56
+ # BDD rΓ©elle pendant les tests d'API.
57
+ with patch("app.services.corpus_runner.execute_corpus_job", AsyncMock(return_value={"total": 0, "done": 0, "failed": 0})), \
58
+ patch("app.services.job_runner.execute_page_job", AsyncMock(return_value=None)):
59
  async with AsyncClient(
60
  transport=ASGITransport(app=app), base_url="http://test"
61
  ) as client:
backend/tests/test_ai_analyzer.py CHANGED
@@ -3,7 +3,7 @@ Tests du pipeline d'analyse IA :
3
  - prompt_loader : chargement + rendu des templates
4
  - client_factory : construction du genai.Client selon le provider
5
  - response_parser: parsing JSON brut β†’ layout + OCRResult
6
- - master_writer : Γ©criture gemini_raw.json et master.json
7
  - analyzer : run_primary_analysis (end-to-end mockΓ©)
8
  """
9
  # 1. stdlib
@@ -31,7 +31,7 @@ from app.schemas.model_config import ModelConfig, ProviderType
31
  from app.schemas.page_master import OCRResult, PageMaster
32
  from app.services.ai.analyzer import run_primary_analysis
33
  from app.services.ai.client_factory import build_client
34
- from app.services.ai.master_writer import write_gemini_raw, write_master_json
35
  from app.services.ai.prompt_loader import load_and_render_prompt
36
  from app.services.ai.response_parser import ParseError, parse_ai_response
37
 
@@ -390,35 +390,35 @@ def test_parse_empty_regions_list():
390
 
391
 
392
  # ---------------------------------------------------------------------------
393
- # Tests β€” write_gemini_raw / write_master_json
394
  # ---------------------------------------------------------------------------
395
 
396
- def test_write_gemini_raw_creates_file(tmp_path):
397
- out = tmp_path / "page" / "gemini_raw.json"
398
- write_gemini_raw("raw AI text here", out)
399
 
400
  assert out.exists()
401
 
402
 
403
- def test_write_gemini_raw_valid_json(tmp_path):
404
- out = tmp_path / "gemini_raw.json"
405
- write_gemini_raw('{"not": "valid json from AI"}', out)
406
 
407
  content = json.loads(out.read_text(encoding="utf-8"))
408
  assert "response_text" in content
409
  assert content["response_text"] == '{"not": "valid json from AI"}'
410
 
411
 
412
- def test_write_gemini_raw_creates_parent_dirs(tmp_path):
413
- out = tmp_path / "deep" / "nested" / "dir" / "gemini_raw.json"
414
- write_gemini_raw("text", out)
415
  assert out.exists()
416
 
417
 
418
- def test_write_gemini_raw_with_non_json_text(tmp_path):
419
- """MΓͺme si le texte brut est invalide, gemini_raw.json est créé."""
420
- out = tmp_path / "gemini_raw.json"
421
- write_gemini_raw("this is not json at all", out)
422
 
423
  content = json.loads(out.read_text(encoding="utf-8"))
424
  assert content["response_text"] == "this is not json at all"
@@ -432,7 +432,7 @@ def _make_page_master() -> PageMaster:
432
  folio_label="0001r",
433
  sequence=1,
434
  image={
435
- "original_url": "https://example.com/img.jpg",
436
  "derivative_web": "/data/deriv.jpg",
437
  "thumbnail": "/data/thumb.jpg",
438
  "width": 1500,
@@ -440,10 +440,11 @@ def _make_page_master() -> PageMaster:
440
  },
441
  layout={"regions": []},
442
  processing={
 
443
  "model_id": "gemini-2.0-flash",
444
  "model_display_name": "Gemini 2.0 Flash",
445
  "prompt_version": "prompts/medieval-illuminated/primary_v1.txt",
446
- "raw_response_path": "/data/gemini_raw.json",
447
  "processed_at": datetime.now(tz=timezone.utc),
448
  },
449
  )
@@ -568,12 +569,12 @@ def test_run_primary_analysis_files_created(tmp_path):
568
  )
569
 
570
  page_dir = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r"
571
- assert (page_dir / "gemini_raw.json").exists()
572
  assert (page_dir / "master.json").exists()
573
 
574
 
575
  def test_run_primary_analysis_raw_written_before_parse(tmp_path):
576
- """gemini_raw.json est Γ©crit AVANT que le parsing Γ©choue (R05)."""
577
  prompt_rel = "prompts/medieval-illuminated/primary_v1.txt"
578
  _setup_prompt_file(tmp_path, prompt_rel)
579
  deriv_path = _setup_derivative(tmp_path)
@@ -596,8 +597,8 @@ def test_run_primary_analysis_raw_written_before_parse(tmp_path):
596
  project_root=tmp_path,
597
  )
598
 
599
- # gemini_raw.json existe malgrΓ© l'Γ©chec de parsing
600
- raw_path = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r" / "gemini_raw.json"
601
  assert raw_path.exists()
602
 
603
  # master.json N'existe PAS (parsing a οΏ½οΏ½chouΓ©)
@@ -663,9 +664,9 @@ def test_run_primary_analysis_image_dict(tmp_path):
663
  project_root=tmp_path,
664
  )
665
 
666
- assert result.image["original_url"] == image_info.original_url
667
- assert result.image["width"] == image_info.derivative_width
668
- assert result.image["height"] == image_info.derivative_height
669
 
670
 
671
  def test_run_primary_analysis_regions_in_layout(tmp_path):
 
3
  - prompt_loader : chargement + rendu des templates
4
  - client_factory : construction du genai.Client selon le provider
5
  - response_parser: parsing JSON brut β†’ layout + OCRResult
6
+ - master_writer : Γ©criture ai_raw.json et master.json
7
  - analyzer : run_primary_analysis (end-to-end mockΓ©)
8
  """
9
  # 1. stdlib
 
31
  from app.schemas.page_master import OCRResult, PageMaster
32
  from app.services.ai.analyzer import run_primary_analysis
33
  from app.services.ai.client_factory import build_client
34
+ from app.services.ai.master_writer import write_ai_raw, write_master_json
35
  from app.services.ai.prompt_loader import load_and_render_prompt
36
  from app.services.ai.response_parser import ParseError, parse_ai_response
37
 
 
390
 
391
 
392
  # ---------------------------------------------------------------------------
393
+ # Tests β€” write_ai_raw / write_master_json
394
  # ---------------------------------------------------------------------------
395
 
396
+ def test_write_ai_raw_creates_file(tmp_path):
397
+ out = tmp_path / "page" / "ai_raw.json"
398
+ write_ai_raw("raw AI text here", out)
399
 
400
  assert out.exists()
401
 
402
 
403
+ def test_write_ai_raw_valid_json(tmp_path):
404
+ out = tmp_path / "ai_raw.json"
405
+ write_ai_raw('{"not": "valid json from AI"}', out)
406
 
407
  content = json.loads(out.read_text(encoding="utf-8"))
408
  assert "response_text" in content
409
  assert content["response_text"] == '{"not": "valid json from AI"}'
410
 
411
 
412
+ def test_write_ai_raw_creates_parent_dirs(tmp_path):
413
+ out = tmp_path / "deep" / "nested" / "dir" / "ai_raw.json"
414
+ write_ai_raw("text", out)
415
  assert out.exists()
416
 
417
 
418
+ def test_write_ai_raw_with_non_json_text(tmp_path):
419
+ """MΓͺme si le texte brut est invalide, ai_raw.json est créé."""
420
+ out = tmp_path / "ai_raw.json"
421
+ write_ai_raw("this is not json at all", out)
422
 
423
  content = json.loads(out.read_text(encoding="utf-8"))
424
  assert content["response_text"] == "this is not json at all"
 
432
  folio_label="0001r",
433
  sequence=1,
434
  image={
435
+ "master": "https://example.com/img.jpg",
436
  "derivative_web": "/data/deriv.jpg",
437
  "thumbnail": "/data/thumb.jpg",
438
  "width": 1500,
 
440
  },
441
  layout={"regions": []},
442
  processing={
443
+ "provider": "google_ai_studio",
444
  "model_id": "gemini-2.0-flash",
445
  "model_display_name": "Gemini 2.0 Flash",
446
  "prompt_version": "prompts/medieval-illuminated/primary_v1.txt",
447
+ "raw_response_path": "/data/ai_raw.json",
448
  "processed_at": datetime.now(tz=timezone.utc),
449
  },
450
  )
 
569
  )
570
 
571
  page_dir = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r"
572
+ assert (page_dir / "ai_raw.json").exists()
573
  assert (page_dir / "master.json").exists()
574
 
575
 
576
  def test_run_primary_analysis_raw_written_before_parse(tmp_path):
577
+ """ai_raw.json est Γ©crit AVANT que le parsing Γ©choue (R05)."""
578
  prompt_rel = "prompts/medieval-illuminated/primary_v1.txt"
579
  _setup_prompt_file(tmp_path, prompt_rel)
580
  deriv_path = _setup_derivative(tmp_path)
 
597
  project_root=tmp_path,
598
  )
599
 
600
+ # ai_raw.json existe malgrΓ© l'Γ©chec de parsing
601
+ raw_path = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r" / "ai_raw.json"
602
  assert raw_path.exists()
603
 
604
  # master.json N'existe PAS (parsing a οΏ½οΏ½chouΓ©)
 
664
  project_root=tmp_path,
665
  )
666
 
667
+ assert result.image.master == image_info.original_url
668
+ assert result.image.width == image_info.derivative_width
669
+ assert result.image.height == image_info.derivative_height
670
 
671
 
672
  def test_run_primary_analysis_regions_in_layout(tmp_path):
backend/tests/test_api_corrections.py CHANGED
@@ -75,7 +75,7 @@ def _make_master(
75
  "manuscript_id": "ms-test",
76
  "folio_label": "f001r",
77
  "sequence": 1,
78
- "image": {"original_url": "https://example.com/f.jpg", "width": 1500, "height": 2000},
79
  "layout": {"regions": []},
80
  "ocr": {
81
  "diplomatic_text": "Incipit liber primus",
@@ -238,13 +238,13 @@ async def test_corrections_archives_old_version(async_client, db_session, monkey
238
  ms = await _create_manuscript(db_session, corpus.id)
239
  page = await _create_page(db_session, ms.id)
240
 
241
- written_paths: list[str] = []
242
 
243
  monkeypatch.setattr(Path, "exists", lambda self: True)
244
  monkeypatch.setattr(Path, "read_text", lambda self, **kw: _make_master(page.id, version=1))
245
 
246
  def _capture_write(self: Path, content: str, **kw: object) -> None:
247
- written_paths.append(str(self))
248
 
249
  monkeypatch.setattr(Path, "write_text", _capture_write)
250
 
@@ -254,10 +254,17 @@ async def test_corrections_archives_old_version(async_client, db_session, monkey
254
  )
255
 
256
  # Deux Γ©critures attendues : master_v1.json (archive) + master.json (nouveau)
 
257
  assert len(written_paths) >= 2
258
  assert any("master_v1.json" in p for p in written_paths)
259
  assert any("master.json" in p and "master_v" not in p for p in written_paths)
260
 
 
 
 
 
 
 
261
 
262
  @pytest.mark.asyncio
263
  async def test_corrections_multiple_fields(async_client, db_session, monkeypatch):
 
75
  "manuscript_id": "ms-test",
76
  "folio_label": "f001r",
77
  "sequence": 1,
78
+ "image": {"master": "https://example.com/f.jpg", "width": 1500, "height": 2000},
79
  "layout": {"regions": []},
80
  "ocr": {
81
  "diplomatic_text": "Incipit liber primus",
 
238
  ms = await _create_manuscript(db_session, corpus.id)
239
  page = await _create_page(db_session, ms.id)
240
 
241
+ written_data: dict[str, str] = {}
242
 
243
  monkeypatch.setattr(Path, "exists", lambda self: True)
244
  monkeypatch.setattr(Path, "read_text", lambda self, **kw: _make_master(page.id, version=1))
245
 
246
  def _capture_write(self: Path, content: str, **kw: object) -> None:
247
+ written_data[str(self)] = content
248
 
249
  monkeypatch.setattr(Path, "write_text", _capture_write)
250
 
 
254
  )
255
 
256
  # Deux Γ©critures attendues : master_v1.json (archive) + master.json (nouveau)
257
+ written_paths = list(written_data.keys())
258
  assert len(written_paths) >= 2
259
  assert any("master_v1.json" in p for p in written_paths)
260
  assert any("master.json" in p and "master_v" not in p for p in written_paths)
261
 
262
+ # VΓ©rifier que l'archive contient bien la version originale (v1)
263
+ import json as _json
264
+ archive_path = next(p for p in written_paths if "master_v1.json" in p)
265
+ archive_data = _json.loads(written_data[archive_path])
266
+ assert archive_data["editorial"]["version"] == 1
267
+
268
 
269
  @pytest.mark.asyncio
270
  async def test_corrections_multiple_fields(async_client, db_session, monkeypatch):
backend/tests/test_api_export.py CHANGED
@@ -83,7 +83,7 @@ def _make_master_json(page_id: str, folio_label: str, sequence: int) -> str:
83
  "folio_label": folio_label,
84
  "sequence": sequence,
85
  "image": {
86
- "original_url": f"https://example.com/{page_id}.jpg",
87
  "derivative_web": f"/data/deriv/{page_id}.jpg",
88
  "thumbnail": f"/data/thumb/{page_id}.jpg",
89
  "width": 1500,
 
83
  "folio_label": folio_label,
84
  "sequence": sequence,
85
  "image": {
86
+ "master": f"https://example.com/{page_id}.jpg",
87
  "derivative_web": f"/data/deriv/{page_id}.jpg",
88
  "thumbnail": f"/data/thumb/{page_id}.jpg",
89
  "width": 1500,
backend/tests/test_api_ingest.py CHANGED
@@ -457,6 +457,15 @@ async def test_reingest_manifest_skips_existing_pages(async_client, db_session,
457
  assert data2["pages_created"] == 0
458
  assert data2["pages_skipped"] == 2
459
 
 
 
 
 
 
 
 
 
 
460
 
461
  @pytest.mark.asyncio
462
  async def test_reingest_images_skips_existing_pages(async_client, db_session):
 
457
  assert data2["pages_created"] == 0
458
  assert data2["pages_skipped"] == 2
459
 
460
+ # VΓ©rifier que la BDD n'a bien que 2 pages (pas de doublons)
461
+ from sqlalchemy import select as sa_select
462
+ from app.models.corpus import PageModel
463
+ page_result = await db_session.execute(
464
+ sa_select(PageModel).where(PageModel.manuscript_id == data1["manuscript_id"])
465
+ )
466
+ pages_in_db = list(page_result.scalars().all())
467
+ assert len(pages_in_db) == 2
468
+
469
 
470
  @pytest.mark.asyncio
471
  async def test_reingest_images_skips_existing_pages(async_client, db_session):
backend/tests/test_api_models.py CHANGED
@@ -94,7 +94,7 @@ async def test_get_models_endpoint_removed(async_client):
94
  @pytest.mark.asyncio
95
  async def test_refresh_models_ok(async_client, monkeypatch):
96
  monkeypatch.setattr(
97
- models_api_module, "list_all_models", lambda: _MOCK_MODELS
98
  )
99
  response = await async_client.post("/api/v1/models/refresh")
100
  assert response.status_code == 200
@@ -103,7 +103,7 @@ async def test_refresh_models_ok(async_client, monkeypatch):
103
  @pytest.mark.asyncio
104
  async def test_refresh_models_has_timestamp(async_client, monkeypatch):
105
  monkeypatch.setattr(
106
- models_api_module, "list_all_models", lambda: _MOCK_MODELS
107
  )
108
  data = (await async_client.post("/api/v1/models/refresh")).json()
109
  assert "refreshed_at" in data
@@ -113,7 +113,7 @@ async def test_refresh_models_has_timestamp(async_client, monkeypatch):
113
  @pytest.mark.asyncio
114
  async def test_refresh_models_count(async_client, monkeypatch):
115
  monkeypatch.setattr(
116
- models_api_module, "list_all_models", lambda: _MOCK_MODELS
117
  )
118
  data = (await async_client.post("/api/v1/models/refresh")).json()
119
  assert data["count"] == 2
@@ -123,7 +123,7 @@ async def test_refresh_models_count(async_client, monkeypatch):
123
  @pytest.mark.asyncio
124
  async def test_refresh_models_structure(async_client, monkeypatch):
125
  monkeypatch.setattr(
126
- models_api_module, "list_all_models", lambda: _MOCK_MODELS
127
  )
128
  data = (await async_client.post("/api/v1/models/refresh")).json()
129
  assert "models" in data
 
94
  @pytest.mark.asyncio
95
  async def test_refresh_models_ok(async_client, monkeypatch):
96
  monkeypatch.setattr(
97
+ "app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
98
  )
99
  response = await async_client.post("/api/v1/models/refresh")
100
  assert response.status_code == 200
 
103
  @pytest.mark.asyncio
104
  async def test_refresh_models_has_timestamp(async_client, monkeypatch):
105
  monkeypatch.setattr(
106
+ "app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
107
  )
108
  data = (await async_client.post("/api/v1/models/refresh")).json()
109
  assert "refreshed_at" in data
 
113
  @pytest.mark.asyncio
114
  async def test_refresh_models_count(async_client, monkeypatch):
115
  monkeypatch.setattr(
116
+ "app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
117
  )
118
  data = (await async_client.post("/api/v1/models/refresh")).json()
119
  assert data["count"] == 2
 
123
  @pytest.mark.asyncio
124
  async def test_refresh_models_structure(async_client, monkeypatch):
125
  monkeypatch.setattr(
126
+ "app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
127
  )
128
  data = (await async_client.post("/api/v1/models/refresh")).json()
129
  assert "models" in data
backend/tests/test_api_pages.py CHANGED
@@ -87,7 +87,7 @@ def _make_master_json(page_id: str, corpus_profile: str = "medieval-illuminated"
87
  "folio_label": "f001r",
88
  "sequence": 1,
89
  "image": {
90
- "original_url": "https://example.com/f001r.jpg",
91
  "derivative_web": "/data/deriv/f001r.jpg",
92
  "thumbnail": "/data/thumb/f001r.jpg",
93
  "width": 1500,
 
87
  "folio_label": "f001r",
88
  "sequence": 1,
89
  "image": {
90
+ "master": "https://example.com/f001r.jpg",
91
  "derivative_web": "/data/deriv/f001r.jpg",
92
  "thumbnail": "/data/thumb/f001r.jpg",
93
  "width": 1500,
backend/tests/test_api_providers.py CHANGED
@@ -90,7 +90,7 @@ _MOCK_MISTRAL_MODELS = [
90
 
91
  @pytest.mark.asyncio
92
  async def test_list_providers_returns_list(async_client, monkeypatch):
93
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
94
  resp = await async_client.get("/api/v1/providers")
95
  assert resp.status_code == 200
96
  assert isinstance(resp.json(), list)
@@ -98,14 +98,14 @@ async def test_list_providers_returns_list(async_client, monkeypatch):
98
 
99
  @pytest.mark.asyncio
100
  async def test_list_providers_count(async_client, monkeypatch):
101
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
102
  data = (await async_client.get("/api/v1/providers")).json()
103
  assert len(data) == 4 # 4 providers connus
104
 
105
 
106
  @pytest.mark.asyncio
107
  async def test_list_providers_fields(async_client, monkeypatch):
108
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
109
  data = (await async_client.get("/api/v1/providers")).json()
110
  p = data[0]
111
  assert "provider_type" in p
@@ -116,7 +116,7 @@ async def test_list_providers_fields(async_client, monkeypatch):
116
 
117
  @pytest.mark.asyncio
118
  async def test_list_providers_all_unavailable(async_client, monkeypatch):
119
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
120
  data = (await async_client.get("/api/v1/providers")).json()
121
  assert all(not p["available"] for p in data)
122
  assert all(p["model_count"] == 0 for p in data)
@@ -124,7 +124,7 @@ async def test_list_providers_all_unavailable(async_client, monkeypatch):
124
 
125
  @pytest.mark.asyncio
126
  async def test_list_providers_google_available(async_client, monkeypatch):
127
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_GOOGLE_ONLY)
128
  data = (await async_client.get("/api/v1/providers")).json()
129
  google = next(p for p in data if p["provider_type"] == "google_ai_studio")
130
  assert google["available"] is True
@@ -133,7 +133,7 @@ async def test_list_providers_google_available(async_client, monkeypatch):
133
 
134
  @pytest.mark.asyncio
135
  async def test_list_providers_mistral_available(async_client, monkeypatch):
136
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_GOOGLE_AND_MISTRAL)
137
  data = (await async_client.get("/api/v1/providers")).json()
138
  mistral = next(p for p in data if p["provider_type"] == "mistral")
139
  assert mistral["available"] is True
@@ -143,7 +143,7 @@ async def test_list_providers_mistral_available(async_client, monkeypatch):
143
  @pytest.mark.asyncio
144
  async def test_list_providers_includes_mistral_type(async_client, monkeypatch):
145
  """Mistral est toujours dans la liste mΓͺme si indisponible."""
146
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
147
  data = (await async_client.get("/api/v1/providers")).json()
148
  types_ = [p["provider_type"] for p in data]
149
  assert "mistral" in types_
@@ -156,7 +156,7 @@ async def test_list_providers_includes_mistral_type(async_client, monkeypatch):
156
  @pytest.mark.asyncio
157
  async def test_get_provider_models_google(async_client, monkeypatch):
158
  monkeypatch.setattr(
159
- models_api_module, "list_models_for_provider", lambda ptype: _MOCK_GOOGLE_MODELS
160
  )
161
  resp = await async_client.get("/api/v1/providers/google_ai_studio/models")
162
  assert resp.status_code == 200
@@ -166,7 +166,7 @@ async def test_get_provider_models_google(async_client, monkeypatch):
166
  @pytest.mark.asyncio
167
  async def test_get_provider_models_mistral(async_client, monkeypatch):
168
  monkeypatch.setattr(
169
- models_api_module, "list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
170
  )
171
  resp = await async_client.get("/api/v1/providers/mistral/models")
172
  assert resp.status_code == 200
@@ -189,7 +189,7 @@ async def test_get_provider_models_not_configured(async_client, monkeypatch):
189
  def _raise(ptype):
190
  raise RuntimeError("Variable d'environnement manquante : MISTRAL_API_KEY")
191
 
192
- monkeypatch.setattr(models_api_module, "list_models_for_provider", _raise)
193
  resp = await async_client.get("/api/v1/providers/mistral/models")
194
  assert resp.status_code == 503
195
 
@@ -197,7 +197,7 @@ async def test_get_provider_models_not_configured(async_client, monkeypatch):
197
  @pytest.mark.asyncio
198
  async def test_get_provider_models_fields(async_client, monkeypatch):
199
  monkeypatch.setattr(
200
- models_api_module, "list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
201
  )
202
  data = (await async_client.get("/api/v1/providers/mistral/models")).json()
203
  m = data[0]
 
90
 
91
  @pytest.mark.asyncio
92
  async def test_list_providers_returns_list(async_client, monkeypatch):
93
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
94
  resp = await async_client.get("/api/v1/providers")
95
  assert resp.status_code == 200
96
  assert isinstance(resp.json(), list)
 
98
 
99
  @pytest.mark.asyncio
100
  async def test_list_providers_count(async_client, monkeypatch):
101
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
102
  data = (await async_client.get("/api/v1/providers")).json()
103
  assert len(data) == 4 # 4 providers connus
104
 
105
 
106
  @pytest.mark.asyncio
107
  async def test_list_providers_fields(async_client, monkeypatch):
108
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
109
  data = (await async_client.get("/api/v1/providers")).json()
110
  p = data[0]
111
  assert "provider_type" in p
 
116
 
117
  @pytest.mark.asyncio
118
  async def test_list_providers_all_unavailable(async_client, monkeypatch):
119
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
120
  data = (await async_client.get("/api/v1/providers")).json()
121
  assert all(not p["available"] for p in data)
122
  assert all(p["model_count"] == 0 for p in data)
 
124
 
125
  @pytest.mark.asyncio
126
  async def test_list_providers_google_available(async_client, monkeypatch):
127
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_GOOGLE_ONLY)
128
  data = (await async_client.get("/api/v1/providers")).json()
129
  google = next(p for p in data if p["provider_type"] == "google_ai_studio")
130
  assert google["available"] is True
 
133
 
134
  @pytest.mark.asyncio
135
  async def test_list_providers_mistral_available(async_client, monkeypatch):
136
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_GOOGLE_AND_MISTRAL)
137
  data = (await async_client.get("/api/v1/providers")).json()
138
  mistral = next(p for p in data if p["provider_type"] == "mistral")
139
  assert mistral["available"] is True
 
143
  @pytest.mark.asyncio
144
  async def test_list_providers_includes_mistral_type(async_client, monkeypatch):
145
  """Mistral est toujours dans la liste mΓͺme si indisponible."""
146
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
147
  data = (await async_client.get("/api/v1/providers")).json()
148
  types_ = [p["provider_type"] for p in data]
149
  assert "mistral" in types_
 
156
  @pytest.mark.asyncio
157
  async def test_get_provider_models_google(async_client, monkeypatch):
158
  monkeypatch.setattr(
159
+ "app.services.ai.model_registry.list_models_for_provider", lambda ptype: _MOCK_GOOGLE_MODELS
160
  )
161
  resp = await async_client.get("/api/v1/providers/google_ai_studio/models")
162
  assert resp.status_code == 200
 
166
  @pytest.mark.asyncio
167
  async def test_get_provider_models_mistral(async_client, monkeypatch):
168
  monkeypatch.setattr(
169
+ "app.services.ai.model_registry.list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
170
  )
171
  resp = await async_client.get("/api/v1/providers/mistral/models")
172
  assert resp.status_code == 200
 
189
  def _raise(ptype):
190
  raise RuntimeError("Variable d'environnement manquante : MISTRAL_API_KEY")
191
 
192
+ monkeypatch.setattr("app.services.ai.model_registry.list_models_for_provider", _raise)
193
  resp = await async_client.get("/api/v1/providers/mistral/models")
194
  assert resp.status_code == 503
195
 
 
197
  @pytest.mark.asyncio
198
  async def test_get_provider_models_fields(async_client, monkeypatch):
199
  monkeypatch.setattr(
200
+ "app.services.ai.model_registry.list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
201
  )
202
  data = (await async_client.get("/api/v1/providers/mistral/models")).json()
203
  m = data[0]
backend/tests/test_api_search.py CHANGED
@@ -33,7 +33,7 @@ def _make_master(page_id: str, diplomatic_text: str = "", translation_fr: str =
33
  "manuscript_id": "ms-test",
34
  "folio_label": "f001r",
35
  "sequence": 1,
36
- "image": {"original_url": "https://example.com/f.jpg", "width": 1500, "height": 2000},
37
  "layout": {"regions": []},
38
  "ocr": {
39
  "diplomatic_text": diplomatic_text,
 
33
  "manuscript_id": "ms-test",
34
  "folio_label": "f001r",
35
  "sequence": 1,
36
+ "image": {"master": "https://example.com/f.jpg", "width": 1500, "height": 2000},
37
  "layout": {"regions": []},
38
  "ocr": {
39
  "diplomatic_text": diplomatic_text,
backend/tests/test_export_alto.py CHANGED
@@ -52,10 +52,11 @@ def _make_master(
52
  processing = None
53
  if with_processing:
54
  processing = ProcessingInfo(
 
55
  model_id="gemini-2.0-flash",
56
  model_display_name="Gemini 2.0 Flash",
57
  prompt_version="prompts/medieval-illuminated/primary_v1.txt",
58
- raw_response_path="/data/gemini_raw.json",
59
  processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
60
  )
61
  return PageMaster(
@@ -65,7 +66,7 @@ def _make_master(
65
  folio_label="0001r",
66
  sequence=sequence,
67
  image={
68
- "original_url": "https://example.com/img.jpg",
69
  "derivative_web": "/data/deriv.jpg",
70
  "thumbnail": "/data/thumb.jpg",
71
  "width": width,
 
52
  processing = None
53
  if with_processing:
54
  processing = ProcessingInfo(
55
+ provider="google_ai_studio",
56
  model_id="gemini-2.0-flash",
57
  model_display_name="Gemini 2.0 Flash",
58
  prompt_version="prompts/medieval-illuminated/primary_v1.txt",
59
+ raw_response_path="/data/ai_raw.json",
60
  processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
61
  )
62
  return PageMaster(
 
66
  folio_label="0001r",
67
  sequence=sequence,
68
  image={
69
+ "master": "https://example.com/img.jpg",
70
  "derivative_web": "/data/deriv.jpg",
71
  "thumbnail": "/data/thumb.jpg",
72
  "width": width,
backend/tests/test_export_iiif.py CHANGED
@@ -53,7 +53,7 @@ def _make_page(
53
  folio_label=folio_label,
54
  sequence=sequence,
55
  image={
56
- "original_url": original_url or f"https://example.com/{folio_label}.jpg",
57
  "derivative_web": f"/data/deriv/{folio_label}.jpg",
58
  "thumbnail": f"/data/thumb/{folio_label}.jpg",
59
  "width": width,
@@ -193,11 +193,11 @@ def test_manifest_label_uses_language_key(simple_manifest):
193
 
194
 
195
  def test_manifest_label_without_language_uses_none():
196
- """Sans champ language, la clΓ© de label est 'none'."""
197
  pages = [_make_page("ms-0001r", "0001r", 1)]
198
- meta = _base_meta() # pas de language
199
  manifest = generate_manifest(pages, meta, _BASE_URL)
200
- assert "none" in manifest["label"]
201
 
202
 
203
  def test_manifest_label_fr(chroniques_pages, chroniques_meta):
@@ -272,7 +272,7 @@ def test_canvas_order_respects_sequence():
272
  _make_page("ms-f002r", "f002r", 2),
273
  ]
274
  manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
275
- labels = [c["label"]["none"][0] for c in manifest["items"]]
276
  assert labels == ["Folio f001r", "Folio f002r", "Folio f003r"]
277
 
278
 
@@ -283,7 +283,7 @@ def test_canvas_order_large_sequence():
283
  random.shuffle(pages)
284
  manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
285
  sequences_in_label = [
286
- int(c["label"]["none"][0].replace("Folio f", "").replace("r", ""))
287
  for c in manifest["items"]
288
  ]
289
  assert sequences_in_label == list(range(1, 11))
@@ -344,7 +344,7 @@ def test_canvas_width_matches_image(beatus_pages, beatus_meta):
344
  # Trouve la page correspondante
345
  page_id = canvas["id"].split("/canvas/")[-1]
346
  page = next(p for p in beatus_pages if p.page_id == page_id)
347
- assert canvas["width"] == page.image["width"]
348
 
349
 
350
  def test_canvas_height_matches_image(beatus_pages, beatus_meta):
@@ -352,7 +352,7 @@ def test_canvas_height_matches_image(beatus_pages, beatus_meta):
352
  for canvas in manifest["items"]:
353
  page_id = canvas["id"].split("/canvas/")[-1]
354
  page = next(p for p in beatus_pages if p.page_id == page_id)
355
- assert canvas["height"] == page.image["height"]
356
 
357
 
358
  def test_canvas_dimensions_beatus_hr():
@@ -447,7 +447,7 @@ def test_annotation_body_id_is_original_url(beatus_pages, beatus_meta):
447
  page_id = canvas["id"].split("/canvas/")[-1]
448
  page = next(p for p in beatus_pages if p.page_id == page_id)
449
  body = canvas["items"][0]["items"][0]["body"]
450
- assert body["id"] == page.image["original_url"]
451
 
452
 
453
  def test_annotation_body_contains_gallica_url(beatus_pages, beatus_meta):
@@ -480,7 +480,10 @@ def test_base_url_trailing_slash_stripped():
480
  """Un base_url avec slash final ne gΓ©nΓ¨re pas de double slash dans les IDs."""
481
  pages = [_make_page("ms-0001r", "0001r", 1)]
482
  manifest = generate_manifest(pages, _base_meta(), "https://example.com/")
483
- assert "//" not in manifest["id"].replace("://", "X")
 
 
 
484
 
485
 
486
  # ---------------------------------------------------------------------------
 
53
  folio_label=folio_label,
54
  sequence=sequence,
55
  image={
56
+ "master": original_url or f"https://example.com/{folio_label}.jpg",
57
  "derivative_web": f"/data/deriv/{folio_label}.jpg",
58
  "thumbnail": f"/data/thumb/{folio_label}.jpg",
59
  "width": width,
 
193
 
194
 
195
  def test_manifest_label_without_language_uses_none():
196
+ """Sans champ language, la clΓ© de label est 'en' (dΓ©faut IIIF-compliant)."""
197
  pages = [_make_page("ms-0001r", "0001r", 1)]
198
+ meta = _base_meta() # pas de language β†’ dΓ©faut "en"
199
  manifest = generate_manifest(pages, meta, _BASE_URL)
200
+ assert "en" in manifest["label"]
201
 
202
 
203
  def test_manifest_label_fr(chroniques_pages, chroniques_meta):
 
272
  _make_page("ms-f002r", "f002r", 2),
273
  ]
274
  manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
275
+ labels = [c["label"]["en"][0] for c in manifest["items"]]
276
  assert labels == ["Folio f001r", "Folio f002r", "Folio f003r"]
277
 
278
 
 
283
  random.shuffle(pages)
284
  manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
285
  sequences_in_label = [
286
+ int(c["label"]["en"][0].replace("Folio f", "").replace("r", ""))
287
  for c in manifest["items"]
288
  ]
289
  assert sequences_in_label == list(range(1, 11))
 
344
  # Trouve la page correspondante
345
  page_id = canvas["id"].split("/canvas/")[-1]
346
  page = next(p for p in beatus_pages if p.page_id == page_id)
347
+ assert canvas["width"] == page.image.width
348
 
349
 
350
  def test_canvas_height_matches_image(beatus_pages, beatus_meta):
 
352
  for canvas in manifest["items"]:
353
  page_id = canvas["id"].split("/canvas/")[-1]
354
  page = next(p for p in beatus_pages if p.page_id == page_id)
355
+ assert canvas["height"] == page.image.height
356
 
357
 
358
  def test_canvas_dimensions_beatus_hr():
 
447
  page_id = canvas["id"].split("/canvas/")[-1]
448
  page = next(p for p in beatus_pages if p.page_id == page_id)
449
  body = canvas["items"][0]["items"][0]["body"]
450
+ assert body["id"] == page.image.master
451
 
452
 
453
  def test_annotation_body_contains_gallica_url(beatus_pages, beatus_meta):
 
480
  """Un base_url avec slash final ne gΓ©nΓ¨re pas de double slash dans les IDs."""
481
  pages = [_make_page("ms-0001r", "0001r", 1)]
482
  manifest = generate_manifest(pages, _base_meta(), "https://example.com/")
483
+ manifest_id = manifest["id"]
484
+ # Retirer le protocole puis vΓ©rifier qu'il n'y a pas de double slash
485
+ without_protocol = manifest_id.split("://", 1)[1]
486
+ assert "//" not in without_protocol
487
 
488
 
489
  # ---------------------------------------------------------------------------
backend/tests/test_export_mets.py CHANGED
@@ -66,10 +66,11 @@ def _make_page(
66
  processing = None
67
  if with_processing:
68
  processing = ProcessingInfo(
 
69
  model_id="gemini-2.0-flash",
70
  model_display_name="Gemini 2.0 Flash",
71
  prompt_version="prompts/medieval-illuminated/primary_v1.txt",
72
- raw_response_path=f"/data/corpora/test/pages/{folio_label}/gemini_raw.json",
73
  processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
74
  )
75
  ocr = OCRResult(diplomatic_text=ocr_text, language="la", confidence=0.90) if ocr_text else None
@@ -80,7 +81,7 @@ def _make_page(
80
  folio_label=folio_label,
81
  sequence=sequence,
82
  image={
83
- "original_url": original_url or f"https://example.com/{folio_label}.jpg",
84
  "derivative_web": derivative_web or f"/data/deriv/{folio_label}.jpg",
85
  "thumbnail": f"/data/thumb/{folio_label}.jpg",
86
  "width": 1500,
@@ -194,7 +195,9 @@ def test_generate_mets_namespace(beatus_pages, beatus_meta):
194
 
195
  def test_generate_mets_objid(beatus_pages, beatus_meta):
196
  root = _parse(generate_mets(beatus_pages, beatus_meta))
197
- assert root.get("OBJID") == "BnF-Latin-8878"
 
 
198
 
199
 
200
  def test_generate_mets_label(beatus_pages, beatus_meta):
 
66
  processing = None
67
  if with_processing:
68
  processing = ProcessingInfo(
69
+ provider="google_ai_studio",
70
  model_id="gemini-2.0-flash",
71
  model_display_name="Gemini 2.0 Flash",
72
  prompt_version="prompts/medieval-illuminated/primary_v1.txt",
73
+ raw_response_path=f"/data/corpora/test/pages/{folio_label}/ai_raw.json",
74
  processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
75
  )
76
  ocr = OCRResult(diplomatic_text=ocr_text, language="la", confidence=0.90) if ocr_text else None
 
81
  folio_label=folio_label,
82
  sequence=sequence,
83
  image={
84
+ "master": original_url or f"https://example.com/{folio_label}.jpg",
85
  "derivative_web": derivative_web or f"/data/deriv/{folio_label}.jpg",
86
  "thumbnail": f"/data/thumb/{folio_label}.jpg",
87
  "width": 1500,
 
195
 
196
  def test_generate_mets_objid(beatus_pages, beatus_meta):
197
  root = _parse(generate_mets(beatus_pages, beatus_meta))
198
+ objid = root.get("OBJID")
199
+ assert objid is not None, "OBJID attribute absent du root mets"
200
+ assert objid == "BnF-Latin-8878"
201
 
202
 
203
  def test_generate_mets_label(beatus_pages, beatus_meta):
backend/tests/test_image_pipeline.py CHANGED
@@ -278,7 +278,6 @@ def test_fetch_iiif_image_success():
278
  "+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
279
  ),
280
  "Accept": "image/jpeg,image/png,image/*,*/*",
281
- "Referer": "https://gallica.bnf.fr/",
282
  },
283
  follow_redirects=True,
284
  timeout=60.0,
 
278
  "+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
279
  ),
280
  "Accept": "image/jpeg,image/png,image/*,*/*",
 
281
  },
282
  follow_redirects=True,
283
  timeout=60.0,
backend/tests/test_job_runner.py CHANGED
@@ -142,16 +142,24 @@ def _page_master(page_id: str, ms_id: str) -> PageMaster:
142
 
143
 
144
  def _apply_success_mocks(monkeypatch, page_id: str, ms_id: str) -> None:
145
- """Applique les mocks IO pour un pipeline rΓ©ussi."""
 
 
 
 
146
  monkeypatch.setattr(
147
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
148
  )
149
  monkeypatch.setattr(
150
- job_runner_module, "run_primary_analysis",
151
  lambda **kw: _page_master(page_id, ms_id),
152
  )
153
- monkeypatch.setattr(job_runner_module, "generate_alto", lambda pm: "<alto/>")
154
- monkeypatch.setattr(job_runner_module, "write_alto", lambda xml, path: None)
 
 
 
 
155
 
156
 
157
  # ---------------------------------------------------------------------------
@@ -274,7 +282,7 @@ async def test_no_image_path_job_failed(db, setup_with_model, monkeypatch):
274
  s["page"].image_master_path = None
275
  await db.commit()
276
  monkeypatch.setattr(
277
- job_runner_module, "run_primary_analysis",
278
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
279
  )
280
 
@@ -291,7 +299,7 @@ async def test_no_image_path_page_error(db, setup_with_model, monkeypatch):
291
  s["page"].image_master_path = None
292
  await db.commit()
293
  monkeypatch.setattr(
294
- job_runner_module, "run_primary_analysis",
295
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
296
  )
297
 
@@ -343,7 +351,7 @@ async def test_primary_analysis_fails_job_failed(db, setup_with_model, monkeypat
343
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
344
  )
345
  monkeypatch.setattr(
346
- job_runner_module, "run_primary_analysis",
347
  lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
348
  )
349
 
@@ -361,7 +369,7 @@ async def test_primary_analysis_fails_page_error(db, setup_with_model, monkeypat
361
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
362
  )
363
  monkeypatch.setattr(
364
- job_runner_module, "run_primary_analysis",
365
  lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
366
  )
367
 
@@ -379,7 +387,7 @@ async def test_primary_analysis_error_message_stored(db, setup_with_model, monke
379
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
380
  )
381
  monkeypatch.setattr(
382
- job_runner_module, "run_primary_analysis",
383
  lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
384
  )
385
 
@@ -401,12 +409,14 @@ async def test_write_alto_fails_job_failed(db, setup_with_model, monkeypatch):
401
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
402
  )
403
  monkeypatch.setattr(
404
- job_runner_module, "run_primary_analysis",
405
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
406
  )
407
- monkeypatch.setattr(job_runner_module, "generate_alto", lambda pm: "<alto/>")
408
  monkeypatch.setattr(
409
- job_runner_module, "write_alto",
 
 
 
410
  lambda xml, path: (_ for _ in ()).throw(OSError("disk full")),
411
  )
412
 
@@ -424,12 +434,14 @@ async def test_write_alto_fails_page_error(db, setup_with_model, monkeypatch):
424
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
425
  )
426
  monkeypatch.setattr(
427
- job_runner_module, "run_primary_analysis",
428
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
429
  )
430
- monkeypatch.setattr(job_runner_module, "generate_alto", lambda pm: "<alto/>")
431
  monkeypatch.setattr(
432
- job_runner_module, "write_alto",
 
 
 
433
  lambda xml, path: (_ for _ in ()).throw(OSError("disk full")),
434
  )
435
 
@@ -534,7 +546,7 @@ async def test_corpus_runner_calls_execute_per_job(monkeypatch):
534
  return _FakeSession()
535
 
536
  monkeypatch.setattr(corpus_runner_module, "async_session_factory", _mock_factory)
537
- monkeypatch.setattr(corpus_runner_module, "execute_page_job", _mock_execute)
538
 
539
  await execute_corpus_job("corpus-xyz")
540
 
 
142
 
143
 
144
  def _apply_success_mocks(monkeypatch, page_id: str, ms_id: str) -> None:
145
+ """Applique les mocks IO pour un pipeline rΓ©ussi.
146
+
147
+ Les imports sont diffΓ©rΓ©s dans job_runner (lazy imports). On patche donc
148
+ les modules sources pour que le import dans la fonction cible récupère le mock.
149
+ """
150
  monkeypatch.setattr(
151
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
152
  )
153
  monkeypatch.setattr(
154
+ "app.services.ai.analyzer.run_primary_analysis",
155
  lambda **kw: _page_master(page_id, ms_id),
156
  )
157
+ monkeypatch.setattr(
158
+ "app.services.export.alto.generate_alto", lambda pm: "<alto/>"
159
+ )
160
+ monkeypatch.setattr(
161
+ "app.services.export.alto.write_alto", lambda xml, path: None
162
+ )
163
 
164
 
165
  # ---------------------------------------------------------------------------
 
282
  s["page"].image_master_path = None
283
  await db.commit()
284
  monkeypatch.setattr(
285
+ "app.services.ai.analyzer.run_primary_analysis",
286
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
287
  )
288
 
 
299
  s["page"].image_master_path = None
300
  await db.commit()
301
  monkeypatch.setattr(
302
+ "app.services.ai.analyzer.run_primary_analysis",
303
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
304
  )
305
 
 
351
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
352
  )
353
  monkeypatch.setattr(
354
+ "app.services.ai.analyzer.run_primary_analysis",
355
  lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
356
  )
357
 
 
369
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
370
  )
371
  monkeypatch.setattr(
372
+ "app.services.ai.analyzer.run_primary_analysis",
373
  lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
374
  )
375
 
 
387
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
388
  )
389
  monkeypatch.setattr(
390
+ "app.services.ai.analyzer.run_primary_analysis",
391
  lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
392
  )
393
 
 
409
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
410
  )
411
  monkeypatch.setattr(
412
+ "app.services.ai.analyzer.run_primary_analysis",
413
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
414
  )
 
415
  monkeypatch.setattr(
416
+ "app.services.export.alto.generate_alto", lambda pm: "<alto/>"
417
+ )
418
+ monkeypatch.setattr(
419
+ "app.services.export.alto.write_alto",
420
  lambda xml, path: (_ for _ in ()).throw(OSError("disk full")),
421
  )
422
 
 
434
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
435
  )
436
  monkeypatch.setattr(
437
+ "app.services.ai.analyzer.run_primary_analysis",
438
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
439
  )
 
440
  monkeypatch.setattr(
441
+ "app.services.export.alto.generate_alto", lambda pm: "<alto/>"
442
+ )
443
+ monkeypatch.setattr(
444
+ "app.services.export.alto.write_alto",
445
  lambda xml, path: (_ for _ in ()).throw(OSError("disk full")),
446
  )
447
 
 
546
  return _FakeSession()
547
 
548
  monkeypatch.setattr(corpus_runner_module, "async_session_factory", _mock_factory)
549
+ monkeypatch.setattr("app.services.job_runner.execute_page_job", _mock_execute)
550
 
551
  await execute_corpus_job("corpus-xyz")
552
 
backend/tests/test_security.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests de sΓ©curitΓ© β€” Sprint F1.
3
+
4
+ VΓ©rifie que toutes les vulnΓ©rabilitΓ©s identifiΓ©es sont corrigΓ©es :
5
+ - Path traversal sur profiles, slug, folio_label, frontend serving
6
+ - SSRF sur manifest_url
7
+ - Validation des entrΓ©es (taille, format)
8
+ """
9
+ # 1. stdlib
10
+ import pytest
11
+
12
+ # 2. third-party β€” fixtures API
13
+ from tests.conftest_api import async_client, db_session # noqa: F401
14
+
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Path traversal β€” profiles
18
+ # ---------------------------------------------------------------------------
19
+
20
+ @pytest.mark.asyncio
21
+ async def test_profile_path_traversal_dotdot(async_client):
22
+ """Un profile_id contenant '..' doit Γͺtre rejetΓ© (400)."""
23
+ resp = await async_client.get("/api/v1/profiles/..passwd")
24
+ assert resp.status_code == 400
25
+
26
+
27
+ @pytest.mark.asyncio
28
+ async def test_profile_path_traversal_slash(async_client):
29
+ """Un profile_id avec un slash (mΓͺme encodΓ©) doit Γͺtre rejetΓ© (400 ou 404)."""
30
+ # FastAPI normalise les chemins, donc un slash dans l'ID ne sera pas transmis.
31
+ # On teste avec un ID contenant des caractères spéciaux interdits.
32
+ resp = await async_client.get("/api/v1/profiles/UPPER_CASE")
33
+ assert resp.status_code == 400
34
+
35
+
36
+ @pytest.mark.asyncio
37
+ async def test_profile_path_traversal_special_chars(async_client):
38
+ """Un profile_id avec des caractΓ¨res spΓ©ciaux doit Γͺtre rejetΓ©."""
39
+ resp = await async_client.get("/api/v1/profiles/test@profile")
40
+ assert resp.status_code == 400
41
+
42
+
43
+ @pytest.mark.asyncio
44
+ async def test_profile_valid_id_not_found(async_client):
45
+ """Un profile_id valide mais inexistant retourne 404 (pas 400)."""
46
+ resp = await async_client.get("/api/v1/profiles/does-not-exist")
47
+ assert resp.status_code == 404
48
+
49
+
50
+ # ---------------------------------------------------------------------------
51
+ # Path traversal β€” corpus slug
52
+ # ---------------------------------------------------------------------------
53
+
54
+ @pytest.mark.asyncio
55
+ async def test_corpus_slug_path_traversal(async_client):
56
+ """Un slug avec ../ doit Γͺtre rejetΓ© par la validation Pydantic."""
57
+ resp = await async_client.post("/api/v1/corpora", json={
58
+ "slug": "../../malicious",
59
+ "title": "Test",
60
+ "profile_id": "medieval-illuminated",
61
+ })
62
+ assert resp.status_code == 422
63
+
64
+
65
+ @pytest.mark.asyncio
66
+ async def test_corpus_slug_with_spaces(async_client):
67
+ """Un slug avec des espaces doit Γͺtre rejetΓ©."""
68
+ resp = await async_client.post("/api/v1/corpora", json={
69
+ "slug": "my corpus",
70
+ "title": "Test",
71
+ "profile_id": "medieval-illuminated",
72
+ })
73
+ assert resp.status_code == 422
74
+
75
+
76
+ @pytest.mark.asyncio
77
+ async def test_corpus_slug_uppercase(async_client):
78
+ """Un slug avec des majuscules doit Γͺtre rejetΓ© (lowercase only)."""
79
+ resp = await async_client.post("/api/v1/corpora", json={
80
+ "slug": "MyCorpus",
81
+ "title": "Test",
82
+ "profile_id": "medieval-illuminated",
83
+ })
84
+ assert resp.status_code == 422
85
+
86
+
87
+ @pytest.mark.asyncio
88
+ async def test_corpus_slug_valid(async_client):
89
+ """Un slug valide doit Γͺtre acceptΓ©."""
90
+ resp = await async_client.post("/api/v1/corpora", json={
91
+ "slug": "my-corpus-01",
92
+ "title": "Test",
93
+ "profile_id": "medieval-illuminated",
94
+ })
95
+ assert resp.status_code == 201
96
+
97
+
98
+ @pytest.mark.asyncio
99
+ async def test_corpus_slug_empty(async_client):
100
+ """Un slug vide doit Γͺtre rejetΓ©."""
101
+ resp = await async_client.post("/api/v1/corpora", json={
102
+ "slug": "",
103
+ "title": "Test",
104
+ "profile_id": "medieval-illuminated",
105
+ })
106
+ assert resp.status_code == 422
107
+
108
+
109
+ @pytest.mark.asyncio
110
+ async def test_corpus_title_too_long(async_client):
111
+ """Un titre trop long (>256 chars) doit Γͺtre rejetΓ©."""
112
+ resp = await async_client.post("/api/v1/corpora", json={
113
+ "slug": "test-long",
114
+ "title": "x" * 300,
115
+ "profile_id": "medieval-illuminated",
116
+ })
117
+ assert resp.status_code == 422
118
+
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # SSRF β€” manifest_url
122
+ # ---------------------------------------------------------------------------
123
+
124
+ @pytest.mark.asyncio
125
+ async def test_ssrf_localhost(async_client):
126
+ """Un manifest_url pointant vers localhost doit Γͺtre rejetΓ©."""
127
+ # CrΓ©er un corpus d'abord
128
+ create = await async_client.post("/api/v1/corpora", json={
129
+ "slug": "ssrf-test", "title": "SSRF", "profile_id": "test",
130
+ })
131
+ cid = create.json()["id"]
132
+
133
+ resp = await async_client.post(f"/api/v1/corpora/{cid}/ingest/iiif-manifest", json={
134
+ "manifest_url": "http://localhost:8000/secret",
135
+ })
136
+ assert resp.status_code == 400
137
+ assert "interdit" in resp.json()["detail"].lower() or "localhost" in resp.json()["detail"].lower()
138
+
139
+
140
+ @pytest.mark.asyncio
141
+ async def test_ssrf_metadata_ip(async_client):
142
+ """Un manifest_url vers 169.254.x.x (cloud metadata) doit Γͺtre rejetΓ©."""
143
+ create = await async_client.post("/api/v1/corpora", json={
144
+ "slug": "ssrf-meta", "title": "SSRF", "profile_id": "test",
145
+ })
146
+ cid = create.json()["id"]
147
+
148
+ resp = await async_client.post(f"/api/v1/corpora/{cid}/ingest/iiif-manifest", json={
149
+ "manifest_url": "http://169.254.169.254/latest/meta-data/",
150
+ })
151
+ assert resp.status_code == 400
152
+
153
+
154
+ @pytest.mark.asyncio
155
+ async def test_ssrf_file_scheme(async_client):
156
+ """Un manifest_url avec file:// doit Γͺtre rejetΓ©."""
157
+ create = await async_client.post("/api/v1/corpora", json={
158
+ "slug": "ssrf-file", "title": "SSRF", "profile_id": "test",
159
+ })
160
+ cid = create.json()["id"]
161
+
162
+ resp = await async_client.post(f"/api/v1/corpora/{cid}/ingest/iiif-manifest", json={
163
+ "manifest_url": "file:///etc/passwd",
164
+ })
165
+ assert resp.status_code == 400
166
+
167
+
168
+ # ---------------------------------------------------------------------------
169
+ # Input validation β€” search
170
+ # ---------------------------------------------------------------------------
171
+
172
+ @pytest.mark.asyncio
173
+ async def test_search_query_too_long(async_client):
174
+ """Une requΓͺte de recherche >500 chars doit Γͺtre rejetΓ©e."""
175
+ resp = await async_client.get("/api/v1/search", params={"q": "x" * 501})
176
+ assert resp.status_code == 422
177
+
178
+
179
+ @pytest.mark.asyncio
180
+ async def test_search_query_max_length_ok(async_client):
181
+ """Une requΓͺte de recherche de 500 chars doit Γͺtre acceptΓ©e (0 rΓ©sultat)."""
182
+ resp = await async_client.get("/api/v1/search", params={"q": "x" * 500})
183
+ assert resp.status_code == 200
184
+
185
+
186
+ # ---------------------------------------------------------------------------
187
+ # Input validation β€” model selection
188
+ # ---------------------------------------------------------------------------
189
+
190
+ @pytest.mark.asyncio
191
+ async def test_model_id_too_long(async_client):
192
+ """Un model_id >256 chars doit Γͺtre rejetΓ©."""
193
+ create = await async_client.post("/api/v1/corpora", json={
194
+ "slug": "model-test", "title": "T", "profile_id": "test",
195
+ })
196
+ cid = create.json()["id"]
197
+
198
+ resp = await async_client.put(f"/api/v1/corpora/{cid}/model", json={
199
+ "model_id": "x" * 300,
200
+ "provider_type": "google_ai_studio",
201
+ })
202
+ assert resp.status_code == 422
203
+
204
+
205
+ # ---------------------------------------------------------------------------
206
+ # Input validation β€” corrections
207
+ # ---------------------------------------------------------------------------
208
+
209
+ @pytest.mark.asyncio
210
+ async def test_corrections_restore_negative_version(async_client):
211
+ """restore_to_version < 1 doit Γͺtre rejetΓ©."""
212
+ resp = await async_client.post("/api/v1/pages/fake-page/corrections", json={
213
+ "restore_to_version": 0,
214
+ })
215
+ assert resp.status_code == 422
frontend/src/App.tsx CHANGED
@@ -42,6 +42,7 @@ export default function App() {
42
  onOpenManuscript={(manuscriptId, profileId) =>
43
  setView({ name: 'reader', manuscriptId, profileId })
44
  }
 
45
  onAdmin={() => setView({ name: 'admin' })}
46
  />
47
  )
 
42
  onOpenManuscript={(manuscriptId, profileId) =>
43
  setView({ name: 'reader', manuscriptId, profileId })
44
  }
45
+ onOpenPage={(pageId) => setView({ name: 'editor', pageId })}
46
  onAdmin={() => setView({ name: 'admin' })}
47
  />
48
  )
frontend/src/lib/api.ts CHANGED
@@ -1,5 +1,13 @@
1
  const BASE_URL: string = import.meta.env.VITE_API_URL ?? ''
2
 
 
 
 
 
 
 
 
 
3
  // ── Types ─────────────────────────────────────────────────────────────────────
4
 
5
  export interface ProviderInfo {
 
1
  const BASE_URL: string = import.meta.env.VITE_API_URL ?? ''
2
 
3
+ if (!BASE_URL && import.meta.env.PROD) {
4
+ console.warn(
5
+ '[Scriptorium] VITE_API_URL non dΓ©fini en production. ' +
6
+ 'Les appels API utiliseront des chemins relatifs, ce qui peut Γ©chouer ' +
7
+ 'si le frontend n\'est pas servi par le mΓͺme domaine que le backend.'
8
+ )
9
+ }
10
+
11
  // ── Types ─────────────────────────────────────────────────────────────────────
12
 
13
  export interface ProviderInfo {
frontend/src/pages/Editor.tsx CHANGED
@@ -119,7 +119,7 @@ export default function Editor({ pageId, onBack }: Props) {
119
  return <div className="p-8 text-red-600">Erreur : {error}</div>
120
  }
121
 
122
- const imageUrl = master ? '' : '' // image path not directly stored on PageMaster
123
  const regions = master?.layout?.regions ?? []
124
 
125
  return (
 
119
  return <div className="p-8 text-red-600">Erreur : {error}</div>
120
  }
121
 
122
+ const imageUrl = master?.image?.derivative_web ?? master?.image?.master ?? ''
123
  const regions = master?.layout?.regions ?? []
124
 
125
  return (
frontend/src/pages/Home.tsx CHANGED
@@ -10,10 +10,11 @@ import {
10
 
11
  interface Props {
12
  onOpenManuscript: (manuscriptId: string, profileId: string) => void
 
13
  onAdmin: () => void
14
  }
15
 
16
- export default function Home({ onOpenManuscript, onAdmin }: Props) {
17
  const [corpora, setCorpora] = useState<Corpus[]>([])
18
  const [loading, setLoading] = useState(true)
19
  const [error, setError] = useState<string | null>(null)
@@ -73,7 +74,7 @@ export default function Home({ onOpenManuscript, onAdmin }: Props) {
73
  </p>
74
  </div>
75
  <div className="flex items-center gap-4">
76
- <SearchBar />
77
  <AdminNav onClick={onAdmin} />
78
  </div>
79
  </header>
 
10
 
11
  interface Props {
12
  onOpenManuscript: (manuscriptId: string, profileId: string) => void
13
+ onOpenPage?: (pageId: string) => void
14
  onAdmin: () => void
15
  }
16
 
17
+ export default function Home({ onOpenManuscript, onOpenPage, onAdmin }: Props) {
18
  const [corpora, setCorpora] = useState<Corpus[]>([])
19
  const [loading, setLoading] = useState(true)
20
  const [error, setError] = useState<string | null>(null)
 
74
  </p>
75
  </div>
76
  <div className="flex items-center gap-4">
77
+ <SearchBar onSelectResult={onOpenPage ? (r) => onOpenPage(r.page_id) : undefined} />
78
  <AdminNav onClick={onAdmin} />
79
  </div>
80
  </header>
infra/Dockerfile DELETED
@@ -1,71 +0,0 @@
1
- # Scriptorium AI β€” image de production (multi-stage)
2
- # Ce fichier est la copie exacte de Dockerfile (racine).
3
- # Build depuis la racine du dΓ©pΓ΄t :
4
- # docker build -f infra/Dockerfile -t scriptorium-ai .
5
- #
6
- # Structure attendue dans l'image :
7
- # /app/backend/app/ ← source Python (importable via PYTHONPATH)
8
- # /app/profiles/ ← profils JSON
9
- # /app/prompts/ ← templates de prompts
10
- # /app/static/ ← frontend React buildΓ©
11
- # /app/data/ ← créé vide ; Γ  monter en volume pour les artefacts
12
-
13
- # ── Stage 1 : build du frontend React ────────────────────────────────────────
14
- FROM node:20-slim AS frontend-builder
15
-
16
- WORKDIR /frontend
17
-
18
- # Installer les dΓ©pendances (cache layer sΓ©parΓ©)
19
- COPY frontend/package.json ./
20
- RUN npm install
21
-
22
- # Copier les sources et builder
23
- COPY frontend/ ./
24
- RUN npm run build
25
-
26
- # ── Stage 2 : image Python finale ────────────────────────────────────────────
27
- FROM python:3.11-slim
28
-
29
- WORKDIR /app
30
-
31
- # ── DΓ©pendances Python ─────────────────────────────────────────────────────
32
- # On copie uniquement pyproject.toml pour exploiter le cache de layers Docker.
33
- # Un stub app/__init__.py satisfait setuptools (discover packages) sans avoir
34
- # besoin de copier tout le code source Γ  ce stade.
35
- COPY backend/pyproject.toml /tmp/build/
36
- RUN mkdir -p /tmp/build/app \
37
- && touch /tmp/build/app/__init__.py \
38
- && pip install --no-cache-dir --upgrade /tmp/build/ \
39
- && rm -rf /tmp/build
40
-
41
- # ── Layer dΓ©diΓ© mistralai β€” invalide le cache HF si v0.x est prΓ©sent ─────
42
- # Layer sΓ©parΓ© de l'install principal pour forcer la mise Γ  jour mΓͺme si
43
- # HuggingFace rΓ©utilise le layer pyproject.toml depuis un build antΓ©rieur.
44
- RUN pip install --no-cache-dir 'mistralai>=1.0,<2.0'
45
-
46
- # ── Code source backend ────────────────────────────────────────────────────
47
- COPY backend/app ./backend/app
48
- COPY profiles/ ./profiles/
49
- COPY prompts/ ./prompts/
50
-
51
- # ── Frontend buildΓ© ────────────────────────────────────────────────────────
52
- COPY --from=frontend-builder /frontend/dist ./static
53
-
54
- # ── RΓ©pertoire des artefacts (vide dans l'image ; montΓ© en volume) ─────────
55
- RUN mkdir -p /app/data
56
-
57
- # ── Secrets Google AI : JAMAIS dans l'image (R06) ─────────────────────────
58
- # Passer au runtime via -e ou docker-compose environment :
59
- # AI_PROVIDER, GOOGLE_AI_STUDIO_API_KEY, GOOGLE_AI_API_KEY,
60
- # GOOGLE_VERTEX_PROJECT, GOOGLE_VERTEX_LOCATION
61
-
62
- # PYTHONPATH permet l'import `app.main:app` depuis /app/backend/app/
63
- ENV PYTHONPATH=/app/backend
64
- ENV PROFILES_DIR=/app/profiles
65
- ENV PROMPTS_DIR=/app/prompts
66
- ENV DATA_DIR=/app/data
67
-
68
- EXPOSE 7860
69
-
70
- # 1 worker au MVP β€” pas de Gunicorn, pas de multiprocessing
71
- CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]