diff --git a/Dockerfile b/Dockerfile
index 1b15a20031d0a5e87c3cbb1a7c8699689bb4253e..5e0a55d99a6e0d73b0a9dafece3f2a875bd9541e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
# Scriptorium AI — image de production (multi-stage)
# Ce fichier est utilisé par HuggingFace Spaces (SDK docker, détection automatique).
-# Il doit rester synchronisé avec infra/Dockerfile.
+# Source unique — le fichier infra/Dockerfile a été supprimé pour éviter la divergence.
#
# Build depuis la racine du dépôt :
# docker build -t scriptorium-ai .
diff --git a/backend/app/api/v1/corpora.py b/backend/app/api/v1/corpora.py
index a1cb10988c7f4258bce239b1f94a48d3855645fd..d0f56bedbf83c8a0e69ff658ae4e6efbebefe30b 100644
--- a/backend/app/api/v1/corpora.py
+++ b/backend/app/api/v1/corpora.py
@@ -14,8 +14,8 @@ import uuid
from datetime import datetime, timezone
# 2. third-party
-from fastapi import APIRouter, Depends, HTTPException
-from pydantic import BaseModel, ConfigDict
+from fastapi import APIRouter, Depends, HTTPException, Query
+from pydantic import BaseModel, ConfigDict, Field
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
@@ -29,9 +29,9 @@ router = APIRouter(prefix="/corpora", tags=["corpora"])
# ── Schémas de requête / réponse ─────────────────────────────────────────────
class CorpusCreate(BaseModel):
- slug: str
- title: str
- profile_id: str
+ slug: str = Field(..., pattern=r"^[a-z0-9][a-z0-9_-]{0,63}$")
+ title: str = Field(..., min_length=1, max_length=256)
+ profile_id: str = Field(..., pattern=r"^[a-z0-9][a-z0-9_-]*$")
class CorpusResponse(BaseModel):
@@ -59,9 +59,13 @@ class ManuscriptResponse(BaseModel):
# ── Endpoints ────────────────────────────────────────────────────────────────
@router.get("", response_model=list[CorpusResponse])
-async def list_corpora(db: AsyncSession = Depends(get_db)) -> list[CorpusModel]:
- """Retourne tous les corpus enregistrés."""
- result = await db.execute(select(CorpusModel))
+async def list_corpora(
+ db: AsyncSession = Depends(get_db),
+ skip: int = Query(0, ge=0, description="Nombre d'éléments à sauter"),
+ limit: int = Query(100, ge=1, le=1000, description="Nombre maximum d'éléments"),
+) -> list[CorpusModel]:
+ """Retourne les corpus enregistrés (paginé)."""
+ result = await db.execute(select(CorpusModel).offset(skip).limit(limit))
return list(result.scalars().all())
diff --git a/backend/app/api/v1/export.py b/backend/app/api/v1/export.py
index b93aecc3171286920861c174d79fb5df2440040c..f044b98ab978d0146df2770d84eff6c402a76ac7 100644
--- a/backend/app/api/v1/export.py
+++ b/backend/app/api/v1/export.py
@@ -10,6 +10,7 @@ Règle (R02) : toutes les sorties sont générées depuis les PageMasters
(master.json), jamais depuis les réponses brutes de l'IA.
"""
# 1. stdlib
+import asyncio
import io
import json
import logging
@@ -66,7 +67,7 @@ async def _load_manuscript_with_masters(
masters: list[PageMaster] = []
for page in pages:
- master = _read_master_json(corpus.slug, page.id)
+ master = await _read_master_json(corpus.slug, page.id)
if master is not None:
masters.append(master)
@@ -79,8 +80,8 @@ async def _load_manuscript_with_masters(
return manuscript, corpus, masters
-def _read_master_json(corpus_slug: str, page_id: str) -> PageMaster | None:
- """Lit le master.json d'une page depuis data/. Retourne None si absent."""
+def _read_master_json_sync(corpus_slug: str, page_id: str) -> PageMaster | None:
+ """Lit le master.json d'une page depuis data/. Retourne None si absent (bloquant)."""
path = (
_config_module.settings.data_dir
/ "corpora"
@@ -95,6 +96,11 @@ def _read_master_json(corpus_slug: str, page_id: str) -> PageMaster | None:
return PageMaster.model_validate(raw)
+async def _read_master_json(corpus_slug: str, page_id: str) -> PageMaster | None:
+ """Version async — délègue la lecture au threadpool."""
+ return await asyncio.to_thread(_read_master_json_sync, corpus_slug, page_id)
+
+
def _build_manuscript_meta(
manuscript: ManuscriptModel, corpus: CorpusModel
) -> dict:
@@ -154,7 +160,7 @@ async def get_alto(page_id: str, db: AsyncSession = Depends(get_db)) -> Response
manuscript = await db.get(ManuscriptModel, page.manuscript_id)
corpus = await db.get(CorpusModel, manuscript.corpus_id)
- master = _read_master_json(corpus.slug, page_id)
+ master = await _read_master_json(corpus.slug, page_id)
if master is None:
raise HTTPException(
status_code=404,
diff --git a/backend/app/api/v1/ingest.py b/backend/app/api/v1/ingest.py
index d0be5a113df9466e0d8d43c9ba23bcfcb6c69fce..e7d054c02b009ccccc5ef0dd8dfeeda6f7392e9a 100644
--- a/backend/app/api/v1/ingest.py
+++ b/backend/app/api/v1/ingest.py
@@ -11,13 +11,14 @@ Règle : ingestion = création des PageModel en BDD uniquement.
"""
# 1. stdlib
import logging
+import re
import uuid
from pathlib import Path
# 2. third-party
import httpx
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
@@ -30,6 +31,28 @@ logger = logging.getLogger(__name__)
router = APIRouter(tags=["ingestion"])
+# ── Constantes de sécurité ────────────────────────────────────────────────────
+
+_SAFE_LABEL_RE = re.compile(r"^[\w\-\.]+$")
+_MAX_UPLOAD_BYTES = 100 * 1024 * 1024 # 100 Mo par fichier
+_ALLOWED_MIME_PREFIXES = ("image/",)
+
+
+def _sanitize_label(label: str) -> str:
+ """Nettoie un folio_label : garde uniquement alphanum, -, _, ."""
+ clean = Path(label).name # retire tout chemin
+ if not _SAFE_LABEL_RE.match(clean) or not clean:
+ clean = re.sub(r"[^\w\-\.]", "_", clean) or "page"
+ return clean
+
+
+def _sanitize_filename(name: str) -> str:
+ """Nettoie un nom de fichier uploadé : garde uniquement le basename sûr."""
+ clean = Path(name).name
+ if not _SAFE_LABEL_RE.match(clean) or not clean:
+ clean = f"{uuid.uuid4().hex[:12]}.bin"
+ return clean
+
# ── Schémas ───────────────────────────────────────────────────────────────────
@@ -38,8 +61,8 @@ class IIIFManifestRequest(BaseModel):
class IIIFImagesRequest(BaseModel):
- urls: list[str]
- folio_labels: list[str]
+ urls: list[str] = Field(..., max_length=5000)
+ folio_labels: list[str] = Field(..., max_length=5000)
class IngestResponse(BaseModel):
@@ -144,11 +167,31 @@ _MANIFEST_HEADERS = {
}
+_MAX_MANIFEST_BYTES = 10 * 1024 * 1024 # 10 Mo max pour un manifest JSON
+
+
+def _validate_url(url: str) -> None:
+ """Rejette les URLs non-HTTP et les cibles réseau privé (SSRF)."""
+ from urllib.parse import urlparse
+
+ parsed = urlparse(url)
+ if parsed.scheme not in ("http", "https"):
+ raise ValueError(f"Schéma non autorisé : {parsed.scheme!r}")
+ host = (parsed.hostname or "").lower()
+ # Bloquer les adresses privées / locales
+    blocked = ("localhost", "127.0.0.1", "0.0.0.0", "::1", "metadata.google.internal")
+ if host in blocked or host.startswith("169.254.") or host.startswith("10.") or host.startswith("192.168."):
+ raise ValueError(f"Hôte interdit : {host}")
+
+
async def _fetch_json_manifest(url: str) -> dict:
- """Télécharge un manifest IIIF. Fonction isolée pour faciliter les tests."""
+ """Télécharge un manifest IIIF avec protections SSRF + taille max."""
+ _validate_url(url)
async with httpx.AsyncClient() as client:
resp = await client.get(url, headers=_MANIFEST_HEADERS, follow_redirects=True, timeout=30.0)
resp.raise_for_status()
+ if len(resp.content) > _MAX_MANIFEST_BYTES:
+ raise ValueError(f"Manifest trop volumineux ({len(resp.content)} octets)")
return resp.json()
@@ -202,16 +245,33 @@ async def ingest_files(
seq = await _next_sequence(db, ms.id)
# Collect labels and detect duplicates
- labels = [Path(f.filename or f"file_{i}").stem for i, f in enumerate(files)]
+ labels = [_sanitize_label(Path(f.filename or f"file_{i}").stem) for i, f in enumerate(files)]
dupes = _find_duplicate_labels(labels)
created: list[PageModel] = []
+ written_files: list[Path] = []
skipped = 0
for i, upload in enumerate(files):
- filename = Path(upload.filename or f"file_{i}").name
+ # Validation MIME type
+ ctype = upload.content_type or ""
+ if not any(ctype.startswith(p) for p in _ALLOWED_MIME_PREFIXES):
+ raise HTTPException(
+ status_code=422,
+ detail=f"Type MIME non autorisé : {ctype!r}. Seules les images sont acceptées.",
+ )
+
+ filename = _sanitize_filename(upload.filename or f"file_{i}.bin")
folio_label = labels[i]
page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
+ content = await upload.read()
+ # Validation taille
+ if len(content) > _MAX_UPLOAD_BYTES:
+ raise HTTPException(
+ status_code=413,
+ detail=f"Fichier trop volumineux ({len(content)} octets). Maximum : {_MAX_UPLOAD_BYTES}.",
+ )
+
master_dir = (
_config_module.settings.data_dir
/ "corpora"
@@ -221,8 +281,8 @@ async def ingest_files(
)
master_dir.mkdir(parents=True, exist_ok=True)
master_path = master_dir / filename
- content = await upload.read()
master_path.write_bytes(content)
+ written_files.append(master_path)
page = await _create_page(
db, ms.id, page_id, folio_label, seq + i,
@@ -234,7 +294,13 @@ async def ingest_files(
created.append(page)
ms.total_pages = (ms.total_pages or 0) + len(created)
- await db.commit()
+ try:
+ await db.commit()
+ except Exception:
+ # Nettoyage des fichiers orphelins si le commit BDD échoue
+ for f in written_files:
+ f.unlink(missing_ok=True)
+ raise
logger.info(
"Fichiers ingérés",
@@ -260,6 +326,8 @@ async def ingest_iiif_manifest(
try:
manifest = await _fetch_json_manifest(body.manifest_url)
+ except ValueError as exc:
+ raise HTTPException(status_code=400, detail=str(exc))
except httpx.HTTPStatusError as exc:
raise HTTPException(
status_code=502,
@@ -302,7 +370,7 @@ async def ingest_iiif_manifest(
seq = await _next_sequence(db, ms.id)
# Collect labels and detect duplicates
- labels = [_extract_canvas_label(canvas, i) for i, canvas in enumerate(canvases)]
+ labels = [_sanitize_label(_extract_canvas_label(canvas, i)) for i, canvas in enumerate(canvases)]
dupes = _find_duplicate_labels(labels)
created: list[PageModel] = []
@@ -358,11 +426,12 @@ async def ingest_iiif_images(
ms = await _get_or_create_manuscript(db, corpus_id)
seq = await _next_sequence(db, ms.id)
- dupes = _find_duplicate_labels(body.folio_labels)
+ sanitized_labels = [_sanitize_label(lbl) for lbl in body.folio_labels]
+ dupes = _find_duplicate_labels(sanitized_labels)
created: list[PageModel] = []
skipped = 0
- for i, (url, folio_label) in enumerate(zip(body.urls, body.folio_labels)):
+ for i, (url, folio_label) in enumerate(zip(body.urls, sanitized_labels)):
page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
page = await _create_page(
db, ms.id, page_id, folio_label, seq + i,
diff --git a/backend/app/api/v1/jobs.py b/backend/app/api/v1/jobs.py
index c0330d1a408c5336554f0adfafc0eee18dbbf93e..1aae41bbf0b052ae177313b9c14080d0d7a77cf9 100644
--- a/backend/app/api/v1/jobs.py
+++ b/backend/app/api/v1/jobs.py
@@ -22,8 +22,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.models.corpus import CorpusModel, ManuscriptModel, PageModel
from app.models.database import get_db
from app.models.job import JobModel
-from app.services.corpus_runner import execute_corpus_job
-from app.services.job_runner import execute_page_job
router = APIRouter(tags=["jobs"])
@@ -101,6 +99,8 @@ async def run_corpus(
await db.commit()
# Lancer le pipeline en arrière-plan (après envoi de la réponse)
+ from app.services.corpus_runner import execute_corpus_job
+
background_tasks.add_task(execute_corpus_job, corpus_id)
return CorpusRunResponse(
@@ -135,6 +135,8 @@ async def run_page(
await db.refresh(job)
# Lancer le pipeline en arrière-plan (après envoi de la réponse)
+ from app.services.job_runner import execute_page_job
+
background_tasks.add_task(execute_page_job, job.id)
return job
@@ -175,6 +177,8 @@ async def retry_job(
await db.refresh(job)
# Relancer le pipeline
+ from app.services.job_runner import execute_page_job
+
background_tasks.add_task(execute_page_job, job.id)
return job
diff --git a/backend/app/api/v1/models_api.py b/backend/app/api/v1/models_api.py
index ac4614c31fd74afceb1fa3f0dc00e2b93d909686..3960f64586dca0399241a3be5a14eef4297e3804 100644
--- a/backend/app/api/v1/models_api.py
+++ b/backend/app/api/v1/models_api.py
@@ -17,7 +17,7 @@ from datetime import datetime, timezone
# 2. third-party
from fastapi import APIRouter, Depends, HTTPException
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
from sqlalchemy.ext.asyncio import AsyncSession
# 3. local
@@ -25,11 +25,6 @@ from app.models.corpus import CorpusModel
from app.models.database import get_db
from app.models.model_config_db import ModelConfigDB
from app.schemas.model_config import ProviderType
-from app.services.ai.model_registry import (
- get_available_providers,
- list_all_models,
- list_models_for_provider,
-)
logger = logging.getLogger(__name__)
@@ -47,9 +42,9 @@ class ProviderInfo(BaseModel):
class ModelSelectRequest(BaseModel):
- model_id: str
- provider_type: str
- display_name: str = ""
+ model_id: str = Field(..., min_length=1, max_length=256)
+ provider_type: str = Field(..., min_length=1, max_length=64)
+ display_name: str = Field("", max_length=256)
class ModelConfigResponse(BaseModel):
@@ -77,6 +72,8 @@ async def list_providers() -> list[dict]:
Un provider est disponible si la variable d'environnement correspondante
est présente dans les secrets HuggingFace. Aucune clé n'est exposée.
"""
+ from app.services.ai.model_registry import get_available_providers
+
return get_available_providers()
@@ -91,6 +88,8 @@ async def get_provider_models(provider_type: str) -> list[dict]:
detail=f"Provider inconnu : {provider_type}. "
f"Valeurs acceptées : {[p.value for p in ProviderType]}",
)
+ from app.services.ai.model_registry import list_models_for_provider
+
try:
models = list_models_for_provider(ptype)
except RuntimeError as exc:
@@ -104,6 +103,8 @@ async def get_provider_models(provider_type: str) -> list[dict]:
@router.post("/models/refresh", response_model=ModelsRefreshResponse)
async def refresh_models() -> ModelsRefreshResponse:
"""Force la mise à jour de la liste agrégée de tous les modèles disponibles."""
+ from app.services.ai.model_registry import list_all_models
+
models = list_all_models()
return ModelsRefreshResponse(
models=[m.model_dump() for m in models],
diff --git a/backend/app/api/v1/pages.py b/backend/app/api/v1/pages.py
index c82ccfa3e6055a30c6cecc5780ac9203e6a27a22..e4413fd83ffb1671b3fa556ad5df3d0c7be31f9a 100644
--- a/backend/app/api/v1/pages.py
+++ b/backend/app/api/v1/pages.py
@@ -18,7 +18,7 @@ from typing import Any
# 2. third-party
from fastapi import APIRouter, Depends, HTTPException
-from pydantic import BaseModel, ConfigDict, ValidationError
+from pydantic import BaseModel, ConfigDict, Field, ValidationError
from sqlalchemy.ext.asyncio import AsyncSession
# 3. local
@@ -42,12 +42,12 @@ class CorrectionsRequest(BaseModel):
indiquée est restaurée (avec incrémentation de editorial.version).
"""
- ocr_diplomatic_text: str | None = None
- editorial_status: str | None = None
- commentary_public: str | None = None
- commentary_scholarly: str | None = None
+ ocr_diplomatic_text: str | None = Field(None, max_length=500_000)
+ editorial_status: str | None = Field(None, max_length=50)
+ commentary_public: str | None = Field(None, max_length=100_000)
+ commentary_scholarly: str | None = Field(None, max_length=100_000)
region_validations: dict[str, str] | None = None
- restore_to_version: int | None = None
+ restore_to_version: int | None = Field(None, ge=1)
class VersionInfo(BaseModel):
diff --git a/backend/app/api/v1/profiles.py b/backend/app/api/v1/profiles.py
index 0f417bd9b7b595d3d81dea243daed6ab709e106f..9c9f119816172f4e0b23df769675660a2374984c 100644
--- a/backend/app/api/v1/profiles.py
+++ b/backend/app/api/v1/profiles.py
@@ -8,8 +8,10 @@ Les profils sont des fichiers JSON dans profiles/ (racine du dépôt).
Ils sont validés par CorpusProfile avant d'être retournés.
"""
# 1. stdlib
+import asyncio
import json
import logging
+import re
from pathlib import Path
# 2. third-party
@@ -49,21 +51,36 @@ async def list_profiles() -> list[dict]:
if not settings.profiles_dir.is_dir():
logger.warning("profiles_dir introuvable : %s", settings.profiles_dir)
return []
- profiles = []
- for path in sorted(settings.profiles_dir.glob("*.json")):
- profile = _load_profile(path)
- if profile is not None:
- profiles.append(profile.model_dump())
- return profiles
+
+ def _scan_profiles() -> list[dict]:
+ result = []
+ for path in sorted(settings.profiles_dir.glob("*.json")):
+ profile = _load_profile(path)
+ if profile is not None:
+ result.append(profile.model_dump())
+ return result
+
+ return await asyncio.to_thread(_scan_profiles)
+
+
+_SAFE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9_-]*$")
@router.get("/{profile_id}", response_model=dict)
async def get_profile(profile_id: str) -> dict:
"""Retourne un profil par son id (nom du fichier sans extension)."""
+ if not _SAFE_ID_RE.match(profile_id):
+ raise HTTPException(status_code=400, detail="profile_id invalide")
path = settings.profiles_dir / f"{profile_id}.json"
- if not path.exists():
+
+ def _read() -> CorpusProfile | None:
+ if not path.exists():
+ return None
+ return _load_profile(path)
+
+ profile = await asyncio.to_thread(_read)
+ if profile is None and not path.exists():
raise HTTPException(status_code=404, detail="Profil introuvable")
- profile = _load_profile(path)
if profile is None:
raise HTTPException(status_code=422, detail="Profil invalide")
return profile.model_dump()
diff --git a/backend/app/api/v1/search.py b/backend/app/api/v1/search.py
index 635ed605a3a9b66253cdb266f63bfb7ac3798c64..4d730b6900642d0449c2d5a45b4a9ebb216ab898 100644
--- a/backend/app/api/v1/search.py
+++ b/backend/app/api/v1/search.py
@@ -7,6 +7,7 @@ Implémentation MVP : scan des fichiers master.json (pas d'index externe).
Insensible à la casse et aux accents (unicodedata NFD + ASCII).
"""
# 1. stdlib
+import asyncio
import json
import logging
import unicodedata
@@ -95,7 +96,8 @@ def _score_master(data: dict, query_normalized: str) -> tuple[int, str]:
@router.get("/search", response_model=list[SearchResult])
async def search_pages(
- q: str = Query(..., min_length=2, description="Requête de recherche (min. 2 caractères)"),
+ q: str = Query(..., min_length=2, max_length=500, description="Requête de recherche (2–500 caractères)"),
+ limit: int = Query(200, ge=1, le=2000, description="Nombre maximum de résultats"),
) -> list[SearchResult]:
"""Recherche plein texte dans les master.json de tous les corpus.
@@ -106,29 +108,32 @@ async def search_pages(
query_normalized = _normalize(q.strip())
data_dir = _config_module.settings.data_dir
- results: list[SearchResult] = []
-
- for master_path in data_dir.glob("corpora/*/pages/*/master.json"):
- try:
- raw: dict = json.loads(master_path.read_text(encoding="utf-8"))
- except (json.JSONDecodeError, OSError):
- continue
-
- score, excerpt = _score_master(raw, query_normalized)
- if score == 0:
- continue
-
- results.append(
- SearchResult(
- page_id=raw.get("page_id", ""),
- folio_label=raw.get("folio_label", ""),
- manuscript_id=raw.get("manuscript_id", ""),
- excerpt=excerpt,
- score=score,
- corpus_profile=raw.get("corpus_profile", ""),
+ def _scan() -> list[SearchResult]:
+ """Scan bloquant exécuté dans un thread dédié."""
+ hits: list[SearchResult] = []
+ for master_path in data_dir.glob("corpora/*/pages/*/master.json"):
+ try:
+ raw: dict = json.loads(master_path.read_text(encoding="utf-8"))
+ except (json.JSONDecodeError, OSError):
+ continue
+
+ score, excerpt = _score_master(raw, query_normalized)
+ if score == 0:
+ continue
+
+ hits.append(
+ SearchResult(
+ page_id=raw.get("page_id", ""),
+ folio_label=raw.get("folio_label", ""),
+ manuscript_id=raw.get("manuscript_id", ""),
+ excerpt=excerpt,
+ score=score,
+ corpus_profile=raw.get("corpus_profile", ""),
+ )
)
- )
+ hits.sort(key=lambda r: r.score, reverse=True)
+ return hits
- results.sort(key=lambda r: r.score, reverse=True)
+ results = await asyncio.to_thread(_scan)
logger.info("Recherche exécutée", extra={"q": q, "results": len(results)})
- return results
+ return results[:limit]
diff --git a/backend/app/config.py b/backend/app/config.py
index 80c735f6bfe81d1e26ccb10b1d9a0e52dd292d5e..0e62a54956cf0eb7a87f151ff5800256008276db 100644
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -1,17 +1,17 @@
"""
Configuration globale de la plateforme, chargée depuis les variables d'environnement.
-Équivalent fonctionnel de pydantic-settings sans dépendance externe :
- - les valeurs sont lues depuis os.environ au moment de l'instanciation
+Utilise pydantic-settings (CLAUDE.md §2, §7) :
+ - les valeurs sont lues depuis os.environ / fichier .env au moment de l'instanciation
- l'objet `settings` est importé partout dans l'application
- dans les tests : monkeypatch.setattr(config, "settings", ...) pour surcharger
"""
# 1. stdlib
-import os
from pathlib import Path
# 2. third-party
-from pydantic import BaseModel, ConfigDict
+from pydantic import ConfigDict
+from pydantic_settings import BaseSettings
# Racine du dépôt — résolue depuis l'emplacement absolu de ce fichier.
# config.py se trouve dans backend/app/ ; 3 parents remontent à la racine.
@@ -19,14 +19,17 @@ from pydantic import BaseModel, ConfigDict
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
-class Settings(BaseModel):
+class Settings(BaseSettings):
"""Paramètres d'application lus depuis les variables d'environnement.
Toutes les clés API sont optionnelles (None si non configurées).
Elles ne sont jamais loguées ni exportées (R06).
"""
- model_config = ConfigDict(frozen=False)
+ model_config = ConfigDict(
+ env_file=".env",
+ extra="ignore",
+ )
# ── Serveur ──────────────────────────────────────────────────────────────
base_url: str = "http://localhost:8000"
@@ -50,21 +53,4 @@ class Settings(BaseModel):
mistral_api_key: str | None = None
-def _load_settings() -> Settings:
- """Lit les variables d'environnement et construit l'objet Settings."""
- return Settings(
- base_url=os.getenv("BASE_URL", "http://localhost:8000"),
- data_dir=Path(os.getenv("DATA_DIR", "data")),
- profiles_dir=Path(os.getenv("PROFILES_DIR", str(_REPO_ROOT / "profiles"))),
- prompts_dir=Path(os.getenv("PROMPTS_DIR", str(_REPO_ROOT / "prompts"))),
- database_url=os.getenv(
- "DATABASE_URL", "sqlite+aiosqlite:///./scriptorium.db"
- ),
- google_ai_studio_api_key=os.getenv("GOOGLE_AI_STUDIO_API_KEY"),
- vertex_api_key=os.getenv("VERTEX_API_KEY"),
- vertex_service_account_json=os.getenv("VERTEX_SERVICE_ACCOUNT_JSON"),
- mistral_api_key=os.getenv("MISTRAL_API_KEY"),
- )
-
-
-settings: Settings = _load_settings()
+settings: Settings = Settings()
diff --git a/backend/app/main.py b/backend/app/main.py
index 798618c3d67513fde17fdc4dd4700baca676b585..ab052ec6fe904155f38142c239934f966ecab42a 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -65,11 +65,11 @@ app = FastAPI(
lifespan=lifespan,
)
-# ── CORS (dev : tous les origines autorisés) ──────────────────────────────────
+# ── CORS (dev : toutes les origines autorisées, sans credentials) ──────────────
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
- allow_credentials=True,
+ allow_credentials=False,
allow_methods=["*"],
allow_headers=["*"],
)
@@ -97,8 +97,9 @@ async def serve_frontend(full_path: str) -> FileResponse | RedirectResponse:
if full_path.startswith("api/"):
raise HTTPException(status_code=404, detail=f"Endpoint not found: /{full_path}")
if _STATIC_DIR.is_dir():
- candidate = _STATIC_DIR / full_path
- if candidate.is_file():
+ candidate = (_STATIC_DIR / full_path).resolve()
+ # Empêcher le path traversal : le fichier résolu doit être sous _STATIC_DIR
+ if candidate.is_file() and str(candidate).startswith(str(_STATIC_DIR.resolve())):
return FileResponse(candidate)
index = _STATIC_DIR / "index.html"
if index.exists():
diff --git a/backend/app/models/corpus.py b/backend/app/models/corpus.py
index e8a480594114c28bf31c963a998d421df9870a9a..727dac66102f161a7e789f45571328808eb78a3f 100644
--- a/backend/app/models/corpus.py
+++ b/backend/app/models/corpus.py
@@ -6,6 +6,7 @@ Ils NE se substituent PAS aux schémas Pydantic (source canonique des types).
"""
# 1. stdlib
from datetime import datetime, timezone
+from functools import partial
# 2. third-party
from sqlalchemy import DateTime, Float, ForeignKey, Integer, String, Text
@@ -24,8 +25,12 @@ class CorpusModel(Base):
slug: Mapped[str] = mapped_column(String, unique=True, nullable=False, index=True)
title: Mapped[str] = mapped_column(String, nullable=False)
profile_id: Mapped[str] = mapped_column(String, nullable=False)
- created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
- updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
+ created_at: Mapped[datetime] = mapped_column(
+ DateTime, nullable=False, default=partial(datetime.now, tz=timezone.utc)
+ )
+ updated_at: Mapped[datetime] = mapped_column(
+ DateTime, nullable=False, default=partial(datetime.now, tz=timezone.utc)
+ )
manuscripts: Mapped[list["ManuscriptModel"]] = relationship(
back_populates="corpus", cascade="all, delete-orphan"
diff --git a/backend/app/models/job.py b/backend/app/models/job.py
index 26e3b8fab3c7d5f7899d7da665787b3c5afa8491..c47f13b90bcc9a5009dfade80f0aee9ca03b60f7 100644
--- a/backend/app/models/job.py
+++ b/backend/app/models/job.py
@@ -10,7 +10,8 @@ Cycle de vie :
↘ failed
"""
# 1. stdlib
-from datetime import datetime
+from datetime import datetime, timezone
+from functools import partial
# 2. third-party
from sqlalchemy import DateTime, ForeignKey, String, Text
@@ -37,4 +38,6 @@ class JobModel(Base):
started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
finished_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
- created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
+ created_at: Mapped[datetime] = mapped_column(
+ DateTime, nullable=False, default=partial(datetime.now, tz=timezone.utc)
+ )
diff --git a/backend/app/schemas/page_master.py b/backend/app/schemas/page_master.py
index c363dee1766186387e321a10facbe4c049d2c1be..9e06728aa481c9b05ab1ea190e56c9a42c416027 100644
--- a/backend/app/schemas/page_master.py
+++ b/backend/app/schemas/page_master.py
@@ -29,14 +29,25 @@ class Region(BaseModel):
@field_validator("bbox")
@classmethod
- def bbox_must_be_positive(cls, v: list[int]) -> list[int]:
+ def bbox_must_be_valid(cls, v: list[int]) -> list[int]:
if any(x < 0 for x in v):
- raise ValueError("bbox values must be >= 0")
+ raise ValueError("bbox: toutes les valeurs doivent être >= 0")
if v[2] <= 0 or v[3] <= 0:
- raise ValueError("bbox width and height must be > 0")
+ raise ValueError("bbox: width et height doivent être > 0")
return v
+class ImageInfo(BaseModel):
+ """Métadonnées image — CLAUDE.md §4.2."""
+
+ master: str
+ derivative_web: str | None = None
+ thumbnail: str | None = None
+ iiif_base: str | None = None
+ width: int
+ height: int
+
+
class OCRResult(BaseModel):
diplomatic_text: str = ""
blocks: list[dict] = []
@@ -51,6 +62,13 @@ class Translation(BaseModel):
en: str = ""
+class Summary(BaseModel):
+ """Résumé — CLAUDE.md §4.2."""
+
+ short: str = ""
+ detailed: str = ""
+
+
class CommentaryClaim(BaseModel):
claim: str
evidence_region_ids: list[str] = []
@@ -64,6 +82,7 @@ class Commentary(BaseModel):
class ProcessingInfo(BaseModel):
+ provider: str
model_id: str
model_display_name: str
prompt_version: str
@@ -96,11 +115,11 @@ class PageMaster(BaseModel):
folio_label: str
sequence: int
- image: dict
+ image: ImageInfo
layout: dict
ocr: OCRResult | None = None
translation: Translation | None = None
- summary: dict | None = None
+ summary: Summary | None = None
commentary: Commentary | None = None
extensions: dict[str, Any] = {}
diff --git a/backend/app/services/ai/__init__.py b/backend/app/services/ai/__init__.py
index e54c052927be869e1afa1b09b8c873a08171294a..fa455bfe3bce8364291e5424693770059c29c2d6 100644
--- a/backend/app/services/ai/__init__.py
+++ b/backend/app/services/ai/__init__.py
@@ -1,19 +1,31 @@
"""
Services AI — providers Google AI, registre de modèles, et analyse IA.
+
+Les imports de providers sont différés (lazy) pour éviter de charger les SDK
+tiers (google-genai, mistralai) au démarrage. Cela permet à l'application
+de fonctionner même si un SDK n'est pas installé.
"""
-from app.services.ai.analyzer import run_primary_analysis
-from app.services.ai.client_factory import build_client
-from app.services.ai.model_registry import build_model_config, list_all_models
-from app.services.ai.prompt_loader import load_and_render_prompt
-from app.services.ai.provider_google_ai import GoogleAIProvider
-from app.services.ai.provider_vertex_key import VertexAPIKeyProvider
-from app.services.ai.provider_vertex_sa import VertexServiceAccountProvider
-from app.services.ai.response_parser import ParseError, parse_ai_response
+
+
+def __getattr__(name: str):
+ """Import paresseux — les symboles sont résolus au premier accès."""
+ _lazy_map = {
+ "run_primary_analysis": "app.services.ai.analyzer",
+ "build_client": "app.services.ai.client_factory",
+ "build_model_config": "app.services.ai.model_registry",
+ "list_all_models": "app.services.ai.model_registry",
+ "load_and_render_prompt": "app.services.ai.prompt_loader",
+ "parse_ai_response": "app.services.ai.response_parser",
+ "ParseError": "app.services.ai.response_parser",
+ }
+ if name in _lazy_map:
+ import importlib
+ module = importlib.import_module(_lazy_map[name])
+ return getattr(module, name)
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
__all__ = [
- "GoogleAIProvider",
- "VertexAPIKeyProvider",
- "VertexServiceAccountProvider",
"list_all_models",
"build_model_config",
"build_client",
diff --git a/backend/app/services/ai/analyzer.py b/backend/app/services/ai/analyzer.py
index 874d29deeccffc3c460a849854b84ac05177486c..e0c14e55f2b881443c8da999732512e5a67c96d5 100644
--- a/backend/app/services/ai/analyzer.py
+++ b/backend/app/services/ai/analyzer.py
@@ -13,8 +13,8 @@ from pathlib import Path
from app.schemas.corpus_profile import CorpusProfile
from app.schemas.image import ImageDerivativeInfo
from app.schemas.model_config import ModelConfig
-from app.schemas.page_master import EditorialInfo, EditorialStatus, PageMaster, ProcessingInfo
-from app.services.ai.master_writer import write_gemini_raw, write_master_json
+from app.schemas.page_master import EditorialInfo, EditorialStatus, ImageInfo, PageMaster, ProcessingInfo
+from app.services.ai.master_writer import write_ai_raw, write_master_json
from app.services.ai.model_registry import get_provider
from app.services.ai.prompt_loader import load_and_render_prompt
from app.services.ai.response_parser import ParseError, parse_ai_response # noqa: F401
@@ -37,7 +37,7 @@ def run_primary_analysis(
) -> PageMaster:
"""Analyse primaire d'un folio : charge le prompt, appelle l'IA, écrit les fichiers.
- Respecte R05 : gemini_raw.json est toujours écrit en premier, même en cas
+ Respecte R05 : ai_raw.json est toujours écrit en premier, même en cas
d'erreur de parsing. master.json n'est écrit QUE si le parsing a réussi.
Le provider est sélectionné dynamiquement depuis model_config.provider ;
@@ -57,7 +57,7 @@ def run_primary_analysis(
project_root: racine du projet (pour résoudre les chemins des prompts).
Returns:
- PageMaster validé (gemini_raw.json et master.json écrits sur disque).
+ PageMaster validé (ai_raw.json et master.json écrits sur disque).
Raises:
ParseError: si la réponse IA n'est pas un JSON valide.
@@ -66,7 +66,7 @@ def run_primary_analysis(
"""
# ── Chemins de sortie ───────────────────────────────────────────────────
page_dir = base_data_dir / "corpora" / corpus_slug / "pages" / folio_label
- raw_path = page_dir / "gemini_raw.json"
+ raw_path = page_dir / "ai_raw.json"
master_path = page_dir / "master.json"
# ── 1. Chargement et rendu du prompt (R04) ──────────────────────────────
@@ -76,6 +76,7 @@ def run_primary_analysis(
context = {
"profile_label": corpus_profile.label,
"language_hints": ", ".join(corpus_profile.language_hints),
+ "primary_language": corpus_profile.language_hints[0] if corpus_profile.language_hints else "la",
"script_type": corpus_profile.script_type.value,
}
prompt_text = load_and_render_prompt(prompt_abs_path, context)
@@ -85,7 +86,12 @@ def run_primary_analysis(
)
# ── 2. Chargement de l'image dérivée ────────────────────────────────────
- jpeg_bytes = derivative_image_path.read_bytes()
+ if not derivative_image_path.exists():
+ raise FileNotFoundError(f"Image dérivée introuvable : {derivative_image_path}")
+ try:
+ jpeg_bytes = derivative_image_path.read_bytes()
+ except OSError as exc:
+ raise RuntimeError(f"Erreur lecture image {derivative_image_path} : {exc}") from exc
# ── 3. Appel IA via le provider sélectionné ─────────────────────────────
provider = get_provider(model_config.provider)
@@ -104,8 +110,8 @@ def run_primary_analysis(
model_id=model_config.selected_model_id,
)
- # ── 4. Écriture gemini_raw.json TOUJOURS EN PREMIER (R05) ───────────────
- write_gemini_raw(raw_text, raw_path)
+ # ── 4. Écriture ai_raw.json TOUJOURS EN PREMIER (R05) ─────────────────
+ write_ai_raw(raw_text, raw_path)
# ── 5. Parsing + validation (ParseError si JSON invalide) ───────────────
layout, ocr = parse_ai_response(raw_text)
@@ -118,16 +124,17 @@ def run_primary_analysis(
manuscript_id=manuscript_id,
folio_label=folio_label,
sequence=sequence,
- image={
- "original_url": image_info.original_url,
- "derivative_web": image_info.derivative_path,
- "thumbnail": image_info.thumbnail_path,
- "width": image_info.derivative_width,
- "height": image_info.derivative_height,
- },
+ image=ImageInfo(
+ master=image_info.original_url,
+ derivative_web=image_info.derivative_path,
+ thumbnail=image_info.thumbnail_path,
+ width=image_info.derivative_width,
+ height=image_info.derivative_height,
+ ),
layout=layout,
ocr=ocr,
processing=ProcessingInfo(
+ provider=model_config.provider.value if hasattr(model_config.provider, "value") else str(model_config.provider),
model_id=model_config.selected_model_id,
model_display_name=model_config.selected_model_display_name,
prompt_version=prompt_rel_path,
diff --git a/backend/app/services/ai/master_writer.py b/backend/app/services/ai/master_writer.py
index 037dd430176e22b902ee5be26a009076f8985328..f94d47be3b9b3a0bb644769dfa432fc967780b1e 100644
--- a/backend/app/services/ai/master_writer.py
+++ b/backend/app/services/ai/master_writer.py
@@ -1,8 +1,8 @@
"""
-Écriture des fichiers gemini_raw.json et master.json (R02, R05).
+Écriture des fichiers ai_raw.json et master.json (R02, R05).
Règle R05 non négociable :
- 1. gemini_raw.json est TOUJOURS écrit en premier.
+ 1. ai_raw.json est TOUJOURS écrit en premier.
2. master.json n'est écrit QUE si le parsing et la validation Pydantic ont réussi.
"""
# 1. stdlib
@@ -16,24 +16,24 @@ from app.schemas.page_master import PageMaster
logger = logging.getLogger(__name__)
-def write_gemini_raw(raw_text: str, output_path: Path) -> None:
- """Écrit la réponse brute de l'IA dans gemini_raw.json (R05).
+def write_ai_raw(raw_text: str, output_path: Path) -> None:
+ """Écrit la réponse brute de l'IA dans ai_raw.json (R05).
Toujours appelé AVANT toute tentative de parsing.
Le contenu est enveloppé dans un objet JSON pour garantir un fichier valide,
même si la réponse IA n'est pas du JSON.
-
- Args:
- raw_text: texte brut retourné par l'API Google AI.
- output_path: chemin complet du fichier de sortie (gemini_raw.json).
"""
- output_path.parent.mkdir(parents=True, exist_ok=True)
- payload = {"response_text": raw_text}
- output_path.write_text(
- json.dumps(payload, ensure_ascii=False, indent=2),
- encoding="utf-8",
- )
- logger.info("gemini_raw.json écrit", extra={"path": str(output_path)})
+ try:
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ payload = {"response_text": raw_text}
+ output_path.write_text(
+ json.dumps(payload, ensure_ascii=False, indent=2),
+ encoding="utf-8",
+ )
+ except OSError as exc:
+ logger.error("Écriture ai_raw.json échouée", extra={"path": str(output_path), "error": str(exc)})
+ raise
+ logger.info("ai_raw.json écrit", extra={"path": str(output_path)})
def write_master_json(page_master: PageMaster, output_path: Path) -> None:
@@ -41,14 +41,14 @@ def write_master_json(page_master: PageMaster, output_path: Path) -> None:
N'est appelé QUE si le parsing et la validation Pydantic ont réussi.
Crée les dossiers parents si nécessaire.
-
- Args:
- page_master: instance PageMaster validée par Pydantic.
- output_path: chemin complet du fichier de sortie (master.json).
"""
- output_path.parent.mkdir(parents=True, exist_ok=True)
- output_path.write_text(
- page_master.model_dump_json(indent=2),
- encoding="utf-8",
- )
+ try:
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ output_path.write_text(
+ page_master.model_dump_json(indent=2),
+ encoding="utf-8",
+ )
+ except OSError as exc:
+ logger.error("Écriture master.json échouée", extra={"path": str(output_path), "error": str(exc)})
+ raise
logger.info("master.json écrit", extra={"path": str(output_path)})
diff --git a/backend/app/services/ai/model_registry.py b/backend/app/services/ai/model_registry.py
index 273c31e2dcc1ae372fb2f3d801300dfa9a386566..682c06e3ac49565b00084b6ba63bbb6221aee026 100644
--- a/backend/app/services/ai/model_registry.py
+++ b/backend/app/services/ai/model_registry.py
@@ -1,5 +1,8 @@
"""
Registre agrégé des modèles disponibles tous providers confondus.
+
+Les imports de providers sont différés dans _build_providers() pour éviter
+de charger les SDK tiers (google-genai, mistralai) au niveau module.
"""
# 1. stdlib
import logging
@@ -8,10 +11,6 @@ from datetime import datetime, timezone
# 2. local
from app.schemas.model_config import ModelConfig, ModelInfo, ProviderType
from app.services.ai.base import AIProvider
-from app.services.ai.provider_google_ai import GoogleAIProvider
-from app.services.ai.provider_mistral import MistralProvider
-from app.services.ai.provider_vertex_key import VertexAPIKeyProvider
-from app.services.ai.provider_vertex_sa import VertexServiceAccountProvider
logger = logging.getLogger(__name__)
@@ -24,13 +23,27 @@ _PROVIDER_DISPLAY_NAMES: dict[ProviderType, str] = {
}
+_cached_providers: list[AIProvider] | None = None
+
+
def _build_providers() -> list[AIProvider]:
- return [
+ """Construit la liste des providers — imports différés, résultat mis en cache."""
+ global _cached_providers
+ if _cached_providers is not None:
+ return _cached_providers
+
+ from app.services.ai.provider_google_ai import GoogleAIProvider
+ from app.services.ai.provider_mistral import MistralProvider
+ from app.services.ai.provider_vertex_key import VertexAPIKeyProvider
+ from app.services.ai.provider_vertex_sa import VertexServiceAccountProvider
+
+ _cached_providers = [
GoogleAIProvider(),
VertexAPIKeyProvider(),
VertexServiceAccountProvider(),
MistralProvider(),
]
+ return _cached_providers
def get_available_providers() -> list[dict]:
diff --git a/backend/app/services/ai/prompt_loader.py b/backend/app/services/ai/prompt_loader.py
index 1ae8c457278941dd7af8d054b837ace72bb15875..b2ea12862ff9183df68b3c76b5a7326d8ebf10c9 100644
--- a/backend/app/services/ai/prompt_loader.py
+++ b/backend/app/services/ai/prompt_loader.py
@@ -6,6 +6,7 @@ Le code charge le fichier, substitue les variables {{nom}}, envoie à l'API.
"""
# 1. stdlib
import logging
+import re
from pathlib import Path
logger = logging.getLogger(__name__)
@@ -38,6 +39,11 @@ def load_and_render_prompt(template_path: str | Path, context: dict[str, str]) -
for key, value in context.items():
rendered = rendered.replace("{{" + key + "}}", value)
+ # Vérifier qu'il ne reste pas de variables non résolues (CLAUDE.md §8)
+ unresolved = re.findall(r"\{\{\w+\}\}", rendered)
+ if unresolved:
+ raise ValueError(f"Variables non résolues dans le prompt : {unresolved}")
+
logger.debug(
"Prompt chargé et rendu",
extra={"template": str(path), "variables": list(context.keys())},
diff --git a/backend/app/services/ai/provider_google_ai.py b/backend/app/services/ai/provider_google_ai.py
index 64e8ca47b318bf73849e582f62b79216f40f3a9c..46c48415e1a28a161aef1fe4c3411d19064e1535 100644
--- a/backend/app/services/ai/provider_google_ai.py
+++ b/backend/app/services/ai/provider_google_ai.py
@@ -60,8 +60,15 @@ class GoogleAIProvider(AIProvider):
raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
client = genai.Client(api_key=os.environ[_ENV_KEY])
image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
- response = client.models.generate_content(
- model=model_id,
- contents=[image_part, prompt],
- )
+ try:
+ response = client.models.generate_content(
+ model=model_id,
+ contents=[image_part, prompt],
+ )
+ except Exception as exc:
+ logger.error(
+ "Appel API Google AI Studio échoué",
+ extra={"model": model_id, "error": str(exc)},
+ )
+ raise RuntimeError(f"Erreur API Google AI Studio ({model_id}) : {exc}") from exc
return response.text or ""
diff --git a/backend/app/services/ai/provider_mistral.py b/backend/app/services/ai/provider_mistral.py
index 8e54080a18e18ee8d417fefd957e8b2c18738603..23db6f8dff2da5417bb513a5e0fea8d24bfbac6e 100644
--- a/backend/app/services/ai/provider_mistral.py
+++ b/backend/app/services/ai/provider_mistral.py
@@ -208,10 +208,14 @@ class MistralProvider(AIProvider):
# ── Chemin 1 : OCR dédié ─────────────────────────────────────────────
if _is_ocr_model(model_id):
logger.info("Mistral OCR : endpoint dédié client.ocr.process()", extra={"model": model_id})
- response = client.ocr.process(
- model=model_id,
- document={"type": "image_url", "image_url": {"url": data_url}},
- )
+ try:
+ response = client.ocr.process(
+ model=model_id,
+ document={"type": "image_url", "image_url": {"url": data_url}},
+ )
+ except Exception as exc:
+ logger.error("Appel Mistral OCR échoué", extra={"model": model_id, "error": str(exc)})
+ raise RuntimeError(f"Erreur API Mistral OCR ({model_id}) : {exc}") from exc
# OCRResponse.pages : list[OCRPageObject], chacun avec .markdown
pages = getattr(response, "pages", []) or []
return "\n\n".join(
@@ -233,10 +237,14 @@ class MistralProvider(AIProvider):
)
content = prompt
- response = client.chat.complete(
- model=model_id,
- messages=[{"role": "user", "content": content}],
- )
+ try:
+ response = client.chat.complete(
+ model=model_id,
+ messages=[{"role": "user", "content": content}],
+ )
+ except Exception as exc:
+ logger.error("Appel Mistral chat échoué", extra={"model": model_id, "error": str(exc)})
+ raise RuntimeError(f"Erreur API Mistral ({model_id}) : {exc}") from exc
choices = response.choices or []
if not choices:
return ""
diff --git a/backend/app/services/ai/provider_vertex_key.py b/backend/app/services/ai/provider_vertex_key.py
index 53b3359814c665cfc27dc375abc1c9770c52fc0a..585bab0e8df5ac03d47b6f80731b8358e573cc43 100644
--- a/backend/app/services/ai/provider_vertex_key.py
+++ b/backend/app/services/ai/provider_vertex_key.py
@@ -22,9 +22,6 @@ retourne toujours False afin d'éviter des appels réseau voués à l'échec.
import logging
import os
-# 2. third-party
-from google.genai import types # noqa: F401 (conservé pour import cohérence)
-
# 3. local
from app.schemas.model_config import ModelInfo, ProviderType
from app.services.ai.base import AIProvider
diff --git a/backend/app/services/ai/provider_vertex_sa.py b/backend/app/services/ai/provider_vertex_sa.py
index f76f1b971771db7728dce85f246d50f837e6022b..d57ca58f2206d3eca7b2ff698338eb3d566feb9d 100644
--- a/backend/app/services/ai/provider_vertex_sa.py
+++ b/backend/app/services/ai/provider_vertex_sa.py
@@ -90,8 +90,15 @@ class VertexServiceAccountProvider(AIProvider):
raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
client = self._build_client()
image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
- response = client.models.generate_content(
- model=model_id,
- contents=[image_part, prompt],
- )
+ try:
+ response = client.models.generate_content(
+ model=model_id,
+ contents=[image_part, prompt],
+ )
+ except Exception as exc:
+ logger.error(
+ "Appel API Vertex AI échoué",
+ extra={"model": model_id, "error": str(exc)},
+ )
+ raise RuntimeError(f"Erreur API Vertex AI ({model_id}) : {exc}") from exc
return response.text or ""
diff --git a/backend/app/services/corpus_runner.py b/backend/app/services/corpus_runner.py
index a5fb96d906868a70af888033aa43f9efe285436d..f208827581712514a0a5badd85ec253a783e0d25 100644
--- a/backend/app/services/corpus_runner.py
+++ b/backend/app/services/corpus_runner.py
@@ -17,7 +17,6 @@ from sqlalchemy import select
# 3. local
from app.models.database import async_session_factory
from app.models.job import JobModel
-from app.services.job_runner import execute_page_job
logger = logging.getLogger(__name__)
@@ -54,6 +53,8 @@ async def execute_corpus_job(corpus_id: str) -> dict:
)
# Exécution séquentielle — chaque job gère sa propre session
+ from app.services.job_runner import execute_page_job
+
for job_id in job_ids:
await execute_page_job(job_id)
diff --git a/backend/app/services/export/alto.py b/backend/app/services/export/alto.py
index d465509a39a4d7e42ee078e5dbd6eb1c095d80f7..7cabb52ea9dcb22af61348a6dbee27541a562e76 100644
--- a/backend/app/services/export/alto.py
+++ b/backend/app/services/export/alto.py
@@ -1,7 +1,7 @@
"""
Générateur ALTO v4 depuis un PageMaster validé (R02).
-Source canonique : PageMaster uniquement — jamais la réponse brute gemini_raw.json.
+Source canonique : PageMaster uniquement — jamais la réponse brute ai_raw.json.
bbox [x, y, width, height] → HPOS / VPOS / WIDTH / HEIGHT (correspondance directe).
Mapping RegionType → élément ALTO :
@@ -82,7 +82,7 @@ def _build_text_block(
text = fallback_text
if not text:
- return # TextBlock vide — valide ALTO
+ return # TextBlock sans TextLine — valide ALTO, région visible dans le layout
x, y, w, h = region.bbox
line_el = etree.SubElement(
@@ -160,11 +160,7 @@ def generate_alto(master: PageMaster) -> str:
etree.SubElement(desc, _a("MeasurementUnit")).text = "pixel"
src_info = etree.SubElement(desc, _a("sourceImageInformation"))
- file_name = (
- master.image.get("original_url")
- or master.image.get("derivative_web")
- or master.page_id
- )
+ file_name = master.image.master or master.image.derivative_web or master.page_id
etree.SubElement(src_info, _a("fileName")).text = str(file_name)
if master.processing:
@@ -185,8 +181,8 @@ def generate_alto(master: PageMaster) -> str:
# ── Layout ─────────────────────────────────────────────────────────────
layout_el = etree.SubElement(root, _a("Layout"))
- width = int(master.image.get("width", 0))
- height = int(master.image.get("height", 0))
+ width = master.image.width
+ height = master.image.height
page_id_safe = master.page_id.replace(" ", "_")
page_el = etree.SubElement(
diff --git a/backend/app/services/export/iiif.py b/backend/app/services/export/iiif.py
index a887e0ef1eefe2d908703645b0e9aa4d351244ce..2bb58318e293dc42e712a563cf58a2cff3e25256 100644
--- a/backend/app/services/export/iiif.py
+++ b/backend/app/services/export/iiif.py
@@ -74,7 +74,7 @@ def generate_manifest(
manuscript_id = manuscript_meta["manuscript_id"]
label = manuscript_meta["label"]
- language = manuscript_meta.get("language") or "none"
+ language = manuscript_meta.get("language") or "en"
# Pages dans l'ordre de séquence (règle absolue — structMap PHYSICAL)
pages = sorted(masters, key=lambda m: m.sequence)
@@ -102,17 +102,17 @@ def generate_manifest(
canvas_id = (
f"{base_url}/api/v1/manuscripts/{manuscript_id}/canvas/{page.page_id}"
)
- width = int(page.image.get("width", 0))
- height = int(page.image.get("height", 0))
+ width = page.image.width
+ height = page.image.height
annotation_page_id = f"{canvas_id}/annotation-page/1"
annotation_id = f"{canvas_id}/annotation/painting"
- image_url = page.image.get("original_url", "")
+ image_url = page.image.master or ""
canvas: dict = {
"id": canvas_id,
"type": "Canvas",
- "label": {"none": [f"Folio {page.folio_label}"]},
+ "label": {language: [f"Folio {page.folio_label}"]},
"width": width,
"height": height,
"items": [
diff --git a/backend/app/services/export/mets.py b/backend/app/services/export/mets.py
index 3cd128c930c32660f1ceafdea3d9ae3958a0b9a7..160f9457dff6c4a89481b6c63894f321a1866da8 100644
--- a/backend/app/services/export/mets.py
+++ b/backend/app/services/export/mets.py
@@ -182,7 +182,7 @@ def generate_mets(
f_master = _el(grp_master, f"{_M}file", {"ID": f"IMG_MASTER_{sid}", "MIMETYPE": "image/jpeg"})
_el(f_master, f"{_M}FLocat", {
"LOCTYPE": "URL",
- f"{_XL}href": page.image.get("original_url", ""),
+ f"{_XL}href": page.image.master or "",
f"{_XL}type": "simple",
})
@@ -191,12 +191,17 @@ def generate_mets(
_el(f_deriv, f"{_M}FLocat", {
"LOCTYPE": "OTHER",
"OTHERLOCTYPE": "filepath",
- f"{_XL}href": page.image.get("derivative_web", ""),
+ f"{_XL}href": page.image.derivative_web or "",
f"{_XL}type": "simple",
})
- # ALTO
+ # ALTO (référence conditionnelle — warning si le fichier n'existe pas encore)
alto_p = _alto_path(corpus_slug, page.folio_label, base_data_dir)
+ if not Path(alto_p).exists():
+ logger.warning(
+ "Fichier ALTO absent — la référence METS sera cassée tant que l'ALTO n'est pas généré",
+ extra={"alto_path": alto_p, "page_id": page.page_id},
+ )
f_alto = _el(grp_alto, f"{_M}file", {"ID": f"ALTO_{sid}", "MIMETYPE": "text/xml"})
_el(f_alto, f"{_M}FLocat", {
"LOCTYPE": "OTHER",
diff --git a/backend/app/services/ingest/iiif_fetcher.py b/backend/app/services/ingest/iiif_fetcher.py
index 9897e8db3cb473dc320309a6e95059fb7b8c3715..d483852e3a97a1a99af50d42f7934a0fdabc5697 100644
--- a/backend/app/services/ingest/iiif_fetcher.py
+++ b/backend/app/services/ingest/iiif_fetcher.py
@@ -17,7 +17,6 @@ _HEADERS = {
"+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
),
"Accept": "image/jpeg,image/png,image/*,*/*",
- "Referer": "https://gallica.bnf.fr/",
}
diff --git a/backend/app/services/job_runner.py b/backend/app/services/job_runner.py
index 6ac201671fb9352a8058638a2f7dae188bc3477a..c2101b332ebef85472fb834ced5ae45c78342888 100644
--- a/backend/app/services/job_runner.py
+++ b/backend/app/services/job_runner.py
@@ -33,8 +33,6 @@ from app.models.job import JobModel
from app.models.model_config_db import ModelConfigDB
from app.schemas.corpus_profile import CorpusProfile
from app.schemas.model_config import ModelConfig, ProviderType
-from app.services.ai.analyzer import run_primary_analysis
-from app.services.export.alto import generate_alto, write_alto
from app.services.image.normalizer import create_derivatives, fetch_and_normalize
logger = logging.getLogger(__name__)
@@ -148,6 +146,8 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
)
# ── 6. Analyse primaire IA (R05 : double stockage) ───────────────────
+ from app.services.ai.analyzer import run_primary_analysis
+
page_master = run_primary_analysis(
derivative_image_path=Path(image_info.derivative_path),
corpus_profile=corpus_profile,
@@ -163,6 +163,8 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
)
# ── 7. Générer et écrire l'ALTO XML ──────────────────────────────────
+ from app.services.export.alto import generate_alto, write_alto
+
alto_xml = generate_alto(page_master)
alto_path = (
data_dir
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index e0b2d2d7e15794768ce243ad680cec18422c9a1b..3425990d3bb7ab079c308d72e49ad36fb2429bb8 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
"fastapi>=0.111",
"uvicorn[standard]>=0.29",
"pydantic>=2.7",
+ "pydantic-settings>=2.0",
"sqlalchemy>=2.0",
"aiosqlite>=0.20",
"google-genai>=1.0",
diff --git a/backend/tests/conftest_api.py b/backend/tests/conftest_api.py
index b84ff9ea435a0d923d90ace58ff959aa1a28a1d8..82aeb9d5724603dc8bcbc0791e635542d94b8359 100644
--- a/backend/tests/conftest_api.py
+++ b/backend/tests/conftest_api.py
@@ -51,10 +51,11 @@ async def async_client(db_session: AsyncSession):
app.dependency_overrides[get_db] = _override_get_db
# Les background tasks (execute_corpus_job, execute_page_job) créent leur
- # propre session via async_session_factory. On les neutralise pour éviter
- # qu'elles tentent de se connecter à la BDD réelle pendant les tests d'API.
- with patch("app.api.v1.jobs.execute_corpus_job", AsyncMock(return_value=None)), \
- patch("app.api.v1.jobs.execute_page_job", AsyncMock(return_value=None)):
+ # propre session via async_session_factory. On les neutralise en mockant
+ # les modules sources pour éviter qu'elles tentent de se connecter à la
+ # BDD réelle pendant les tests d'API.
+ with patch("app.services.corpus_runner.execute_corpus_job", AsyncMock(return_value={"total": 0, "done": 0, "failed": 0})), \
+ patch("app.services.job_runner.execute_page_job", AsyncMock(return_value=None)):
async with AsyncClient(
transport=ASGITransport(app=app), base_url="http://test"
) as client:
diff --git a/backend/tests/test_ai_analyzer.py b/backend/tests/test_ai_analyzer.py
index e5b5c545787f30a9b8c8060e4fdf1191b31845d2..dd98516fa575a76824c47f25dfd102f89379c218 100644
--- a/backend/tests/test_ai_analyzer.py
+++ b/backend/tests/test_ai_analyzer.py
@@ -3,7 +3,7 @@ Tests du pipeline d'analyse IA :
- prompt_loader : chargement + rendu des templates
- client_factory : construction du genai.Client selon le provider
- response_parser: parsing JSON brut → layout + OCRResult
- - master_writer : écriture gemini_raw.json et master.json
+ - master_writer : écriture ai_raw.json et master.json
- analyzer : run_primary_analysis (end-to-end mocké)
"""
# 1. stdlib
@@ -31,7 +31,7 @@ from app.schemas.model_config import ModelConfig, ProviderType
from app.schemas.page_master import OCRResult, PageMaster
from app.services.ai.analyzer import run_primary_analysis
from app.services.ai.client_factory import build_client
-from app.services.ai.master_writer import write_gemini_raw, write_master_json
+from app.services.ai.master_writer import write_ai_raw, write_master_json
from app.services.ai.prompt_loader import load_and_render_prompt
from app.services.ai.response_parser import ParseError, parse_ai_response
@@ -390,35 +390,35 @@ def test_parse_empty_regions_list():
# ---------------------------------------------------------------------------
-# Tests — write_gemini_raw / write_master_json
+# Tests — write_ai_raw / write_master_json
# ---------------------------------------------------------------------------
-def test_write_gemini_raw_creates_file(tmp_path):
- out = tmp_path / "page" / "gemini_raw.json"
- write_gemini_raw("raw AI text here", out)
+def test_write_ai_raw_creates_file(tmp_path):
+ out = tmp_path / "page" / "ai_raw.json"
+ write_ai_raw("raw AI text here", out)
assert out.exists()
-def test_write_gemini_raw_valid_json(tmp_path):
- out = tmp_path / "gemini_raw.json"
- write_gemini_raw('{"not": "valid json from AI"}', out)
+def test_write_ai_raw_valid_json(tmp_path):
+ out = tmp_path / "ai_raw.json"
+ write_ai_raw('{"not": "valid json from AI"}', out)
content = json.loads(out.read_text(encoding="utf-8"))
assert "response_text" in content
assert content["response_text"] == '{"not": "valid json from AI"}'
-def test_write_gemini_raw_creates_parent_dirs(tmp_path):
- out = tmp_path / "deep" / "nested" / "dir" / "gemini_raw.json"
- write_gemini_raw("text", out)
+def test_write_ai_raw_creates_parent_dirs(tmp_path):
+ out = tmp_path / "deep" / "nested" / "dir" / "ai_raw.json"
+ write_ai_raw("text", out)
assert out.exists()
-def test_write_gemini_raw_with_non_json_text(tmp_path):
- """Même si le texte brut est invalide, gemini_raw.json est créé."""
- out = tmp_path / "gemini_raw.json"
- write_gemini_raw("this is not json at all", out)
+def test_write_ai_raw_with_non_json_text(tmp_path):
+ """Même si le texte brut est invalide, ai_raw.json est créé."""
+ out = tmp_path / "ai_raw.json"
+ write_ai_raw("this is not json at all", out)
content = json.loads(out.read_text(encoding="utf-8"))
assert content["response_text"] == "this is not json at all"
@@ -432,7 +432,7 @@ def _make_page_master() -> PageMaster:
folio_label="0001r",
sequence=1,
image={
- "original_url": "https://example.com/img.jpg",
+ "master": "https://example.com/img.jpg",
"derivative_web": "/data/deriv.jpg",
"thumbnail": "/data/thumb.jpg",
"width": 1500,
@@ -440,10 +440,11 @@ def _make_page_master() -> PageMaster:
},
layout={"regions": []},
processing={
+ "provider": "google_ai_studio",
"model_id": "gemini-2.0-flash",
"model_display_name": "Gemini 2.0 Flash",
"prompt_version": "prompts/medieval-illuminated/primary_v1.txt",
- "raw_response_path": "/data/gemini_raw.json",
+ "raw_response_path": "/data/ai_raw.json",
"processed_at": datetime.now(tz=timezone.utc),
},
)
@@ -568,12 +569,12 @@ def test_run_primary_analysis_files_created(tmp_path):
)
page_dir = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r"
- assert (page_dir / "gemini_raw.json").exists()
+ assert (page_dir / "ai_raw.json").exists()
assert (page_dir / "master.json").exists()
def test_run_primary_analysis_raw_written_before_parse(tmp_path):
- """gemini_raw.json est écrit AVANT que le parsing échoue (R05)."""
+ """ai_raw.json est écrit AVANT que le parsing échoue (R05)."""
prompt_rel = "prompts/medieval-illuminated/primary_v1.txt"
_setup_prompt_file(tmp_path, prompt_rel)
deriv_path = _setup_derivative(tmp_path)
@@ -596,8 +597,8 @@ def test_run_primary_analysis_raw_written_before_parse(tmp_path):
project_root=tmp_path,
)
- # gemini_raw.json existe malgré l'échec de parsing
- raw_path = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r" / "gemini_raw.json"
+ # ai_raw.json existe malgré l'échec de parsing
+ raw_path = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r" / "ai_raw.json"
assert raw_path.exists()
# master.json N'existe PAS (parsing a échoué)
@@ -663,9 +664,9 @@ def test_run_primary_analysis_image_dict(tmp_path):
project_root=tmp_path,
)
- assert result.image["original_url"] == image_info.original_url
- assert result.image["width"] == image_info.derivative_width
- assert result.image["height"] == image_info.derivative_height
+ assert result.image.master == image_info.original_url
+ assert result.image.width == image_info.derivative_width
+ assert result.image.height == image_info.derivative_height
def test_run_primary_analysis_regions_in_layout(tmp_path):
diff --git a/backend/tests/test_api_corrections.py b/backend/tests/test_api_corrections.py
index 38e3140e9ead73b43bbc6776cd606031e51f371c..0e2847f2d4eafca6331920f291a507ace0bdaff1 100644
--- a/backend/tests/test_api_corrections.py
+++ b/backend/tests/test_api_corrections.py
@@ -75,7 +75,7 @@ def _make_master(
"manuscript_id": "ms-test",
"folio_label": "f001r",
"sequence": 1,
- "image": {"original_url": "https://example.com/f.jpg", "width": 1500, "height": 2000},
+ "image": {"master": "https://example.com/f.jpg", "width": 1500, "height": 2000},
"layout": {"regions": []},
"ocr": {
"diplomatic_text": "Incipit liber primus",
@@ -238,13 +238,13 @@ async def test_corrections_archives_old_version(async_client, db_session, monkey
ms = await _create_manuscript(db_session, corpus.id)
page = await _create_page(db_session, ms.id)
- written_paths: list[str] = []
+ written_data: dict[str, str] = {}
monkeypatch.setattr(Path, "exists", lambda self: True)
monkeypatch.setattr(Path, "read_text", lambda self, **kw: _make_master(page.id, version=1))
def _capture_write(self: Path, content: str, **kw: object) -> None:
- written_paths.append(str(self))
+ written_data[str(self)] = content
monkeypatch.setattr(Path, "write_text", _capture_write)
@@ -254,10 +254,17 @@ async def test_corrections_archives_old_version(async_client, db_session, monkey
)
# Deux écritures attendues : master_v1.json (archive) + master.json (nouveau)
+ written_paths = list(written_data.keys())
assert len(written_paths) >= 2
assert any("master_v1.json" in p for p in written_paths)
assert any("master.json" in p and "master_v" not in p for p in written_paths)
+ # Vérifier que l'archive contient bien la version originale (v1)
+ import json as _json
+ archive_path = next(p for p in written_paths if "master_v1.json" in p)
+ archive_data = _json.loads(written_data[archive_path])
+ assert archive_data["editorial"]["version"] == 1
+
@pytest.mark.asyncio
async def test_corrections_multiple_fields(async_client, db_session, monkeypatch):
diff --git a/backend/tests/test_api_export.py b/backend/tests/test_api_export.py
index b95e89a01a007ad4b80d3a7f2072a0038995a650..f1b6e9959946c4256dccf56c3807f9b24f316b5b 100644
--- a/backend/tests/test_api_export.py
+++ b/backend/tests/test_api_export.py
@@ -83,7 +83,7 @@ def _make_master_json(page_id: str, folio_label: str, sequence: int) -> str:
"folio_label": folio_label,
"sequence": sequence,
"image": {
- "original_url": f"https://example.com/{page_id}.jpg",
+ "master": f"https://example.com/{page_id}.jpg",
"derivative_web": f"/data/deriv/{page_id}.jpg",
"thumbnail": f"/data/thumb/{page_id}.jpg",
"width": 1500,
diff --git a/backend/tests/test_api_ingest.py b/backend/tests/test_api_ingest.py
index e7b1b70d714fddc72179e801a0a0abfe66e9b78e..120298debb902867bf6dcc48d36e5ca92895c976 100644
--- a/backend/tests/test_api_ingest.py
+++ b/backend/tests/test_api_ingest.py
@@ -457,6 +457,15 @@ async def test_reingest_manifest_skips_existing_pages(async_client, db_session,
assert data2["pages_created"] == 0
assert data2["pages_skipped"] == 2
+ # Vérifier que la BDD n'a bien que 2 pages (pas de doublons)
+ from sqlalchemy import select as sa_select
+ from app.models.corpus import PageModel
+ page_result = await db_session.execute(
+ sa_select(PageModel).where(PageModel.manuscript_id == data1["manuscript_id"])
+ )
+ pages_in_db = list(page_result.scalars().all())
+ assert len(pages_in_db) == 2
+
@pytest.mark.asyncio
async def test_reingest_images_skips_existing_pages(async_client, db_session):
diff --git a/backend/tests/test_api_models.py b/backend/tests/test_api_models.py
index 5719da4d7af9a377e68ee6d44820c6c6ff675f29..67bc5d9280496b5ae7104d3d5b46fa6b5144899c 100644
--- a/backend/tests/test_api_models.py
+++ b/backend/tests/test_api_models.py
@@ -94,7 +94,7 @@ async def test_get_models_endpoint_removed(async_client):
@pytest.mark.asyncio
async def test_refresh_models_ok(async_client, monkeypatch):
monkeypatch.setattr(
- models_api_module, "list_all_models", lambda: _MOCK_MODELS
+ "app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
)
response = await async_client.post("/api/v1/models/refresh")
assert response.status_code == 200
@@ -103,7 +103,7 @@ async def test_refresh_models_ok(async_client, monkeypatch):
@pytest.mark.asyncio
async def test_refresh_models_has_timestamp(async_client, monkeypatch):
monkeypatch.setattr(
- models_api_module, "list_all_models", lambda: _MOCK_MODELS
+ "app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
)
data = (await async_client.post("/api/v1/models/refresh")).json()
assert "refreshed_at" in data
@@ -113,7 +113,7 @@ async def test_refresh_models_has_timestamp(async_client, monkeypatch):
@pytest.mark.asyncio
async def test_refresh_models_count(async_client, monkeypatch):
monkeypatch.setattr(
- models_api_module, "list_all_models", lambda: _MOCK_MODELS
+ "app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
)
data = (await async_client.post("/api/v1/models/refresh")).json()
assert data["count"] == 2
@@ -123,7 +123,7 @@ async def test_refresh_models_count(async_client, monkeypatch):
@pytest.mark.asyncio
async def test_refresh_models_structure(async_client, monkeypatch):
monkeypatch.setattr(
- models_api_module, "list_all_models", lambda: _MOCK_MODELS
+ "app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
)
data = (await async_client.post("/api/v1/models/refresh")).json()
assert "models" in data
diff --git a/backend/tests/test_api_pages.py b/backend/tests/test_api_pages.py
index 63f30218bc2338102aa8558d2d1c049222771c8f..ef99932efc344c0ed3c23a9930d3caaced3a43c1 100644
--- a/backend/tests/test_api_pages.py
+++ b/backend/tests/test_api_pages.py
@@ -87,7 +87,7 @@ def _make_master_json(page_id: str, corpus_profile: str = "medieval-illuminated"
"folio_label": "f001r",
"sequence": 1,
"image": {
- "original_url": "https://example.com/f001r.jpg",
+ "master": "https://example.com/f001r.jpg",
"derivative_web": "/data/deriv/f001r.jpg",
"thumbnail": "/data/thumb/f001r.jpg",
"width": 1500,
diff --git a/backend/tests/test_api_providers.py b/backend/tests/test_api_providers.py
index 9572948a1216207fb22f80d4ab2dc024432bcc42..ea550a2a61c61cab8810d7ddde61bd9b1abad624 100644
--- a/backend/tests/test_api_providers.py
+++ b/backend/tests/test_api_providers.py
@@ -90,7 +90,7 @@ _MOCK_MISTRAL_MODELS = [
@pytest.mark.asyncio
async def test_list_providers_returns_list(async_client, monkeypatch):
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
resp = await async_client.get("/api/v1/providers")
assert resp.status_code == 200
assert isinstance(resp.json(), list)
@@ -98,14 +98,14 @@ async def test_list_providers_returns_list(async_client, monkeypatch):
@pytest.mark.asyncio
async def test_list_providers_count(async_client, monkeypatch):
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
data = (await async_client.get("/api/v1/providers")).json()
assert len(data) == 4 # 4 providers connus
@pytest.mark.asyncio
async def test_list_providers_fields(async_client, monkeypatch):
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
data = (await async_client.get("/api/v1/providers")).json()
p = data[0]
assert "provider_type" in p
@@ -116,7 +116,7 @@ async def test_list_providers_fields(async_client, monkeypatch):
@pytest.mark.asyncio
async def test_list_providers_all_unavailable(async_client, monkeypatch):
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
data = (await async_client.get("/api/v1/providers")).json()
assert all(not p["available"] for p in data)
assert all(p["model_count"] == 0 for p in data)
@@ -124,7 +124,7 @@ async def test_list_providers_all_unavailable(async_client, monkeypatch):
@pytest.mark.asyncio
async def test_list_providers_google_available(async_client, monkeypatch):
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_GOOGLE_ONLY)
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_GOOGLE_ONLY)
data = (await async_client.get("/api/v1/providers")).json()
google = next(p for p in data if p["provider_type"] == "google_ai_studio")
assert google["available"] is True
@@ -133,7 +133,7 @@ async def test_list_providers_google_available(async_client, monkeypatch):
@pytest.mark.asyncio
async def test_list_providers_mistral_available(async_client, monkeypatch):
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_GOOGLE_AND_MISTRAL)
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_GOOGLE_AND_MISTRAL)
data = (await async_client.get("/api/v1/providers")).json()
mistral = next(p for p in data if p["provider_type"] == "mistral")
assert mistral["available"] is True
@@ -143,7 +143,7 @@ async def test_list_providers_mistral_available(async_client, monkeypatch):
@pytest.mark.asyncio
async def test_list_providers_includes_mistral_type(async_client, monkeypatch):
"""Mistral est toujours dans la liste même si indisponible."""
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
data = (await async_client.get("/api/v1/providers")).json()
types_ = [p["provider_type"] for p in data]
assert "mistral" in types_
@@ -156,7 +156,7 @@ async def test_list_providers_includes_mistral_type(async_client, monkeypatch):
@pytest.mark.asyncio
async def test_get_provider_models_google(async_client, monkeypatch):
monkeypatch.setattr(
- models_api_module, "list_models_for_provider", lambda ptype: _MOCK_GOOGLE_MODELS
+ "app.services.ai.model_registry.list_models_for_provider", lambda ptype: _MOCK_GOOGLE_MODELS
)
resp = await async_client.get("/api/v1/providers/google_ai_studio/models")
assert resp.status_code == 200
@@ -166,7 +166,7 @@ async def test_get_provider_models_google(async_client, monkeypatch):
@pytest.mark.asyncio
async def test_get_provider_models_mistral(async_client, monkeypatch):
monkeypatch.setattr(
- models_api_module, "list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
+ "app.services.ai.model_registry.list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
)
resp = await async_client.get("/api/v1/providers/mistral/models")
assert resp.status_code == 200
@@ -189,7 +189,7 @@ async def test_get_provider_models_not_configured(async_client, monkeypatch):
def _raise(ptype):
raise RuntimeError("Variable d'environnement manquante : MISTRAL_API_KEY")
- monkeypatch.setattr(models_api_module, "list_models_for_provider", _raise)
+ monkeypatch.setattr("app.services.ai.model_registry.list_models_for_provider", _raise)
resp = await async_client.get("/api/v1/providers/mistral/models")
assert resp.status_code == 503
@@ -197,7 +197,7 @@ async def test_get_provider_models_not_configured(async_client, monkeypatch):
@pytest.mark.asyncio
async def test_get_provider_models_fields(async_client, monkeypatch):
monkeypatch.setattr(
- models_api_module, "list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
+ "app.services.ai.model_registry.list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
)
data = (await async_client.get("/api/v1/providers/mistral/models")).json()
m = data[0]
diff --git a/backend/tests/test_api_search.py b/backend/tests/test_api_search.py
index fe6bdd89f9e0a35e5ee9c52fd98dd12f2ee7a8a5..9f491ab422827daed423f0bda78698acbf0114d2 100644
--- a/backend/tests/test_api_search.py
+++ b/backend/tests/test_api_search.py
@@ -33,7 +33,7 @@ def _make_master(page_id: str, diplomatic_text: str = "", translation_fr: str =
"manuscript_id": "ms-test",
"folio_label": "f001r",
"sequence": 1,
- "image": {"original_url": "https://example.com/f.jpg", "width": 1500, "height": 2000},
+ "image": {"master": "https://example.com/f.jpg", "width": 1500, "height": 2000},
"layout": {"regions": []},
"ocr": {
"diplomatic_text": diplomatic_text,
diff --git a/backend/tests/test_export_alto.py b/backend/tests/test_export_alto.py
index 748961bc3662cda39493a76e8a7c36bbe78fd7b7..57c2f4482dbf7192ebf56cc3b7fb25729d7c6943 100644
--- a/backend/tests/test_export_alto.py
+++ b/backend/tests/test_export_alto.py
@@ -52,10 +52,11 @@ def _make_master(
processing = None
if with_processing:
processing = ProcessingInfo(
+ provider="google_ai_studio",
model_id="gemini-2.0-flash",
model_display_name="Gemini 2.0 Flash",
prompt_version="prompts/medieval-illuminated/primary_v1.txt",
- raw_response_path="/data/gemini_raw.json",
+ raw_response_path="/data/ai_raw.json",
processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
)
return PageMaster(
@@ -65,7 +66,7 @@ def _make_master(
folio_label="0001r",
sequence=sequence,
image={
- "original_url": "https://example.com/img.jpg",
+ "master": "https://example.com/img.jpg",
"derivative_web": "/data/deriv.jpg",
"thumbnail": "/data/thumb.jpg",
"width": width,
diff --git a/backend/tests/test_export_iiif.py b/backend/tests/test_export_iiif.py
index 45d19a5ea7334b1245463ce917a1402c270ba10d..a2b875682263bbf94d3cf6aed77b118675a03a98 100644
--- a/backend/tests/test_export_iiif.py
+++ b/backend/tests/test_export_iiif.py
@@ -53,7 +53,7 @@ def _make_page(
folio_label=folio_label,
sequence=sequence,
image={
- "original_url": original_url or f"https://example.com/{folio_label}.jpg",
+ "master": original_url or f"https://example.com/{folio_label}.jpg",
"derivative_web": f"/data/deriv/{folio_label}.jpg",
"thumbnail": f"/data/thumb/{folio_label}.jpg",
"width": width,
@@ -193,11 +193,11 @@ def test_manifest_label_uses_language_key(simple_manifest):
def test_manifest_label_without_language_uses_none():
- """Sans champ language, la clé de label est 'none'."""
+ """Sans champ language, la clé de label est 'en' (défaut IIIF-compliant)."""
pages = [_make_page("ms-0001r", "0001r", 1)]
- meta = _base_meta() # pas de language
+ meta = _base_meta() # pas de language → défaut "en"
manifest = generate_manifest(pages, meta, _BASE_URL)
- assert "none" in manifest["label"]
+ assert "en" in manifest["label"]
def test_manifest_label_fr(chroniques_pages, chroniques_meta):
@@ -272,7 +272,7 @@ def test_canvas_order_respects_sequence():
_make_page("ms-f002r", "f002r", 2),
]
manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
- labels = [c["label"]["none"][0] for c in manifest["items"]]
+ labels = [c["label"]["en"][0] for c in manifest["items"]]
assert labels == ["Folio f001r", "Folio f002r", "Folio f003r"]
@@ -283,7 +283,7 @@ def test_canvas_order_large_sequence():
random.shuffle(pages)
manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
sequences_in_label = [
- int(c["label"]["none"][0].replace("Folio f", "").replace("r", ""))
+ int(c["label"]["en"][0].replace("Folio f", "").replace("r", ""))
for c in manifest["items"]
]
assert sequences_in_label == list(range(1, 11))
@@ -344,7 +344,7 @@ def test_canvas_width_matches_image(beatus_pages, beatus_meta):
# Trouve la page correspondante
page_id = canvas["id"].split("/canvas/")[-1]
page = next(p for p in beatus_pages if p.page_id == page_id)
- assert canvas["width"] == page.image["width"]
+ assert canvas["width"] == page.image.width
def test_canvas_height_matches_image(beatus_pages, beatus_meta):
@@ -352,7 +352,7 @@ def test_canvas_height_matches_image(beatus_pages, beatus_meta):
for canvas in manifest["items"]:
page_id = canvas["id"].split("/canvas/")[-1]
page = next(p for p in beatus_pages if p.page_id == page_id)
- assert canvas["height"] == page.image["height"]
+ assert canvas["height"] == page.image.height
def test_canvas_dimensions_beatus_hr():
@@ -447,7 +447,7 @@ def test_annotation_body_id_is_original_url(beatus_pages, beatus_meta):
page_id = canvas["id"].split("/canvas/")[-1]
page = next(p for p in beatus_pages if p.page_id == page_id)
body = canvas["items"][0]["items"][0]["body"]
- assert body["id"] == page.image["original_url"]
+ assert body["id"] == page.image.master
def test_annotation_body_contains_gallica_url(beatus_pages, beatus_meta):
@@ -480,7 +480,10 @@ def test_base_url_trailing_slash_stripped():
"""Un base_url avec slash final ne génère pas de double slash dans les IDs."""
pages = [_make_page("ms-0001r", "0001r", 1)]
manifest = generate_manifest(pages, _base_meta(), "https://example.com/")
- assert "//" not in manifest["id"].replace("://", "X")
+ manifest_id = manifest["id"]
+ # Retirer le protocole puis vérifier qu'il n'y a pas de double slash
+ without_protocol = manifest_id.split("://", 1)[1]
+ assert "//" not in without_protocol
# ---------------------------------------------------------------------------
diff --git a/backend/tests/test_export_mets.py b/backend/tests/test_export_mets.py
index 748119e45a2310f0d728807ff711a75dcddd61a9..5b2ea18b9d7ae8baae805bcf53fa1744c89b61d3 100644
--- a/backend/tests/test_export_mets.py
+++ b/backend/tests/test_export_mets.py
@@ -66,10 +66,11 @@ def _make_page(
processing = None
if with_processing:
processing = ProcessingInfo(
+ provider="google_ai_studio",
model_id="gemini-2.0-flash",
model_display_name="Gemini 2.0 Flash",
prompt_version="prompts/medieval-illuminated/primary_v1.txt",
- raw_response_path=f"/data/corpora/test/pages/{folio_label}/gemini_raw.json",
+ raw_response_path=f"/data/corpora/test/pages/{folio_label}/ai_raw.json",
processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
)
ocr = OCRResult(diplomatic_text=ocr_text, language="la", confidence=0.90) if ocr_text else None
@@ -80,7 +81,7 @@ def _make_page(
folio_label=folio_label,
sequence=sequence,
image={
- "original_url": original_url or f"https://example.com/{folio_label}.jpg",
+ "master": original_url or f"https://example.com/{folio_label}.jpg",
"derivative_web": derivative_web or f"/data/deriv/{folio_label}.jpg",
"thumbnail": f"/data/thumb/{folio_label}.jpg",
"width": 1500,
@@ -194,7 +195,9 @@ def test_generate_mets_namespace(beatus_pages, beatus_meta):
def test_generate_mets_objid(beatus_pages, beatus_meta):
root = _parse(generate_mets(beatus_pages, beatus_meta))
- assert root.get("OBJID") == "BnF-Latin-8878"
+ objid = root.get("OBJID")
+ assert objid is not None, "OBJID attribute absent du root mets"
+ assert objid == "BnF-Latin-8878"
def test_generate_mets_label(beatus_pages, beatus_meta):
diff --git a/backend/tests/test_image_pipeline.py b/backend/tests/test_image_pipeline.py
index 7e4481419d7849408f2ca98fa63aec7299e46289..c2599b9bc422bc8c80f40ce8cc714b6dc5f38344 100644
--- a/backend/tests/test_image_pipeline.py
+++ b/backend/tests/test_image_pipeline.py
@@ -278,7 +278,6 @@ def test_fetch_iiif_image_success():
"+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
),
"Accept": "image/jpeg,image/png,image/*,*/*",
- "Referer": "https://gallica.bnf.fr/",
},
follow_redirects=True,
timeout=60.0,
diff --git a/backend/tests/test_job_runner.py b/backend/tests/test_job_runner.py
index 903d88bae7232007695261c4b41ddd866d1fb08b..90bff4526fedb6863d040a90123d5f3d5a26fba3 100644
--- a/backend/tests/test_job_runner.py
+++ b/backend/tests/test_job_runner.py
@@ -142,16 +142,24 @@ def _page_master(page_id: str, ms_id: str) -> PageMaster:
def _apply_success_mocks(monkeypatch, page_id: str, ms_id: str) -> None:
- """Applique les mocks IO pour un pipeline réussi."""
+ """Applique les mocks IO pour un pipeline réussi.
+
+ Les imports sont différés dans job_runner (lazy imports). On patche donc
 les modules sources pour que l'import dans la fonction cible récupère le mock.
+ """
monkeypatch.setattr(
job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
)
monkeypatch.setattr(
- job_runner_module, "run_primary_analysis",
+ "app.services.ai.analyzer.run_primary_analysis",
lambda **kw: _page_master(page_id, ms_id),
)
- monkeypatch.setattr(job_runner_module, "generate_alto", lambda pm: "