Spaces:
Build error
Build error
Merge pull request #30 from maribakulj/claude/code-review-analysis-qDhlH
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- Dockerfile +1 -1
- backend/app/api/v1/corpora.py +12 -8
- backend/app/api/v1/export.py +10 -4
- backend/app/api/v1/ingest.py +80 -11
- backend/app/api/v1/jobs.py +6 -2
- backend/app/api/v1/models_api.py +10 -9
- backend/app/api/v1/pages.py +6 -6
- backend/app/api/v1/profiles.py +25 -8
- backend/app/api/v1/search.py +29 -24
- backend/app/config.py +10 -24
- backend/app/main.py +5 -4
- backend/app/models/corpus.py +7 -2
- backend/app/models/job.py +5 -2
- backend/app/schemas/page_master.py +24 -5
- backend/app/services/ai/__init__.py +23 -11
- backend/app/services/ai/analyzer.py +22 -15
- backend/app/services/ai/master_writer.py +24 -24
- backend/app/services/ai/model_registry.py +18 -5
- backend/app/services/ai/prompt_loader.py +6 -0
- backend/app/services/ai/provider_google_ai.py +11 -4
- backend/app/services/ai/provider_mistral.py +16 -8
- backend/app/services/ai/provider_vertex_key.py +0 -3
- backend/app/services/ai/provider_vertex_sa.py +11 -4
- backend/app/services/corpus_runner.py +2 -1
- backend/app/services/export/alto.py +5 -9
- backend/app/services/export/iiif.py +5 -5
- backend/app/services/export/mets.py +8 -3
- backend/app/services/ingest/iiif_fetcher.py +0 -1
- backend/app/services/job_runner.py +4 -2
- backend/pyproject.toml +1 -0
- backend/tests/conftest_api.py +5 -4
- backend/tests/test_ai_analyzer.py +26 -25
- backend/tests/test_api_corrections.py +10 -3
- backend/tests/test_api_export.py +1 -1
- backend/tests/test_api_ingest.py +9 -0
- backend/tests/test_api_models.py +4 -4
- backend/tests/test_api_pages.py +1 -1
- backend/tests/test_api_providers.py +11 -11
- backend/tests/test_api_search.py +1 -1
- backend/tests/test_export_alto.py +3 -2
- backend/tests/test_export_iiif.py +13 -10
- backend/tests/test_export_mets.py +6 -3
- backend/tests/test_image_pipeline.py +0 -1
- backend/tests/test_job_runner.py +28 -16
- backend/tests/test_security.py +215 -0
- frontend/src/App.tsx +1 -0
- frontend/src/lib/api.ts +8 -0
- frontend/src/pages/Editor.tsx +1 -1
- frontend/src/pages/Home.tsx +3 -2
- infra/Dockerfile +0 -71
Dockerfile
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# Scriptorium AI — image de production (multi-stage)
|
| 2 |
# Ce fichier est utilisé par HuggingFace Spaces (SDK docker, détection automatique).
|
| 3 |
-
#
|
| 4 |
#
|
| 5 |
# Build depuis la racine du dépôt :
|
| 6 |
# docker build -t scriptorium-ai .
|
|
|
|
| 1 |
# Scriptorium AI — image de production (multi-stage)
|
| 2 |
# Ce fichier est utilisé par HuggingFace Spaces (SDK docker, détection automatique).
|
| 3 |
+
# Source unique — le fichier infra/Dockerfile a été supprimé pour éviter la divergence.
|
| 4 |
#
|
| 5 |
# Build depuis la racine du dépôt :
|
| 6 |
# docker build -t scriptorium-ai .
|
backend/app/api/v1/corpora.py
CHANGED
|
@@ -14,8 +14,8 @@ import uuid
|
|
| 14 |
from datetime import datetime, timezone
|
| 15 |
|
| 16 |
# 2. third-party
|
| 17 |
-
from fastapi import APIRouter, Depends, HTTPException
|
| 18 |
-
from pydantic import BaseModel, ConfigDict
|
| 19 |
from sqlalchemy import select
|
| 20 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 21 |
|
|
@@ -29,9 +29,9 @@ router = APIRouter(prefix="/corpora", tags=["corpora"])
|
|
| 29 |
# ── Schémas de requête / réponse ─────────────────────────────────────────────
|
| 30 |
|
| 31 |
class CorpusCreate(BaseModel):
|
| 32 |
-
slug: str
|
| 33 |
-
title: str
|
| 34 |
-
profile_id: str
|
| 35 |
|
| 36 |
|
| 37 |
class CorpusResponse(BaseModel):
|
|
@@ -59,9 +59,13 @@ class ManuscriptResponse(BaseModel):
|
|
| 59 |
# ── Endpoints ────────────────────────────────────────────────────────────────
|
| 60 |
|
| 61 |
@router.get("", response_model=list[CorpusResponse])
|
| 62 |
-
async def list_corpora(
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
return list(result.scalars().all())
|
| 66 |
|
| 67 |
|
|
|
|
| 14 |
from datetime import datetime, timezone
|
| 15 |
|
| 16 |
# 2. third-party
|
| 17 |
+
from fastapi import APIRouter, Depends, HTTPException, Query
|
| 18 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 19 |
from sqlalchemy import select
|
| 20 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 21 |
|
|
|
|
| 29 |
# ── Schémas de requête / réponse ─────────────────────────────────────────────
|
| 30 |
|
| 31 |
class CorpusCreate(BaseModel):
|
| 32 |
+
slug: str = Field(..., pattern=r"^[a-z0-9][a-z0-9_-]{0,63}$")
|
| 33 |
+
title: str = Field(..., min_length=1, max_length=256)
|
| 34 |
+
profile_id: str = Field(..., pattern=r"^[a-z0-9][a-z0-9_-]*$")
|
| 35 |
|
| 36 |
|
| 37 |
class CorpusResponse(BaseModel):
|
|
|
|
| 59 |
# ── Endpoints ────────────────────────────────────────────────────────────────
|
| 60 |
|
| 61 |
@router.get("", response_model=list[CorpusResponse])
|
| 62 |
+
async def list_corpora(
|
| 63 |
+
db: AsyncSession = Depends(get_db),
|
| 64 |
+
skip: int = Query(0, ge=0, description="Nombre d'éléments à sauter"),
|
| 65 |
+
limit: int = Query(100, ge=1, le=1000, description="Nombre maximum d'éléments"),
|
| 66 |
+
) -> list[CorpusModel]:
|
| 67 |
+
"""Retourne les corpus enregistrΓ©s (paginΓ©)."""
|
| 68 |
+
result = await db.execute(select(CorpusModel).offset(skip).limit(limit))
|
| 69 |
return list(result.scalars().all())
|
| 70 |
|
| 71 |
|
backend/app/api/v1/export.py
CHANGED
|
@@ -10,6 +10,7 @@ Règle (R02) : toutes les sorties sont générées depuis les PageMasters
|
|
| 10 |
(master.json), jamais depuis les réponses brutes de l'IA.
|
| 11 |
"""
|
| 12 |
# 1. stdlib
|
|
|
|
| 13 |
import io
|
| 14 |
import json
|
| 15 |
import logging
|
|
@@ -66,7 +67,7 @@ async def _load_manuscript_with_masters(
|
|
| 66 |
|
| 67 |
masters: list[PageMaster] = []
|
| 68 |
for page in pages:
|
| 69 |
-
master = _read_master_json(corpus.slug, page.id)
|
| 70 |
if master is not None:
|
| 71 |
masters.append(master)
|
| 72 |
|
|
@@ -79,8 +80,8 @@ async def _load_manuscript_with_masters(
|
|
| 79 |
return manuscript, corpus, masters
|
| 80 |
|
| 81 |
|
| 82 |
-
def
|
| 83 |
-
"""Lit le master.json d'une page depuis data/. Retourne None si absent."""
|
| 84 |
path = (
|
| 85 |
_config_module.settings.data_dir
|
| 86 |
/ "corpora"
|
|
@@ -95,6 +96,11 @@ def _read_master_json(corpus_slug: str, page_id: str) -> PageMaster | None:
|
|
| 95 |
return PageMaster.model_validate(raw)
|
| 96 |
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
def _build_manuscript_meta(
|
| 99 |
manuscript: ManuscriptModel, corpus: CorpusModel
|
| 100 |
) -> dict:
|
|
@@ -154,7 +160,7 @@ async def get_alto(page_id: str, db: AsyncSession = Depends(get_db)) -> Response
|
|
| 154 |
manuscript = await db.get(ManuscriptModel, page.manuscript_id)
|
| 155 |
corpus = await db.get(CorpusModel, manuscript.corpus_id)
|
| 156 |
|
| 157 |
-
master = _read_master_json(corpus.slug, page_id)
|
| 158 |
if master is None:
|
| 159 |
raise HTTPException(
|
| 160 |
status_code=404,
|
|
|
|
| 10 |
(master.json), jamais depuis les réponses brutes de l'IA.
|
| 11 |
"""
|
| 12 |
# 1. stdlib
|
| 13 |
+
import asyncio
|
| 14 |
import io
|
| 15 |
import json
|
| 16 |
import logging
|
|
|
|
| 67 |
|
| 68 |
masters: list[PageMaster] = []
|
| 69 |
for page in pages:
|
| 70 |
+
master = await _read_master_json(corpus.slug, page.id)
|
| 71 |
if master is not None:
|
| 72 |
masters.append(master)
|
| 73 |
|
|
|
|
| 80 |
return manuscript, corpus, masters
|
| 81 |
|
| 82 |
|
| 83 |
+
def _read_master_json_sync(corpus_slug: str, page_id: str) -> PageMaster | None:
|
| 84 |
+
"""Lit le master.json d'une page depuis data/. Retourne None si absent (bloquant)."""
|
| 85 |
path = (
|
| 86 |
_config_module.settings.data_dir
|
| 87 |
/ "corpora"
|
|
|
|
| 96 |
return PageMaster.model_validate(raw)
|
| 97 |
|
| 98 |
|
| 99 |
+
async def _read_master_json(corpus_slug: str, page_id: str) -> PageMaster | None:
|
| 100 |
+
"""Version async β dΓ©lΓ¨gue la lecture au threadpool."""
|
| 101 |
+
return await asyncio.to_thread(_read_master_json_sync, corpus_slug, page_id)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
def _build_manuscript_meta(
|
| 105 |
manuscript: ManuscriptModel, corpus: CorpusModel
|
| 106 |
) -> dict:
|
|
|
|
| 160 |
manuscript = await db.get(ManuscriptModel, page.manuscript_id)
|
| 161 |
corpus = await db.get(CorpusModel, manuscript.corpus_id)
|
| 162 |
|
| 163 |
+
master = await _read_master_json(corpus.slug, page_id)
|
| 164 |
if master is None:
|
| 165 |
raise HTTPException(
|
| 166 |
status_code=404,
|
backend/app/api/v1/ingest.py
CHANGED
|
@@ -11,13 +11,14 @@ Règle : ingestion = création des PageModel en BDD uniquement.
|
|
| 11 |
"""
|
| 12 |
# 1. stdlib
|
| 13 |
import logging
|
|
|
|
| 14 |
import uuid
|
| 15 |
from pathlib import Path
|
| 16 |
|
| 17 |
# 2. third-party
|
| 18 |
import httpx
|
| 19 |
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
|
| 20 |
-
from pydantic import BaseModel
|
| 21 |
from sqlalchemy import func, select
|
| 22 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 23 |
|
|
@@ -30,6 +31,28 @@ logger = logging.getLogger(__name__)
|
|
| 30 |
|
| 31 |
router = APIRouter(tags=["ingestion"])
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
# ── Schémas ──────────────────────────────────────────────────────────────────
|
| 35 |
|
|
@@ -38,8 +61,8 @@ class IIIFManifestRequest(BaseModel):
|
|
| 38 |
|
| 39 |
|
| 40 |
class IIIFImagesRequest(BaseModel):
|
| 41 |
-
urls: list[str]
|
| 42 |
-
folio_labels: list[str]
|
| 43 |
|
| 44 |
|
| 45 |
class IngestResponse(BaseModel):
|
|
@@ -144,11 +167,31 @@ _MANIFEST_HEADERS = {
|
|
| 144 |
}
|
| 145 |
|
| 146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
async def _fetch_json_manifest(url: str) -> dict:
|
| 148 |
-
"""TΓ©lΓ©charge un manifest IIIF
|
|
|
|
| 149 |
async with httpx.AsyncClient() as client:
|
| 150 |
resp = await client.get(url, headers=_MANIFEST_HEADERS, follow_redirects=True, timeout=30.0)
|
| 151 |
resp.raise_for_status()
|
|
|
|
|
|
|
| 152 |
return resp.json()
|
| 153 |
|
| 154 |
|
|
@@ -202,16 +245,33 @@ async def ingest_files(
|
|
| 202 |
seq = await _next_sequence(db, ms.id)
|
| 203 |
|
| 204 |
# Collect labels and detect duplicates
|
| 205 |
-
labels = [Path(f.filename or f"file_{i}").stem for i, f in enumerate(files)]
|
| 206 |
dupes = _find_duplicate_labels(labels)
|
| 207 |
|
| 208 |
created: list[PageModel] = []
|
|
|
|
| 209 |
skipped = 0
|
| 210 |
for i, upload in enumerate(files):
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
folio_label = labels[i]
|
| 213 |
page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
master_dir = (
|
| 216 |
_config_module.settings.data_dir
|
| 217 |
/ "corpora"
|
|
@@ -221,8 +281,8 @@ async def ingest_files(
|
|
| 221 |
)
|
| 222 |
master_dir.mkdir(parents=True, exist_ok=True)
|
| 223 |
master_path = master_dir / filename
|
| 224 |
-
content = await upload.read()
|
| 225 |
master_path.write_bytes(content)
|
|
|
|
| 226 |
|
| 227 |
page = await _create_page(
|
| 228 |
db, ms.id, page_id, folio_label, seq + i,
|
|
@@ -234,7 +294,13 @@ async def ingest_files(
|
|
| 234 |
created.append(page)
|
| 235 |
|
| 236 |
ms.total_pages = (ms.total_pages or 0) + len(created)
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
logger.info(
|
| 240 |
"Fichiers ingΓ©rΓ©s",
|
|
@@ -260,6 +326,8 @@ async def ingest_iiif_manifest(
|
|
| 260 |
|
| 261 |
try:
|
| 262 |
manifest = await _fetch_json_manifest(body.manifest_url)
|
|
|
|
|
|
|
| 263 |
except httpx.HTTPStatusError as exc:
|
| 264 |
raise HTTPException(
|
| 265 |
status_code=502,
|
|
@@ -302,7 +370,7 @@ async def ingest_iiif_manifest(
|
|
| 302 |
seq = await _next_sequence(db, ms.id)
|
| 303 |
|
| 304 |
# Collect labels and detect duplicates
|
| 305 |
-
labels = [_extract_canvas_label(canvas, i) for i, canvas in enumerate(canvases)]
|
| 306 |
dupes = _find_duplicate_labels(labels)
|
| 307 |
|
| 308 |
created: list[PageModel] = []
|
|
@@ -358,11 +426,12 @@ async def ingest_iiif_images(
|
|
| 358 |
ms = await _get_or_create_manuscript(db, corpus_id)
|
| 359 |
seq = await _next_sequence(db, ms.id)
|
| 360 |
|
| 361 |
-
|
|
|
|
| 362 |
|
| 363 |
created: list[PageModel] = []
|
| 364 |
skipped = 0
|
| 365 |
-
for i, (url, folio_label) in enumerate(zip(body.urls,
|
| 366 |
page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
|
| 367 |
page = await _create_page(
|
| 368 |
db, ms.id, page_id, folio_label, seq + i,
|
|
|
|
| 11 |
"""
|
| 12 |
# 1. stdlib
|
| 13 |
import logging
|
| 14 |
+
import re
|
| 15 |
import uuid
|
| 16 |
from pathlib import Path
|
| 17 |
|
| 18 |
# 2. third-party
|
| 19 |
import httpx
|
| 20 |
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
|
| 21 |
+
from pydantic import BaseModel, Field
|
| 22 |
from sqlalchemy import func, select
|
| 23 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 24 |
|
|
|
|
| 31 |
|
| 32 |
router = APIRouter(tags=["ingestion"])
|
| 33 |
|
| 34 |
+
# ── Constantes de sécurité ───────────────────────────────────────────────────
|
| 35 |
+
|
| 36 |
+
_SAFE_LABEL_RE = re.compile(r"^[\w\-\.]+$")
|
| 37 |
+
_MAX_UPLOAD_BYTES = 100 * 1024 * 1024 # 100 Mo par fichier
|
| 38 |
+
_ALLOWED_MIME_PREFIXES = ("image/",)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _sanitize_label(label: str) -> str:
|
| 42 |
+
"""Nettoie un folio_label : garde uniquement alphanum, -, _, ."""
|
| 43 |
+
clean = Path(label).name # retire tout chemin
|
| 44 |
+
if not _SAFE_LABEL_RE.match(clean) or not clean:
|
| 45 |
+
clean = re.sub(r"[^\w\-\.]", "_", clean) or "page"
|
| 46 |
+
return clean
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _sanitize_filename(name: str) -> str:
|
| 50 |
+
"""Nettoie un nom de fichier uploadΓ© : garde uniquement le basename sΓ»r."""
|
| 51 |
+
clean = Path(name).name
|
| 52 |
+
if not _SAFE_LABEL_RE.match(clean) or not clean:
|
| 53 |
+
clean = f"{uuid.uuid4().hex[:12]}.bin"
|
| 54 |
+
return clean
|
| 55 |
+
|
| 56 |
|
| 57 |
# ── Schémas ──────────────────────────────────────────────────────────────────
|
| 58 |
|
|
|
|
| 61 |
|
| 62 |
|
| 63 |
class IIIFImagesRequest(BaseModel):
|
| 64 |
+
urls: list[str] = Field(..., max_length=5000)
|
| 65 |
+
folio_labels: list[str] = Field(..., max_length=5000)
|
| 66 |
|
| 67 |
|
| 68 |
class IngestResponse(BaseModel):
|
|
|
|
| 167 |
}
|
| 168 |
|
| 169 |
|
| 170 |
+
_MAX_MANIFEST_BYTES = 10 * 1024 * 1024 # 10 Mo max pour un manifest JSON
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def _validate_url(url: str) -> None:
|
| 174 |
+
"""Rejette les URLs non-HTTP et les cibles rΓ©seau privΓ© (SSRF)."""
|
| 175 |
+
from urllib.parse import urlparse
|
| 176 |
+
|
| 177 |
+
parsed = urlparse(url)
|
| 178 |
+
if parsed.scheme not in ("http", "https"):
|
| 179 |
+
raise ValueError(f"Schéma non autorisé : {parsed.scheme!r}")
|
| 180 |
+
host = (parsed.hostname or "").lower()
|
| 181 |
+
# Bloquer les adresses privées / locales
|
| 182 |
+
blocked = ("localhost", "127.0.0.1", "0.0.0.0", "[::1]", "metadata.google.internal")
|
| 183 |
+
if host in blocked or host.startswith("169.254.") or host.startswith("10.") or host.startswith("192.168."):
|
| 184 |
+
raise ValueError(f"Hôte interdit : {host}")
|
| 185 |
+
|
| 186 |
+
|
| 187 |
async def _fetch_json_manifest(url: str) -> dict:
|
| 188 |
+
"""TΓ©lΓ©charge un manifest IIIF avec protections SSRF + taille max."""
|
| 189 |
+
_validate_url(url)
|
| 190 |
async with httpx.AsyncClient() as client:
|
| 191 |
resp = await client.get(url, headers=_MANIFEST_HEADERS, follow_redirects=True, timeout=30.0)
|
| 192 |
resp.raise_for_status()
|
| 193 |
+
if len(resp.content) > _MAX_MANIFEST_BYTES:
|
| 194 |
+
raise ValueError(f"Manifest trop volumineux ({len(resp.content)} octets)")
|
| 195 |
return resp.json()
|
| 196 |
|
| 197 |
|
|
|
|
| 245 |
seq = await _next_sequence(db, ms.id)
|
| 246 |
|
| 247 |
# Collect labels and detect duplicates
|
| 248 |
+
labels = [_sanitize_label(Path(f.filename or f"file_{i}").stem) for i, f in enumerate(files)]
|
| 249 |
dupes = _find_duplicate_labels(labels)
|
| 250 |
|
| 251 |
created: list[PageModel] = []
|
| 252 |
+
written_files: list[Path] = []
|
| 253 |
skipped = 0
|
| 254 |
for i, upload in enumerate(files):
|
| 255 |
+
# Validation MIME type
|
| 256 |
+
ctype = upload.content_type or ""
|
| 257 |
+
if not any(ctype.startswith(p) for p in _ALLOWED_MIME_PREFIXES):
|
| 258 |
+
raise HTTPException(
|
| 259 |
+
status_code=422,
|
| 260 |
+
detail=f"Type MIME non autorisé : {ctype!r}. Seules les images sont acceptées.",
|
| 261 |
+
)
|
| 262 |
+
|
| 263 |
+
filename = _sanitize_filename(upload.filename or f"file_{i}.bin")
|
| 264 |
folio_label = labels[i]
|
| 265 |
page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
|
| 266 |
|
| 267 |
+
content = await upload.read()
|
| 268 |
+
# Validation taille
|
| 269 |
+
if len(content) > _MAX_UPLOAD_BYTES:
|
| 270 |
+
raise HTTPException(
|
| 271 |
+
status_code=413,
|
| 272 |
+
detail=f"Fichier trop volumineux ({len(content)} octets). Maximum : {_MAX_UPLOAD_BYTES}.",
|
| 273 |
+
)
|
| 274 |
+
|
| 275 |
master_dir = (
|
| 276 |
_config_module.settings.data_dir
|
| 277 |
/ "corpora"
|
|
|
|
| 281 |
)
|
| 282 |
master_dir.mkdir(parents=True, exist_ok=True)
|
| 283 |
master_path = master_dir / filename
|
|
|
|
| 284 |
master_path.write_bytes(content)
|
| 285 |
+
written_files.append(master_path)
|
| 286 |
|
| 287 |
page = await _create_page(
|
| 288 |
db, ms.id, page_id, folio_label, seq + i,
|
|
|
|
| 294 |
created.append(page)
|
| 295 |
|
| 296 |
ms.total_pages = (ms.total_pages or 0) + len(created)
|
| 297 |
+
try:
|
| 298 |
+
await db.commit()
|
| 299 |
+
except Exception:
|
| 300 |
+
# Nettoyage des fichiers orphelins si le commit BDD échoue
|
| 301 |
+
for f in written_files:
|
| 302 |
+
f.unlink(missing_ok=True)
|
| 303 |
+
raise
|
| 304 |
|
| 305 |
logger.info(
|
| 306 |
"Fichiers ingΓ©rΓ©s",
|
|
|
|
| 326 |
|
| 327 |
try:
|
| 328 |
manifest = await _fetch_json_manifest(body.manifest_url)
|
| 329 |
+
except ValueError as exc:
|
| 330 |
+
raise HTTPException(status_code=400, detail=str(exc))
|
| 331 |
except httpx.HTTPStatusError as exc:
|
| 332 |
raise HTTPException(
|
| 333 |
status_code=502,
|
|
|
|
| 370 |
seq = await _next_sequence(db, ms.id)
|
| 371 |
|
| 372 |
# Collect labels and detect duplicates
|
| 373 |
+
labels = [_sanitize_label(_extract_canvas_label(canvas, i)) for i, canvas in enumerate(canvases)]
|
| 374 |
dupes = _find_duplicate_labels(labels)
|
| 375 |
|
| 376 |
created: list[PageModel] = []
|
|
|
|
| 426 |
ms = await _get_or_create_manuscript(db, corpus_id)
|
| 427 |
seq = await _next_sequence(db, ms.id)
|
| 428 |
|
| 429 |
+
sanitized_labels = [_sanitize_label(lbl) for lbl in body.folio_labels]
|
| 430 |
+
dupes = _find_duplicate_labels(sanitized_labels)
|
| 431 |
|
| 432 |
created: list[PageModel] = []
|
| 433 |
skipped = 0
|
| 434 |
+
for i, (url, folio_label) in enumerate(zip(body.urls, sanitized_labels)):
|
| 435 |
page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
|
| 436 |
page = await _create_page(
|
| 437 |
db, ms.id, page_id, folio_label, seq + i,
|
backend/app/api/v1/jobs.py
CHANGED
|
@@ -22,8 +22,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
| 22 |
from app.models.corpus import CorpusModel, ManuscriptModel, PageModel
|
| 23 |
from app.models.database import get_db
|
| 24 |
from app.models.job import JobModel
|
| 25 |
-
from app.services.corpus_runner import execute_corpus_job
|
| 26 |
-
from app.services.job_runner import execute_page_job
|
| 27 |
|
| 28 |
router = APIRouter(tags=["jobs"])
|
| 29 |
|
|
@@ -101,6 +99,8 @@ async def run_corpus(
|
|
| 101 |
await db.commit()
|
| 102 |
|
| 103 |
# Lancer le pipeline en arrière-plan (après envoi de la réponse)
|
|
|
|
|
|
|
| 104 |
background_tasks.add_task(execute_corpus_job, corpus_id)
|
| 105 |
|
| 106 |
return CorpusRunResponse(
|
|
@@ -135,6 +135,8 @@ async def run_page(
|
|
| 135 |
await db.refresh(job)
|
| 136 |
|
| 137 |
# Lancer le pipeline en arrière-plan (après envoi de la réponse)
|
|
|
|
|
|
|
| 138 |
background_tasks.add_task(execute_page_job, job.id)
|
| 139 |
|
| 140 |
return job
|
|
@@ -175,6 +177,8 @@ async def retry_job(
|
|
| 175 |
await db.refresh(job)
|
| 176 |
|
| 177 |
# Relancer le pipeline
|
|
|
|
|
|
|
| 178 |
background_tasks.add_task(execute_page_job, job.id)
|
| 179 |
|
| 180 |
return job
|
|
|
|
| 22 |
from app.models.corpus import CorpusModel, ManuscriptModel, PageModel
|
| 23 |
from app.models.database import get_db
|
| 24 |
from app.models.job import JobModel
|
|
|
|
|
|
|
| 25 |
|
| 26 |
router = APIRouter(tags=["jobs"])
|
| 27 |
|
|
|
|
| 99 |
await db.commit()
|
| 100 |
|
| 101 |
# Lancer le pipeline en arrière-plan (après envoi de la réponse)
|
| 102 |
+
from app.services.corpus_runner import execute_corpus_job
|
| 103 |
+
|
| 104 |
background_tasks.add_task(execute_corpus_job, corpus_id)
|
| 105 |
|
| 106 |
return CorpusRunResponse(
|
|
|
|
| 135 |
await db.refresh(job)
|
| 136 |
|
| 137 |
# Lancer le pipeline en arrière-plan (après envoi de la réponse)
|
| 138 |
+
from app.services.job_runner import execute_page_job
|
| 139 |
+
|
| 140 |
background_tasks.add_task(execute_page_job, job.id)
|
| 141 |
|
| 142 |
return job
|
|
|
|
| 177 |
await db.refresh(job)
|
| 178 |
|
| 179 |
# Relancer le pipeline
|
| 180 |
+
from app.services.job_runner import execute_page_job
|
| 181 |
+
|
| 182 |
background_tasks.add_task(execute_page_job, job.id)
|
| 183 |
|
| 184 |
return job
|
backend/app/api/v1/models_api.py
CHANGED
|
@@ -17,7 +17,7 @@ from datetime import datetime, timezone
|
|
| 17 |
|
| 18 |
# 2. third-party
|
| 19 |
from fastapi import APIRouter, Depends, HTTPException
|
| 20 |
-
from pydantic import BaseModel, ConfigDict
|
| 21 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 22 |
|
| 23 |
# 3. local
|
|
@@ -25,11 +25,6 @@ from app.models.corpus import CorpusModel
|
|
| 25 |
from app.models.database import get_db
|
| 26 |
from app.models.model_config_db import ModelConfigDB
|
| 27 |
from app.schemas.model_config import ProviderType
|
| 28 |
-
from app.services.ai.model_registry import (
|
| 29 |
-
get_available_providers,
|
| 30 |
-
list_all_models,
|
| 31 |
-
list_models_for_provider,
|
| 32 |
-
)
|
| 33 |
|
| 34 |
logger = logging.getLogger(__name__)
|
| 35 |
|
|
@@ -47,9 +42,9 @@ class ProviderInfo(BaseModel):
|
|
| 47 |
|
| 48 |
|
| 49 |
class ModelSelectRequest(BaseModel):
|
| 50 |
-
model_id: str
|
| 51 |
-
provider_type: str
|
| 52 |
-
display_name: str = ""
|
| 53 |
|
| 54 |
|
| 55 |
class ModelConfigResponse(BaseModel):
|
|
@@ -77,6 +72,8 @@ async def list_providers() -> list[dict]:
|
|
| 77 |
Un provider est disponible si la variable d'environnement correspondante
|
| 78 |
est présente dans les secrets HuggingFace. Aucune clé n'est exposée.
|
| 79 |
"""
|
|
|
|
|
|
|
| 80 |
return get_available_providers()
|
| 81 |
|
| 82 |
|
|
@@ -91,6 +88,8 @@ async def get_provider_models(provider_type: str) -> list[dict]:
|
|
| 91 |
detail=f"Provider inconnu : {provider_type}. "
|
| 92 |
f"Valeurs acceptées : {[p.value for p in ProviderType]}",
|
| 93 |
)
|
|
|
|
|
|
|
| 94 |
try:
|
| 95 |
models = list_models_for_provider(ptype)
|
| 96 |
except RuntimeError as exc:
|
|
@@ -104,6 +103,8 @@ async def get_provider_models(provider_type: str) -> list[dict]:
|
|
| 104 |
@router.post("/models/refresh", response_model=ModelsRefreshResponse)
|
| 105 |
async def refresh_models() -> ModelsRefreshResponse:
|
| 106 |
"""Force la mise à jour de la liste agrégée de tous les modèles disponibles."""
|
|
|
|
|
|
|
| 107 |
models = list_all_models()
|
| 108 |
return ModelsRefreshResponse(
|
| 109 |
models=[m.model_dump() for m in models],
|
|
|
|
| 17 |
|
| 18 |
# 2. third-party
|
| 19 |
from fastapi import APIRouter, Depends, HTTPException
|
| 20 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 21 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 22 |
|
| 23 |
# 3. local
|
|
|
|
| 25 |
from app.models.database import get_db
|
| 26 |
from app.models.model_config_db import ModelConfigDB
|
| 27 |
from app.schemas.model_config import ProviderType
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
logger = logging.getLogger(__name__)
|
| 30 |
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
class ModelSelectRequest(BaseModel):
|
| 45 |
+
model_id: str = Field(..., min_length=1, max_length=256)
|
| 46 |
+
provider_type: str = Field(..., min_length=1, max_length=64)
|
| 47 |
+
display_name: str = Field("", max_length=256)
|
| 48 |
|
| 49 |
|
| 50 |
class ModelConfigResponse(BaseModel):
|
|
|
|
| 72 |
Un provider est disponible si la variable d'environnement correspondante
|
| 73 |
est présente dans les secrets HuggingFace. Aucune clé n'est exposée.
|
| 74 |
"""
|
| 75 |
+
from app.services.ai.model_registry import get_available_providers
|
| 76 |
+
|
| 77 |
return get_available_providers()
|
| 78 |
|
| 79 |
|
|
|
|
| 88 |
detail=f"Provider inconnu : {provider_type}. "
|
| 89 |
f"Valeurs acceptées : {[p.value for p in ProviderType]}",
|
| 90 |
)
|
| 91 |
+
from app.services.ai.model_registry import list_models_for_provider
|
| 92 |
+
|
| 93 |
try:
|
| 94 |
models = list_models_for_provider(ptype)
|
| 95 |
except RuntimeError as exc:
|
|
|
|
| 103 |
@router.post("/models/refresh", response_model=ModelsRefreshResponse)
|
| 104 |
async def refresh_models() -> ModelsRefreshResponse:
|
| 105 |
"""Force la mise à jour de la liste agrégée de tous les modèles disponibles."""
|
| 106 |
+
from app.services.ai.model_registry import list_all_models
|
| 107 |
+
|
| 108 |
models = list_all_models()
|
| 109 |
return ModelsRefreshResponse(
|
| 110 |
models=[m.model_dump() for m in models],
|
backend/app/api/v1/pages.py
CHANGED
|
@@ -18,7 +18,7 @@ from typing import Any
|
|
| 18 |
|
| 19 |
# 2. third-party
|
| 20 |
from fastapi import APIRouter, Depends, HTTPException
|
| 21 |
-
from pydantic import BaseModel, ConfigDict, ValidationError
|
| 22 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 23 |
|
| 24 |
# 3. local
|
|
@@ -42,12 +42,12 @@ class CorrectionsRequest(BaseModel):
|
|
| 42 |
indiquée est restaurée (avec incrémentation de editorial.version).
|
| 43 |
"""
|
| 44 |
|
| 45 |
-
ocr_diplomatic_text: str | None = None
|
| 46 |
-
editorial_status: str | None = None
|
| 47 |
-
commentary_public: str | None = None
|
| 48 |
-
commentary_scholarly: str | None = None
|
| 49 |
region_validations: dict[str, str] | None = None
|
| 50 |
-
restore_to_version: int | None = None
|
| 51 |
|
| 52 |
|
| 53 |
class VersionInfo(BaseModel):
|
|
|
|
| 18 |
|
| 19 |
# 2. third-party
|
| 20 |
from fastapi import APIRouter, Depends, HTTPException
|
| 21 |
+
from pydantic import BaseModel, ConfigDict, Field, ValidationError
|
| 22 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 23 |
|
| 24 |
# 3. local
|
|
|
|
| 42 |
indiquée est restaurée (avec incrémentation de editorial.version).
|
| 43 |
"""
|
| 44 |
|
| 45 |
+
ocr_diplomatic_text: str | None = Field(None, max_length=500_000)
|
| 46 |
+
editorial_status: str | None = Field(None, max_length=50)
|
| 47 |
+
commentary_public: str | None = Field(None, max_length=100_000)
|
| 48 |
+
commentary_scholarly: str | None = Field(None, max_length=100_000)
|
| 49 |
region_validations: dict[str, str] | None = None
|
| 50 |
+
restore_to_version: int | None = Field(None, ge=1)
|
| 51 |
|
| 52 |
|
| 53 |
class VersionInfo(BaseModel):
|
backend/app/api/v1/profiles.py
CHANGED
|
@@ -8,8 +8,10 @@ Les profils sont des fichiers JSON dans profiles/ (racine du dépôt).
|
|
| 8 |
Ils sont validés par CorpusProfile avant d'être retournés.
|
| 9 |
"""
|
| 10 |
# 1. stdlib
|
|
|
|
| 11 |
import json
|
| 12 |
import logging
|
|
|
|
| 13 |
from pathlib import Path
|
| 14 |
|
| 15 |
# 2. third-party
|
|
@@ -49,21 +51,36 @@ async def list_profiles() -> list[dict]:
|
|
| 49 |
if not settings.profiles_dir.is_dir():
|
| 50 |
logger.warning("profiles_dir introuvable : %s", settings.profiles_dir)
|
| 51 |
return []
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
@router.get("/{profile_id}", response_model=dict)
|
| 61 |
async def get_profile(profile_id: str) -> dict:
|
| 62 |
"""Retourne un profil par son id (nom du fichier sans extension)."""
|
|
|
|
|
|
|
| 63 |
path = settings.profiles_dir / f"{profile_id}.json"
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
raise HTTPException(status_code=404, detail="Profil introuvable")
|
| 66 |
-
profile = _load_profile(path)
|
| 67 |
if profile is None:
|
| 68 |
raise HTTPException(status_code=422, detail="Profil invalide")
|
| 69 |
return profile.model_dump()
|
|
|
|
| 8 |
Ils sont validés par CorpusProfile avant d'être retournés.
|
| 9 |
"""
|
| 10 |
# 1. stdlib
|
| 11 |
+
import asyncio
|
| 12 |
import json
|
| 13 |
import logging
|
| 14 |
+
import re
|
| 15 |
from pathlib import Path
|
| 16 |
|
| 17 |
# 2. third-party
|
|
|
|
| 51 |
if not settings.profiles_dir.is_dir():
|
| 52 |
logger.warning("profiles_dir introuvable : %s", settings.profiles_dir)
|
| 53 |
return []
|
| 54 |
+
|
| 55 |
+
def _scan_profiles() -> list[dict]:
|
| 56 |
+
result = []
|
| 57 |
+
for path in sorted(settings.profiles_dir.glob("*.json")):
|
| 58 |
+
profile = _load_profile(path)
|
| 59 |
+
if profile is not None:
|
| 60 |
+
result.append(profile.model_dump())
|
| 61 |
+
return result
|
| 62 |
+
|
| 63 |
+
return await asyncio.to_thread(_scan_profiles)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
_SAFE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9_-]*$")
|
| 67 |
|
| 68 |
|
| 69 |
@router.get("/{profile_id}", response_model=dict)
|
| 70 |
async def get_profile(profile_id: str) -> dict:
|
| 71 |
"""Retourne un profil par son id (nom du fichier sans extension)."""
|
| 72 |
+
if not _SAFE_ID_RE.match(profile_id):
|
| 73 |
+
raise HTTPException(status_code=400, detail="profile_id invalide")
|
| 74 |
path = settings.profiles_dir / f"{profile_id}.json"
|
| 75 |
+
|
| 76 |
+
def _read() -> CorpusProfile | None:
|
| 77 |
+
if not path.exists():
|
| 78 |
+
return None
|
| 79 |
+
return _load_profile(path)
|
| 80 |
+
|
| 81 |
+
profile = await asyncio.to_thread(_read)
|
| 82 |
+
if profile is None and not path.exists():
|
| 83 |
raise HTTPException(status_code=404, detail="Profil introuvable")
|
|
|
|
| 84 |
if profile is None:
|
| 85 |
raise HTTPException(status_code=422, detail="Profil invalide")
|
| 86 |
return profile.model_dump()
|
backend/app/api/v1/search.py
CHANGED
|
@@ -7,6 +7,7 @@ Implémentation MVP : scan des fichiers master.json (pas d'index externe).
|
|
| 7 |
Insensible à la casse et aux accents (unicodedata NFD + ASCII).
|
| 8 |
"""
|
| 9 |
# 1. stdlib
|
|
|
|
| 10 |
import json
|
| 11 |
import logging
|
| 12 |
import unicodedata
|
|
@@ -95,7 +96,8 @@ def _score_master(data: dict, query_normalized: str) -> tuple[int, str]:
|
|
| 95 |
|
| 96 |
@router.get("/search", response_model=list[SearchResult])
|
| 97 |
async def search_pages(
|
| 98 |
-
q: str = Query(..., min_length=2, description="Requête de recherche (
|
|
|
|
| 99 |
) -> list[SearchResult]:
|
| 100 |
"""Recherche plein texte dans les master.json de tous les corpus.
|
| 101 |
|
|
@@ -106,29 +108,32 @@ async def search_pages(
|
|
| 106 |
query_normalized = _normalize(q.strip())
|
| 107 |
data_dir = _config_module.settings.data_dir
|
| 108 |
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
|
|
|
|
|
|
| 129 |
)
|
| 130 |
-
)
|
|
|
|
| 131 |
|
| 132 |
-
results
|
| 133 |
logger.info("Recherche exécutée", extra={"q": q, "results": len(results)})
|
| 134 |
-
return results
|
|
|
|
| 7 |
Insensible à la casse et aux accents (unicodedata NFD + ASCII).
|
| 8 |
"""
|
| 9 |
# 1. stdlib
|
| 10 |
+
import asyncio
|
| 11 |
import json
|
| 12 |
import logging
|
| 13 |
import unicodedata
|
|
|
|
| 96 |
|
| 97 |
@router.get("/search", response_model=list[SearchResult])
|
| 98 |
async def search_pages(
|
| 99 |
+
q: str = Query(..., min_length=2, max_length=500, description="RequΓͺte de recherche (2β500 caractΓ¨res)"),
|
| 100 |
+
limit: int = Query(200, ge=1, le=2000, description="Nombre maximum de rΓ©sultats"),
|
| 101 |
) -> list[SearchResult]:
|
| 102 |
"""Recherche plein texte dans les master.json de tous les corpus.
|
| 103 |
|
|
|
|
| 108 |
query_normalized = _normalize(q.strip())
|
| 109 |
data_dir = _config_module.settings.data_dir
|
| 110 |
|
| 111 |
+
def _scan() -> list[SearchResult]:
|
| 112 |
+
"""Scan bloquant exΓ©cutΓ© dans un thread dΓ©diΓ©."""
|
| 113 |
+
hits: list[SearchResult] = []
|
| 114 |
+
for master_path in data_dir.glob("corpora/*/pages/*/master.json"):
|
| 115 |
+
try:
|
| 116 |
+
raw: dict = json.loads(master_path.read_text(encoding="utf-8"))
|
| 117 |
+
except (json.JSONDecodeError, OSError):
|
| 118 |
+
continue
|
| 119 |
+
|
| 120 |
+
score, excerpt = _score_master(raw, query_normalized)
|
| 121 |
+
if score == 0:
|
| 122 |
+
continue
|
| 123 |
+
|
| 124 |
+
hits.append(
|
| 125 |
+
SearchResult(
|
| 126 |
+
page_id=raw.get("page_id", ""),
|
| 127 |
+
folio_label=raw.get("folio_label", ""),
|
| 128 |
+
manuscript_id=raw.get("manuscript_id", ""),
|
| 129 |
+
excerpt=excerpt,
|
| 130 |
+
score=score,
|
| 131 |
+
corpus_profile=raw.get("corpus_profile", ""),
|
| 132 |
+
)
|
| 133 |
)
|
| 134 |
+
hits.sort(key=lambda r: r.score, reverse=True)
|
| 135 |
+
return hits
|
| 136 |
|
| 137 |
+
results = await asyncio.to_thread(_scan)
|
| 138 |
logger.info("Recherche exΓ©cutΓ©e", extra={"q": q, "results": len(results)})
|
| 139 |
+
return results[:limit]
|
backend/app/config.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
| 1 |
"""
|
| 2 |
Configuration globale de la plateforme, chargée depuis les variables d'environnement.
|
| 3 |
|
| 4 |
-
|
| 5 |
-
- les valeurs sont lues depuis os.environ au moment de l'instanciation
|
| 6 |
- l'objet `settings` est importé partout dans l'application
|
| 7 |
- dans les tests : monkeypatch.setattr(config, "settings", ...) pour surcharger
|
| 8 |
"""
|
| 9 |
# 1. stdlib
|
| 10 |
-
import os
|
| 11 |
from pathlib import Path
|
| 12 |
|
| 13 |
# 2. third-party
|
| 14 |
-
from pydantic import
|
|
|
|
| 15 |
|
| 16 |
# Racine du dépôt — résolue depuis l'emplacement absolu de ce fichier.
|
| 17 |
# config.py se trouve dans backend/app/ ; 3 parents remontent à la racine.
|
|
@@ -19,14 +19,17 @@ from pydantic import BaseModel, ConfigDict
|
|
| 19 |
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
|
| 20 |
|
| 21 |
|
| 22 |
-
class Settings(
|
| 23 |
"""Paramètres d'application lus depuis les variables d'environnement.
|
| 24 |
|
| 25 |
Toutes les clés API sont optionnelles (None si non configurées).
|
| 26 |
Elles ne sont jamais loguées ni exportées (R06).
|
| 27 |
"""
|
| 28 |
|
| 29 |
-
model_config = ConfigDict(
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
# ββ Serveur ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 32 |
base_url: str = "http://localhost:8000"
|
|
@@ -50,21 +53,4 @@ class Settings(BaseModel):
|
|
| 50 |
mistral_api_key: str | None = None
|
| 51 |
|
| 52 |
|
| 53 |
-
|
| 54 |
-
"""Lit les variables d'environnement et construit l'objet Settings."""
|
| 55 |
-
return Settings(
|
| 56 |
-
base_url=os.getenv("BASE_URL", "http://localhost:8000"),
|
| 57 |
-
data_dir=Path(os.getenv("DATA_DIR", "data")),
|
| 58 |
-
profiles_dir=Path(os.getenv("PROFILES_DIR", str(_REPO_ROOT / "profiles"))),
|
| 59 |
-
prompts_dir=Path(os.getenv("PROMPTS_DIR", str(_REPO_ROOT / "prompts"))),
|
| 60 |
-
database_url=os.getenv(
|
| 61 |
-
"DATABASE_URL", "sqlite+aiosqlite:///./scriptorium.db"
|
| 62 |
-
),
|
| 63 |
-
google_ai_studio_api_key=os.getenv("GOOGLE_AI_STUDIO_API_KEY"),
|
| 64 |
-
vertex_api_key=os.getenv("VERTEX_API_KEY"),
|
| 65 |
-
vertex_service_account_json=os.getenv("VERTEX_SERVICE_ACCOUNT_JSON"),
|
| 66 |
-
mistral_api_key=os.getenv("MISTRAL_API_KEY"),
|
| 67 |
-
)
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
settings: Settings = _load_settings()
|
|
|
|
| 1 |
"""
|
| 2 |
Configuration globale de la plateforme, chargée depuis les variables d'environnement.
|
| 3 |
|
| 4 |
+
Utilise pydantic-settings (CLAUDE.md §2, §7) :
|
| 5 |
+
- les valeurs sont lues depuis os.environ / fichier .env au moment de l'instanciation
|
| 6 |
- l'objet `settings` est importé partout dans l'application
|
| 7 |
- dans les tests : monkeypatch.setattr(config, "settings", ...) pour surcharger
|
| 8 |
"""
|
| 9 |
# 1. stdlib
|
|
|
|
| 10 |
from pathlib import Path
|
| 11 |
|
| 12 |
# 2. third-party
|
| 13 |
+
from pydantic import ConfigDict
|
| 14 |
+
from pydantic_settings import BaseSettings
|
| 15 |
|
| 16 |
# Racine du dépôt — résolue depuis l'emplacement absolu de ce fichier.
|
| 17 |
# config.py se trouve dans backend/app/ ; 3 parents remontent à la racine.
|
|
|
|
| 19 |
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
|
| 20 |
|
| 21 |
|
| 22 |
+
class Settings(BaseSettings):
|
| 23 |
"""Paramètres d'application lus depuis les variables d'environnement.
|
| 24 |
|
| 25 |
Toutes les clés API sont optionnelles (None si non configurées).
|
| 26 |
Elles ne sont jamais loguées ni exportées (R06).
|
| 27 |
"""
|
| 28 |
|
| 29 |
+
model_config = ConfigDict(
|
| 30 |
+
env_file=".env",
|
| 31 |
+
extra="ignore",
|
| 32 |
+
)
|
| 33 |
|
| 34 |
# ββ Serveur ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 35 |
base_url: str = "http://localhost:8000"
|
|
|
|
| 53 |
mistral_api_key: str | None = None
|
| 54 |
|
| 55 |
|
| 56 |
+
settings: Settings = Settings()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
backend/app/main.py
CHANGED
|
@@ -65,11 +65,11 @@ app = FastAPI(
|
|
| 65 |
lifespan=lifespan,
|
| 66 |
)
|
| 67 |
|
| 68 |
-
# ββ CORS (dev :
|
| 69 |
app.add_middleware(
|
| 70 |
CORSMiddleware,
|
| 71 |
allow_origins=["*"],
|
| 72 |
-
allow_credentials=
|
| 73 |
allow_methods=["*"],
|
| 74 |
allow_headers=["*"],
|
| 75 |
)
|
|
@@ -97,8 +97,9 @@ async def serve_frontend(full_path: str) -> FileResponse | RedirectResponse:
|
|
| 97 |
if full_path.startswith("api/"):
|
| 98 |
raise HTTPException(status_code=404, detail=f"Endpoint not found: /{full_path}")
|
| 99 |
if _STATIC_DIR.is_dir():
|
| 100 |
-
candidate = _STATIC_DIR / full_path
|
| 101 |
-
|
|
|
|
| 102 |
return FileResponse(candidate)
|
| 103 |
index = _STATIC_DIR / "index.html"
|
| 104 |
if index.exists():
|
|
|
|
| 65 |
lifespan=lifespan,
|
| 66 |
)
|
| 67 |
|
| 68 |
+
# ββ CORS (dev : toutes les origines autorisΓ©es, sans credentials) ββββββββββββββ
|
| 69 |
app.add_middleware(
|
| 70 |
CORSMiddleware,
|
| 71 |
allow_origins=["*"],
|
| 72 |
+
allow_credentials=False,
|
| 73 |
allow_methods=["*"],
|
| 74 |
allow_headers=["*"],
|
| 75 |
)
|
|
|
|
| 97 |
if full_path.startswith("api/"):
|
| 98 |
raise HTTPException(status_code=404, detail=f"Endpoint not found: /{full_path}")
|
| 99 |
if _STATIC_DIR.is_dir():
|
| 100 |
+
candidate = (_STATIC_DIR / full_path).resolve()
|
| 101 |
+
# Empêcher le path traversal : le fichier résolu doit être sous _STATIC_DIR
|
| 102 |
+
if candidate.is_file() and str(candidate).startswith(str(_STATIC_DIR.resolve())):
|
| 103 |
return FileResponse(candidate)
|
| 104 |
index = _STATIC_DIR / "index.html"
|
| 105 |
if index.exists():
|
backend/app/models/corpus.py
CHANGED
|
@@ -6,6 +6,7 @@ Ils NE se substituent PAS aux schémas Pydantic (source canonique des types).
|
|
| 6 |
"""
|
| 7 |
# 1. stdlib
|
| 8 |
from datetime import datetime, timezone
|
|
|
|
| 9 |
|
| 10 |
# 2. third-party
|
| 11 |
from sqlalchemy import DateTime, Float, ForeignKey, Integer, String, Text
|
|
@@ -24,8 +25,12 @@ class CorpusModel(Base):
|
|
| 24 |
slug: Mapped[str] = mapped_column(String, unique=True, nullable=False, index=True)
|
| 25 |
title: Mapped[str] = mapped_column(String, nullable=False)
|
| 26 |
profile_id: Mapped[str] = mapped_column(String, nullable=False)
|
| 27 |
-
created_at: Mapped[datetime] = mapped_column(
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
manuscripts: Mapped[list["ManuscriptModel"]] = relationship(
|
| 31 |
back_populates="corpus", cascade="all, delete-orphan"
|
|
|
|
| 6 |
"""
|
| 7 |
# 1. stdlib
|
| 8 |
from datetime import datetime, timezone
|
| 9 |
+
from functools import partial
|
| 10 |
|
| 11 |
# 2. third-party
|
| 12 |
from sqlalchemy import DateTime, Float, ForeignKey, Integer, String, Text
|
|
|
|
| 25 |
slug: Mapped[str] = mapped_column(String, unique=True, nullable=False, index=True)
|
| 26 |
title: Mapped[str] = mapped_column(String, nullable=False)
|
| 27 |
profile_id: Mapped[str] = mapped_column(String, nullable=False)
|
| 28 |
+
created_at: Mapped[datetime] = mapped_column(
|
| 29 |
+
DateTime, nullable=False, default=partial(datetime.now, tz=timezone.utc)
|
| 30 |
+
)
|
| 31 |
+
updated_at: Mapped[datetime] = mapped_column(
|
| 32 |
+
DateTime, nullable=False, default=partial(datetime.now, tz=timezone.utc)
|
| 33 |
+
)
|
| 34 |
|
| 35 |
manuscripts: Mapped[list["ManuscriptModel"]] = relationship(
|
| 36 |
back_populates="corpus", cascade="all, delete-orphan"
|
backend/app/models/job.py
CHANGED
|
@@ -10,7 +10,8 @@ Cycle de vie :
|
|
| 10 |
β failed
|
| 11 |
"""
|
| 12 |
# 1. stdlib
|
| 13 |
-
from datetime import datetime
|
|
|
|
| 14 |
|
| 15 |
# 2. third-party
|
| 16 |
from sqlalchemy import DateTime, ForeignKey, String, Text
|
|
@@ -37,4 +38,6 @@ class JobModel(Base):
|
|
| 37 |
started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
|
| 38 |
finished_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
|
| 39 |
error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
|
| 40 |
-
created_at: Mapped[datetime] = mapped_column(
|
|
|
|
|
|
|
|
|
| 10 |
β failed
|
| 11 |
"""
|
| 12 |
# 1. stdlib
|
| 13 |
+
from datetime import datetime, timezone
|
| 14 |
+
from functools import partial
|
| 15 |
|
| 16 |
# 2. third-party
|
| 17 |
from sqlalchemy import DateTime, ForeignKey, String, Text
|
|
|
|
| 38 |
started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
|
| 39 |
finished_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
|
| 40 |
error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
|
| 41 |
+
created_at: Mapped[datetime] = mapped_column(
|
| 42 |
+
DateTime, nullable=False, default=partial(datetime.now, tz=timezone.utc)
|
| 43 |
+
)
|
backend/app/schemas/page_master.py
CHANGED
|
@@ -29,14 +29,25 @@ class Region(BaseModel):
|
|
| 29 |
|
| 30 |
@field_validator("bbox")
|
| 31 |
@classmethod
|
| 32 |
-
def
|
| 33 |
if any(x < 0 for x in v):
|
| 34 |
-
raise ValueError("bbox
|
| 35 |
if v[2] <= 0 or v[3] <= 0:
|
| 36 |
-
raise ValueError("bbox width
|
| 37 |
return v
|
| 38 |
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
class OCRResult(BaseModel):
|
| 41 |
diplomatic_text: str = ""
|
| 42 |
blocks: list[dict] = []
|
|
@@ -51,6 +62,13 @@ class Translation(BaseModel):
|
|
| 51 |
en: str = ""
|
| 52 |
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
class CommentaryClaim(BaseModel):
|
| 55 |
claim: str
|
| 56 |
evidence_region_ids: list[str] = []
|
|
@@ -64,6 +82,7 @@ class Commentary(BaseModel):
|
|
| 64 |
|
| 65 |
|
| 66 |
class ProcessingInfo(BaseModel):
|
|
|
|
| 67 |
model_id: str
|
| 68 |
model_display_name: str
|
| 69 |
prompt_version: str
|
|
@@ -96,11 +115,11 @@ class PageMaster(BaseModel):
|
|
| 96 |
folio_label: str
|
| 97 |
sequence: int
|
| 98 |
|
| 99 |
-
image:
|
| 100 |
layout: dict
|
| 101 |
ocr: OCRResult | None = None
|
| 102 |
translation: Translation | None = None
|
| 103 |
-
summary:
|
| 104 |
commentary: Commentary | None = None
|
| 105 |
extensions: dict[str, Any] = {}
|
| 106 |
|
|
|
|
| 29 |
|
| 30 |
@field_validator("bbox")
|
| 31 |
@classmethod
|
| 32 |
+
def bbox_must_be_valid(cls, v: list[int]) -> list[int]:
|
| 33 |
if any(x < 0 for x in v):
|
| 34 |
+
raise ValueError("bbox: toutes les valeurs doivent Γͺtre >= 0")
|
| 35 |
if v[2] <= 0 or v[3] <= 0:
|
| 36 |
+
raise ValueError("bbox: width et height doivent Γͺtre > 0")
|
| 37 |
return v
|
| 38 |
|
| 39 |
|
| 40 |
+
class ImageInfo(BaseModel):
|
| 41 |
+
"""MΓ©tadonnΓ©es image β CLAUDE.md Β§4.2."""
|
| 42 |
+
|
| 43 |
+
master: str
|
| 44 |
+
derivative_web: str | None = None
|
| 45 |
+
thumbnail: str | None = None
|
| 46 |
+
iiif_base: str | None = None
|
| 47 |
+
width: int
|
| 48 |
+
height: int
|
| 49 |
+
|
| 50 |
+
|
| 51 |
class OCRResult(BaseModel):
|
| 52 |
diplomatic_text: str = ""
|
| 53 |
blocks: list[dict] = []
|
|
|
|
| 62 |
en: str = ""
|
| 63 |
|
| 64 |
|
| 65 |
+
class Summary(BaseModel):
|
| 66 |
+
"""RΓ©sumΓ© β CLAUDE.md Β§4.2."""
|
| 67 |
+
|
| 68 |
+
short: str = ""
|
| 69 |
+
detailed: str = ""
|
| 70 |
+
|
| 71 |
+
|
| 72 |
class CommentaryClaim(BaseModel):
|
| 73 |
claim: str
|
| 74 |
evidence_region_ids: list[str] = []
|
|
|
|
| 82 |
|
| 83 |
|
| 84 |
class ProcessingInfo(BaseModel):
|
| 85 |
+
provider: str
|
| 86 |
model_id: str
|
| 87 |
model_display_name: str
|
| 88 |
prompt_version: str
|
|
|
|
| 115 |
folio_label: str
|
| 116 |
sequence: int
|
| 117 |
|
| 118 |
+
image: ImageInfo
|
| 119 |
layout: dict
|
| 120 |
ocr: OCRResult | None = None
|
| 121 |
translation: Translation | None = None
|
| 122 |
+
summary: Summary | None = None
|
| 123 |
commentary: Commentary | None = None
|
| 124 |
extensions: dict[str, Any] = {}
|
| 125 |
|
backend/app/services/ai/__init__.py
CHANGED
|
@@ -1,19 +1,31 @@
|
|
| 1 |
"""
|
| 2 |
Services AI — providers Google AI, registre de modèles, et analyse IA.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
"""
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
__all__ = [
|
| 14 |
-
"GoogleAIProvider",
|
| 15 |
-
"VertexAPIKeyProvider",
|
| 16 |
-
"VertexServiceAccountProvider",
|
| 17 |
"list_all_models",
|
| 18 |
"build_model_config",
|
| 19 |
"build_client",
|
|
|
|
| 1 |
"""
|
| 2 |
Services AI — providers Google AI, registre de modèles, et analyse IA.
|
| 3 |
+
|
| 4 |
+
Les imports de providers sont différés (lazy) pour éviter de charger les SDK
|
| 5 |
+
tiers (google-genai, mistralai) au démarrage. Cela permet à l'application
|
| 6 |
+
de fonctionner même si un SDK n'est pas installé.
|
| 7 |
"""
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def __getattr__(name: str):
|
| 11 |
+
"""Import paresseux β les symboles sont rΓ©solus au premier accΓ¨s."""
|
| 12 |
+
_lazy_map = {
|
| 13 |
+
"run_primary_analysis": "app.services.ai.analyzer",
|
| 14 |
+
"build_client": "app.services.ai.client_factory",
|
| 15 |
+
"build_model_config": "app.services.ai.model_registry",
|
| 16 |
+
"list_all_models": "app.services.ai.model_registry",
|
| 17 |
+
"load_and_render_prompt": "app.services.ai.prompt_loader",
|
| 18 |
+
"parse_ai_response": "app.services.ai.response_parser",
|
| 19 |
+
"ParseError": "app.services.ai.response_parser",
|
| 20 |
+
}
|
| 21 |
+
if name in _lazy_map:
|
| 22 |
+
import importlib
|
| 23 |
+
module = importlib.import_module(_lazy_map[name])
|
| 24 |
+
return getattr(module, name)
|
| 25 |
+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
| 26 |
+
|
| 27 |
|
| 28 |
__all__ = [
|
|
|
|
|
|
|
|
|
|
| 29 |
"list_all_models",
|
| 30 |
"build_model_config",
|
| 31 |
"build_client",
|
backend/app/services/ai/analyzer.py
CHANGED
|
@@ -13,8 +13,8 @@ from pathlib import Path
|
|
| 13 |
from app.schemas.corpus_profile import CorpusProfile
|
| 14 |
from app.schemas.image import ImageDerivativeInfo
|
| 15 |
from app.schemas.model_config import ModelConfig
|
| 16 |
-
from app.schemas.page_master import EditorialInfo, EditorialStatus, PageMaster, ProcessingInfo
|
| 17 |
-
from app.services.ai.master_writer import
|
| 18 |
from app.services.ai.model_registry import get_provider
|
| 19 |
from app.services.ai.prompt_loader import load_and_render_prompt
|
| 20 |
from app.services.ai.response_parser import ParseError, parse_ai_response # noqa: F401
|
|
@@ -37,7 +37,7 @@ def run_primary_analysis(
|
|
| 37 |
) -> PageMaster:
|
| 38 |
"""Analyse primaire d'un folio : charge le prompt, appelle l'IA, Γ©crit les fichiers.
|
| 39 |
|
| 40 |
-
Respecte R05 :
|
| 41 |
d'erreur de parsing. master.json n'est écrit QUE si le parsing a réussi.
|
| 42 |
|
| 43 |
Le provider est sélectionné dynamiquement depuis model_config.provider ;
|
|
@@ -57,7 +57,7 @@ def run_primary_analysis(
|
|
| 57 |
project_root: racine du projet (pour résoudre les chemins des prompts).
|
| 58 |
|
| 59 |
Returns:
|
| 60 |
-
PageMaster validé (
|
| 61 |
|
| 62 |
Raises:
|
| 63 |
ParseError: si la réponse IA n'est pas un JSON valide.
|
|
@@ -66,7 +66,7 @@ def run_primary_analysis(
|
|
| 66 |
"""
|
| 67 |
# ββ Chemins de sortie βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 68 |
page_dir = base_data_dir / "corpora" / corpus_slug / "pages" / folio_label
|
| 69 |
-
raw_path = page_dir / "
|
| 70 |
master_path = page_dir / "master.json"
|
| 71 |
|
| 72 |
# ββ 1. Chargement et rendu du prompt (R04) ββββββββββββββββββββββββββββββ
|
|
@@ -76,6 +76,7 @@ def run_primary_analysis(
|
|
| 76 |
context = {
|
| 77 |
"profile_label": corpus_profile.label,
|
| 78 |
"language_hints": ", ".join(corpus_profile.language_hints),
|
|
|
|
| 79 |
"script_type": corpus_profile.script_type.value,
|
| 80 |
}
|
| 81 |
prompt_text = load_and_render_prompt(prompt_abs_path, context)
|
|
@@ -85,7 +86,12 @@ def run_primary_analysis(
|
|
| 85 |
)
|
| 86 |
|
| 87 |
# ββ 2. Chargement de l'image dΓ©rivΓ©e ββββββββββββββββββββββββββββββββββββ
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
# ββ 3. Appel IA via le provider sΓ©lectionnΓ© βββββββββββββββββββββββββββββ
|
| 91 |
provider = get_provider(model_config.provider)
|
|
@@ -104,8 +110,8 @@ def run_primary_analysis(
|
|
| 104 |
model_id=model_config.selected_model_id,
|
| 105 |
)
|
| 106 |
|
| 107 |
-
# ββ 4. Γcriture
|
| 108 |
-
|
| 109 |
|
| 110 |
# ββ 5. Parsing + validation (ParseError si JSON invalide) βββββββββββββββ
|
| 111 |
layout, ocr = parse_ai_response(raw_text)
|
|
@@ -118,16 +124,17 @@ def run_primary_analysis(
|
|
| 118 |
manuscript_id=manuscript_id,
|
| 119 |
folio_label=folio_label,
|
| 120 |
sequence=sequence,
|
| 121 |
-
image=
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
layout=layout,
|
| 129 |
ocr=ocr,
|
| 130 |
processing=ProcessingInfo(
|
|
|
|
| 131 |
model_id=model_config.selected_model_id,
|
| 132 |
model_display_name=model_config.selected_model_display_name,
|
| 133 |
prompt_version=prompt_rel_path,
|
|
|
|
| 13 |
from app.schemas.corpus_profile import CorpusProfile
|
| 14 |
from app.schemas.image import ImageDerivativeInfo
|
| 15 |
from app.schemas.model_config import ModelConfig
|
| 16 |
+
from app.schemas.page_master import EditorialInfo, EditorialStatus, ImageInfo, PageMaster, ProcessingInfo
|
| 17 |
+
from app.services.ai.master_writer import write_ai_raw, write_master_json
|
| 18 |
from app.services.ai.model_registry import get_provider
|
| 19 |
from app.services.ai.prompt_loader import load_and_render_prompt
|
| 20 |
from app.services.ai.response_parser import ParseError, parse_ai_response # noqa: F401
|
|
|
|
| 37 |
) -> PageMaster:
|
| 38 |
"""Analyse primaire d'un folio : charge le prompt, appelle l'IA, Γ©crit les fichiers.
|
| 39 |
|
| 40 |
+
Respecte R05 : ai_raw.json est toujours écrit en premier, même en cas
|
| 41 |
d'erreur de parsing. master.json n'est écrit QUE si le parsing a réussi.
|
| 42 |
|
| 43 |
Le provider est sélectionné dynamiquement depuis model_config.provider ;
|
|
|
|
| 57 |
project_root: racine du projet (pour résoudre les chemins des prompts).
|
| 58 |
|
| 59 |
Returns:
|
| 60 |
+
PageMaster validé (ai_raw.json et master.json écrits sur disque).
|
| 61 |
|
| 62 |
Raises:
|
| 63 |
ParseError: si la réponse IA n'est pas un JSON valide.
|
|
|
|
| 66 |
"""
|
| 67 |
# ββ Chemins de sortie βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 68 |
page_dir = base_data_dir / "corpora" / corpus_slug / "pages" / folio_label
|
| 69 |
+
raw_path = page_dir / "ai_raw.json"
|
| 70 |
master_path = page_dir / "master.json"
|
| 71 |
|
| 72 |
# ββ 1. Chargement et rendu du prompt (R04) ββββββββββββββββββββββββββββββ
|
|
|
|
| 76 |
context = {
|
| 77 |
"profile_label": corpus_profile.label,
|
| 78 |
"language_hints": ", ".join(corpus_profile.language_hints),
|
| 79 |
+
"primary_language": corpus_profile.language_hints[0] if corpus_profile.language_hints else "la",
|
| 80 |
"script_type": corpus_profile.script_type.value,
|
| 81 |
}
|
| 82 |
prompt_text = load_and_render_prompt(prompt_abs_path, context)
|
|
|
|
| 86 |
)
|
| 87 |
|
| 88 |
# ββ 2. Chargement de l'image dΓ©rivΓ©e ββββββββββββββββββββββββββββββββββββ
|
| 89 |
+
if not derivative_image_path.exists():
|
| 90 |
+
raise FileNotFoundError(f"Image dΓ©rivΓ©e introuvable : {derivative_image_path}")
|
| 91 |
+
try:
|
| 92 |
+
jpeg_bytes = derivative_image_path.read_bytes()
|
| 93 |
+
except OSError as exc:
|
| 94 |
+
raise RuntimeError(f"Erreur lecture image {derivative_image_path} : {exc}") from exc
|
| 95 |
|
| 96 |
# ββ 3. Appel IA via le provider sΓ©lectionnΓ© βββββββββββββββββββββββββββββ
|
| 97 |
provider = get_provider(model_config.provider)
|
|
|
|
| 110 |
model_id=model_config.selected_model_id,
|
| 111 |
)
|
| 112 |
|
| 113 |
+
# ββ 4. Γcriture ai_raw.json TOUJOURS EN PREMIER (R05) βββββββββββββββββ
|
| 114 |
+
write_ai_raw(raw_text, raw_path)
|
| 115 |
|
| 116 |
# ββ 5. Parsing + validation (ParseError si JSON invalide) βββββββββββββββ
|
| 117 |
layout, ocr = parse_ai_response(raw_text)
|
|
|
|
| 124 |
manuscript_id=manuscript_id,
|
| 125 |
folio_label=folio_label,
|
| 126 |
sequence=sequence,
|
| 127 |
+
image=ImageInfo(
|
| 128 |
+
master=image_info.original_url,
|
| 129 |
+
derivative_web=image_info.derivative_path,
|
| 130 |
+
thumbnail=image_info.thumbnail_path,
|
| 131 |
+
width=image_info.derivative_width,
|
| 132 |
+
height=image_info.derivative_height,
|
| 133 |
+
),
|
| 134 |
layout=layout,
|
| 135 |
ocr=ocr,
|
| 136 |
processing=ProcessingInfo(
|
| 137 |
+
provider=model_config.provider.value if hasattr(model_config.provider, "value") else str(model_config.provider),
|
| 138 |
model_id=model_config.selected_model_id,
|
| 139 |
model_display_name=model_config.selected_model_display_name,
|
| 140 |
prompt_version=prompt_rel_path,
|
backend/app/services/ai/master_writer.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
"""
|
| 2 |
-
Écriture des fichiers
|
| 3 |
|
| 4 |
Règle R05 non négociable :
|
| 5 |
-
1.
|
| 6 |
2. master.json n'est écrit QUE si le parsing et la validation Pydantic ont réussi.
|
| 7 |
"""
|
| 8 |
# 1. stdlib
|
|
@@ -16,24 +16,24 @@ from app.schemas.page_master import PageMaster
|
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
|
| 19 |
-
def
|
| 20 |
-
"""Γcrit la rΓ©ponse brute de l'IA dans
|
| 21 |
|
| 22 |
Toujours appelé AVANT toute tentative de parsing.
|
| 23 |
Le contenu est enveloppé dans un objet JSON pour garantir un fichier valide,
|
| 24 |
même si la réponse IA n'est pas du JSON.
|
| 25 |
-
|
| 26 |
-
Args:
|
| 27 |
-
raw_text: texte brut retourné par l'API Google AI.
|
| 28 |
-
output_path: chemin complet du fichier de sortie (gemini_raw.json).
|
| 29 |
"""
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
def write_master_json(page_master: PageMaster, output_path: Path) -> None:
|
|
@@ -41,14 +41,14 @@ def write_master_json(page_master: PageMaster, output_path: Path) -> None:
|
|
| 41 |
|
| 42 |
N'est appelé QUE si le parsing et la validation Pydantic ont réussi.
|
| 43 |
Crée les dossiers parents si nécessaire.
|
| 44 |
-
|
| 45 |
-
Args:
|
| 46 |
-
page_master: instance PageMaster validée par Pydantic.
|
| 47 |
-
output_path: chemin complet du fichier de sortie (master.json).
|
| 48 |
"""
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
logger.info("master.json Γ©crit", extra={"path": str(output_path)})
|
|
|
|
| 1 |
"""
|
| 2 |
+
Écriture des fichiers ai_raw.json et master.json (R02, R05).
|
| 3 |
|
| 4 |
Règle R05 non négociable :
|
| 5 |
+
1. ai_raw.json est TOUJOURS écrit en premier.
|
| 6 |
2. master.json n'est écrit QUE si le parsing et la validation Pydantic ont réussi.
|
| 7 |
"""
|
| 8 |
# 1. stdlib
|
|
|
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
| 18 |
|
| 19 |
+
def write_ai_raw(raw_text: str, output_path: Path) -> None:
|
| 20 |
+
"""Γcrit la rΓ©ponse brute de l'IA dans ai_raw.json (R05).
|
| 21 |
|
| 22 |
Toujours appelé AVANT toute tentative de parsing.
|
| 23 |
Le contenu est enveloppé dans un objet JSON pour garantir un fichier valide,
|
| 24 |
même si la réponse IA n'est pas du JSON.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"""
|
| 26 |
+
try:
|
| 27 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 28 |
+
payload = {"response_text": raw_text}
|
| 29 |
+
output_path.write_text(
|
| 30 |
+
json.dumps(payload, ensure_ascii=False, indent=2),
|
| 31 |
+
encoding="utf-8",
|
| 32 |
+
)
|
| 33 |
+
except OSError as exc:
|
| 34 |
+
logger.error("Γcriture ai_raw.json Γ©chouΓ©e", extra={"path": str(output_path), "error": str(exc)})
|
| 35 |
+
raise
|
| 36 |
+
logger.info("ai_raw.json Γ©crit", extra={"path": str(output_path)})
|
| 37 |
|
| 38 |
|
| 39 |
def write_master_json(page_master: PageMaster, output_path: Path) -> None:
|
|
|
|
| 41 |
|
| 42 |
N'est appelé QUE si le parsing et la validation Pydantic ont réussi.
|
| 43 |
Crée les dossiers parents si nécessaire.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
"""
|
| 45 |
+
try:
|
| 46 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 47 |
+
output_path.write_text(
|
| 48 |
+
page_master.model_dump_json(indent=2),
|
| 49 |
+
encoding="utf-8",
|
| 50 |
+
)
|
| 51 |
+
except OSError as exc:
|
| 52 |
+
logger.error("Γcriture master.json Γ©chouΓ©e", extra={"path": str(output_path), "error": str(exc)})
|
| 53 |
+
raise
|
| 54 |
logger.info("master.json Γ©crit", extra={"path": str(output_path)})
|
backend/app/services/ai/model_registry.py
CHANGED
|
@@ -1,5 +1,8 @@
|
|
| 1 |
"""
|
| 2 |
Registre agrégé des modèles disponibles tous providers confondus.
|
|
|
|
|
|
|
|
|
|
| 3 |
"""
|
| 4 |
# 1. stdlib
|
| 5 |
import logging
|
|
@@ -8,10 +11,6 @@ from datetime import datetime, timezone
|
|
| 8 |
# 2. local
|
| 9 |
from app.schemas.model_config import ModelConfig, ModelInfo, ProviderType
|
| 10 |
from app.services.ai.base import AIProvider
|
| 11 |
-
from app.services.ai.provider_google_ai import GoogleAIProvider
|
| 12 |
-
from app.services.ai.provider_mistral import MistralProvider
|
| 13 |
-
from app.services.ai.provider_vertex_key import VertexAPIKeyProvider
|
| 14 |
-
from app.services.ai.provider_vertex_sa import VertexServiceAccountProvider
|
| 15 |
|
| 16 |
logger = logging.getLogger(__name__)
|
| 17 |
|
|
@@ -24,13 +23,27 @@ _PROVIDER_DISPLAY_NAMES: dict[ProviderType, str] = {
|
|
| 24 |
}
|
| 25 |
|
| 26 |
|
|
|
|
|
|
|
|
|
|
| 27 |
def _build_providers() -> list[AIProvider]:
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
GoogleAIProvider(),
|
| 30 |
VertexAPIKeyProvider(),
|
| 31 |
VertexServiceAccountProvider(),
|
| 32 |
MistralProvider(),
|
| 33 |
]
|
|
|
|
| 34 |
|
| 35 |
|
| 36 |
def get_available_providers() -> list[dict]:
|
|
|
|
| 1 |
"""
|
| 2 |
Registre agrégé des modèles disponibles tous providers confondus.
|
| 3 |
+
|
| 4 |
+
Les imports de providers sont différés dans _build_providers() pour éviter
|
| 5 |
+
de charger les SDK tiers (google-genai, mistralai) au niveau module.
|
| 6 |
"""
|
| 7 |
# 1. stdlib
|
| 8 |
import logging
|
|
|
|
| 11 |
# 2. local
|
| 12 |
from app.schemas.model_config import ModelConfig, ModelInfo, ProviderType
|
| 13 |
from app.services.ai.base import AIProvider
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
|
|
|
| 23 |
}
|
| 24 |
|
| 25 |
|
| 26 |
+
_cached_providers: list[AIProvider] | None = None
|
| 27 |
+
|
| 28 |
+
|
| 29 |
def _build_providers() -> list[AIProvider]:
|
| 30 |
+
"""Construit la liste des providers β imports diffΓ©rΓ©s, rΓ©sultat mis en cache."""
|
| 31 |
+
global _cached_providers
|
| 32 |
+
if _cached_providers is not None:
|
| 33 |
+
return _cached_providers
|
| 34 |
+
|
| 35 |
+
from app.services.ai.provider_google_ai import GoogleAIProvider
|
| 36 |
+
from app.services.ai.provider_mistral import MistralProvider
|
| 37 |
+
from app.services.ai.provider_vertex_key import VertexAPIKeyProvider
|
| 38 |
+
from app.services.ai.provider_vertex_sa import VertexServiceAccountProvider
|
| 39 |
+
|
| 40 |
+
_cached_providers = [
|
| 41 |
GoogleAIProvider(),
|
| 42 |
VertexAPIKeyProvider(),
|
| 43 |
VertexServiceAccountProvider(),
|
| 44 |
MistralProvider(),
|
| 45 |
]
|
| 46 |
+
return _cached_providers
|
| 47 |
|
| 48 |
|
| 49 |
def get_available_providers() -> list[dict]:
|
backend/app/services/ai/prompt_loader.py
CHANGED
|
@@ -6,6 +6,7 @@ Le code charge le fichier, substitue les variables {{nom}}, envoie à l'API.
|
|
| 6 |
"""
|
| 7 |
# 1. stdlib
|
| 8 |
import logging
|
|
|
|
| 9 |
from pathlib import Path
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
|
@@ -38,6 +39,11 @@ def load_and_render_prompt(template_path: str | Path, context: dict[str, str]) -
|
|
| 38 |
for key, value in context.items():
|
| 39 |
rendered = rendered.replace("{{" + key + "}}", value)
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
logger.debug(
|
| 42 |
"Prompt chargΓ© et rendu",
|
| 43 |
extra={"template": str(path), "variables": list(context.keys())},
|
|
|
|
| 6 |
"""
|
| 7 |
# 1. stdlib
|
| 8 |
import logging
|
| 9 |
+
import re
|
| 10 |
from pathlib import Path
|
| 11 |
|
| 12 |
logger = logging.getLogger(__name__)
|
|
|
|
| 39 |
for key, value in context.items():
|
| 40 |
rendered = rendered.replace("{{" + key + "}}", value)
|
| 41 |
|
| 42 |
+
# Vérifier qu'il ne reste pas de variables non résolues (CLAUDE.md §8)
|
| 43 |
+
unresolved = re.findall(r"\{\{\w+\}\}", rendered)
|
| 44 |
+
if unresolved:
|
| 45 |
+
raise ValueError(f"Variables non rΓ©solues dans le prompt : {unresolved}")
|
| 46 |
+
|
| 47 |
logger.debug(
|
| 48 |
"Prompt chargΓ© et rendu",
|
| 49 |
extra={"template": str(path), "variables": list(context.keys())},
|
backend/app/services/ai/provider_google_ai.py
CHANGED
|
@@ -60,8 +60,15 @@ class GoogleAIProvider(AIProvider):
|
|
| 60 |
raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
|
| 61 |
client = genai.Client(api_key=os.environ[_ENV_KEY])
|
| 62 |
image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
return response.text or ""
|
|
|
|
| 60 |
raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
|
| 61 |
client = genai.Client(api_key=os.environ[_ENV_KEY])
|
| 62 |
image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
|
| 63 |
+
try:
|
| 64 |
+
response = client.models.generate_content(
|
| 65 |
+
model=model_id,
|
| 66 |
+
contents=[image_part, prompt],
|
| 67 |
+
)
|
| 68 |
+
except Exception as exc:
|
| 69 |
+
logger.error(
|
| 70 |
+
"Appel API Google AI Studio Γ©chouΓ©",
|
| 71 |
+
extra={"model": model_id, "error": str(exc)},
|
| 72 |
+
)
|
| 73 |
+
raise RuntimeError(f"Erreur API Google AI Studio ({model_id}) : {exc}") from exc
|
| 74 |
return response.text or ""
|
backend/app/services/ai/provider_mistral.py
CHANGED
|
@@ -208,10 +208,14 @@ class MistralProvider(AIProvider):
|
|
| 208 |
# ββ Chemin 1 : OCR dΓ©diΓ© βββββββββββββββββββββββββββββββββββββββββββββ
|
| 209 |
if _is_ocr_model(model_id):
|
| 210 |
logger.info("Mistral OCR : endpoint dΓ©diΓ© client.ocr.process()", extra={"model": model_id})
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
# OCRResponse.pages : list[OCRPageObject], chacun avec .markdown
|
| 216 |
pages = getattr(response, "pages", []) or []
|
| 217 |
return "\n\n".join(
|
|
@@ -233,10 +237,14 @@ class MistralProvider(AIProvider):
|
|
| 233 |
)
|
| 234 |
content = prompt
|
| 235 |
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
choices = response.choices or []
|
| 241 |
if not choices:
|
| 242 |
return ""
|
|
|
|
| 208 |
# ββ Chemin 1 : OCR dΓ©diΓ© βββββββββββββββββββββββββββββββββββββββββββββ
|
| 209 |
if _is_ocr_model(model_id):
|
| 210 |
logger.info("Mistral OCR : endpoint dΓ©diΓ© client.ocr.process()", extra={"model": model_id})
|
| 211 |
+
try:
|
| 212 |
+
response = client.ocr.process(
|
| 213 |
+
model=model_id,
|
| 214 |
+
document={"type": "image_url", "image_url": {"url": data_url}},
|
| 215 |
+
)
|
| 216 |
+
except Exception as exc:
|
| 217 |
+
logger.error("Appel Mistral OCR Γ©chouΓ©", extra={"model": model_id, "error": str(exc)})
|
| 218 |
+
raise RuntimeError(f"Erreur API Mistral OCR ({model_id}) : {exc}") from exc
|
| 219 |
# OCRResponse.pages : list[OCRPageObject], chacun avec .markdown
|
| 220 |
pages = getattr(response, "pages", []) or []
|
| 221 |
return "\n\n".join(
|
|
|
|
| 237 |
)
|
| 238 |
content = prompt
|
| 239 |
|
| 240 |
+
try:
|
| 241 |
+
response = client.chat.complete(
|
| 242 |
+
model=model_id,
|
| 243 |
+
messages=[{"role": "user", "content": content}],
|
| 244 |
+
)
|
| 245 |
+
except Exception as exc:
|
| 246 |
+
logger.error("Appel Mistral chat Γ©chouΓ©", extra={"model": model_id, "error": str(exc)})
|
| 247 |
+
raise RuntimeError(f"Erreur API Mistral ({model_id}) : {exc}") from exc
|
| 248 |
choices = response.choices or []
|
| 249 |
if not choices:
|
| 250 |
return ""
|
backend/app/services/ai/provider_vertex_key.py
CHANGED
|
@@ -22,9 +22,6 @@ retourne toujours False afin d'éviter des appels réseau voués à l'échec.
|
|
| 22 |
import logging
|
| 23 |
import os
|
| 24 |
|
| 25 |
-
# 2. third-party
|
| 26 |
-
from google.genai import types # noqa: F401 (conservΓ© pour import cohΓ©rence)
|
| 27 |
-
|
| 28 |
# 3. local
|
| 29 |
from app.schemas.model_config import ModelInfo, ProviderType
|
| 30 |
from app.services.ai.base import AIProvider
|
|
|
|
| 22 |
import logging
|
| 23 |
import os
|
| 24 |
|
|
|
|
|
|
|
|
|
|
| 25 |
# 3. local
|
| 26 |
from app.schemas.model_config import ModelInfo, ProviderType
|
| 27 |
from app.services.ai.base import AIProvider
|
backend/app/services/ai/provider_vertex_sa.py
CHANGED
|
@@ -90,8 +90,15 @@ class VertexServiceAccountProvider(AIProvider):
|
|
| 90 |
raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
|
| 91 |
client = self._build_client()
|
| 92 |
image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
return response.text or ""
|
|
|
|
| 90 |
raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
|
| 91 |
client = self._build_client()
|
| 92 |
image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
|
| 93 |
+
try:
|
| 94 |
+
response = client.models.generate_content(
|
| 95 |
+
model=model_id,
|
| 96 |
+
contents=[image_part, prompt],
|
| 97 |
+
)
|
| 98 |
+
except Exception as exc:
|
| 99 |
+
logger.error(
|
| 100 |
+
"Appel API Vertex AI Γ©chouΓ©",
|
| 101 |
+
extra={"model": model_id, "error": str(exc)},
|
| 102 |
+
)
|
| 103 |
+
raise RuntimeError(f"Erreur API Vertex AI ({model_id}) : {exc}") from exc
|
| 104 |
return response.text or ""
|
backend/app/services/corpus_runner.py
CHANGED
|
@@ -17,7 +17,6 @@ from sqlalchemy import select
|
|
| 17 |
# 3. local
|
| 18 |
from app.models.database import async_session_factory
|
| 19 |
from app.models.job import JobModel
|
| 20 |
-
from app.services.job_runner import execute_page_job
|
| 21 |
|
| 22 |
logger = logging.getLogger(__name__)
|
| 23 |
|
|
@@ -54,6 +53,8 @@ async def execute_corpus_job(corpus_id: str) -> dict:
|
|
| 54 |
)
|
| 55 |
|
| 56 |
# Exécution séquentielle — chaque job gère sa propre session
|
|
|
|
|
|
|
| 57 |
for job_id in job_ids:
|
| 58 |
await execute_page_job(job_id)
|
| 59 |
|
|
|
|
| 17 |
# 3. local
|
| 18 |
from app.models.database import async_session_factory
|
| 19 |
from app.models.job import JobModel
|
|
|
|
| 20 |
|
| 21 |
logger = logging.getLogger(__name__)
|
| 22 |
|
|
|
|
| 53 |
)
|
| 54 |
|
| 55 |
# Exécution séquentielle — chaque job gère sa propre session
|
| 56 |
+
from app.services.job_runner import execute_page_job
|
| 57 |
+
|
| 58 |
for job_id in job_ids:
|
| 59 |
await execute_page_job(job_id)
|
| 60 |
|
backend/app/services/export/alto.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
Générateur ALTO v4 depuis un PageMaster validé (R02).
|
| 3 |
|
| 4 |
-
Source canonique : PageMaster uniquement β jamais la rΓ©ponse brute
|
| 5 |
bbox [x, y, width, height] β HPOS / VPOS / WIDTH / HEIGHT (correspondance directe).
|
| 6 |
|
| 7 |
Mapping RegionType β Γ©lΓ©ment ALTO :
|
|
@@ -82,7 +82,7 @@ def _build_text_block(
|
|
| 82 |
text = fallback_text
|
| 83 |
|
| 84 |
if not text:
|
| 85 |
-
return # TextBlock
|
| 86 |
|
| 87 |
x, y, w, h = region.bbox
|
| 88 |
line_el = etree.SubElement(
|
|
@@ -160,11 +160,7 @@ def generate_alto(master: PageMaster) -> str:
|
|
| 160 |
etree.SubElement(desc, _a("MeasurementUnit")).text = "pixel"
|
| 161 |
|
| 162 |
src_info = etree.SubElement(desc, _a("sourceImageInformation"))
|
| 163 |
-
file_name =
|
| 164 |
-
master.image.get("original_url")
|
| 165 |
-
or master.image.get("derivative_web")
|
| 166 |
-
or master.page_id
|
| 167 |
-
)
|
| 168 |
etree.SubElement(src_info, _a("fileName")).text = str(file_name)
|
| 169 |
|
| 170 |
if master.processing:
|
|
@@ -185,8 +181,8 @@ def generate_alto(master: PageMaster) -> str:
|
|
| 185 |
# ββ Layout βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 186 |
layout_el = etree.SubElement(root, _a("Layout"))
|
| 187 |
|
| 188 |
-
width =
|
| 189 |
-
height =
|
| 190 |
|
| 191 |
page_id_safe = master.page_id.replace(" ", "_")
|
| 192 |
page_el = etree.SubElement(
|
|
|
|
| 1 |
"""
|
| 2 |
Générateur ALTO v4 depuis un PageMaster validé (R02).
|
| 3 |
|
| 4 |
+
Source canonique : PageMaster uniquement — jamais la réponse brute ai_raw.json.
|
| 5 |
bbox [x, y, width, height] → HPOS / VPOS / WIDTH / HEIGHT (correspondance directe).
|
| 6 |
|
| 7 |
Mapping RegionType → élément ALTO :
|
|
|
|
| 82 |
text = fallback_text
|
| 83 |
|
| 84 |
if not text:
|
| 85 |
+
return # TextBlock sans TextLine β valide ALTO, rΓ©gion visible dans le layout
|
| 86 |
|
| 87 |
x, y, w, h = region.bbox
|
| 88 |
line_el = etree.SubElement(
|
|
|
|
| 160 |
etree.SubElement(desc, _a("MeasurementUnit")).text = "pixel"
|
| 161 |
|
| 162 |
src_info = etree.SubElement(desc, _a("sourceImageInformation"))
|
| 163 |
+
file_name = master.image.master or master.image.derivative_web or master.page_id
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
etree.SubElement(src_info, _a("fileName")).text = str(file_name)
|
| 165 |
|
| 166 |
if master.processing:
|
|
|
|
| 181 |
# ββ Layout βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 182 |
layout_el = etree.SubElement(root, _a("Layout"))
|
| 183 |
|
| 184 |
+
width = master.image.width
|
| 185 |
+
height = master.image.height
|
| 186 |
|
| 187 |
page_id_safe = master.page_id.replace(" ", "_")
|
| 188 |
page_el = etree.SubElement(
|
backend/app/services/export/iiif.py
CHANGED
|
@@ -74,7 +74,7 @@ def generate_manifest(
|
|
| 74 |
|
| 75 |
manuscript_id = manuscript_meta["manuscript_id"]
|
| 76 |
label = manuscript_meta["label"]
|
| 77 |
-
language = manuscript_meta.get("language") or "
|
| 78 |
|
| 79 |
# Pages dans l'ordre de séquence (règle absolue — structMap PHYSICAL)
|
| 80 |
pages = sorted(masters, key=lambda m: m.sequence)
|
|
@@ -102,17 +102,17 @@ def generate_manifest(
|
|
| 102 |
canvas_id = (
|
| 103 |
f"{base_url}/api/v1/manuscripts/{manuscript_id}/canvas/{page.page_id}"
|
| 104 |
)
|
| 105 |
-
width =
|
| 106 |
-
height =
|
| 107 |
|
| 108 |
annotation_page_id = f"{canvas_id}/annotation-page/1"
|
| 109 |
annotation_id = f"{canvas_id}/annotation/painting"
|
| 110 |
-
image_url = page.image.
|
| 111 |
|
| 112 |
canvas: dict = {
|
| 113 |
"id": canvas_id,
|
| 114 |
"type": "Canvas",
|
| 115 |
-
"label": {
|
| 116 |
"width": width,
|
| 117 |
"height": height,
|
| 118 |
"items": [
|
|
|
|
| 74 |
|
| 75 |
manuscript_id = manuscript_meta["manuscript_id"]
|
| 76 |
label = manuscript_meta["label"]
|
| 77 |
+
language = manuscript_meta.get("language") or "en"
|
| 78 |
|
| 79 |
# Pages dans l'ordre de séquence (règle absolue — structMap PHYSICAL)
|
| 80 |
pages = sorted(masters, key=lambda m: m.sequence)
|
|
|
|
| 102 |
canvas_id = (
|
| 103 |
f"{base_url}/api/v1/manuscripts/{manuscript_id}/canvas/{page.page_id}"
|
| 104 |
)
|
| 105 |
+
width = page.image.width
|
| 106 |
+
height = page.image.height
|
| 107 |
|
| 108 |
annotation_page_id = f"{canvas_id}/annotation-page/1"
|
| 109 |
annotation_id = f"{canvas_id}/annotation/painting"
|
| 110 |
+
image_url = page.image.master or ""
|
| 111 |
|
| 112 |
canvas: dict = {
|
| 113 |
"id": canvas_id,
|
| 114 |
"type": "Canvas",
|
| 115 |
+
"label": {language: [f"Folio {page.folio_label}"]},
|
| 116 |
"width": width,
|
| 117 |
"height": height,
|
| 118 |
"items": [
|
backend/app/services/export/mets.py
CHANGED
|
@@ -182,7 +182,7 @@ def generate_mets(
|
|
| 182 |
f_master = _el(grp_master, f"{_M}file", {"ID": f"IMG_MASTER_{sid}", "MIMETYPE": "image/jpeg"})
|
| 183 |
_el(f_master, f"{_M}FLocat", {
|
| 184 |
"LOCTYPE": "URL",
|
| 185 |
-
f"{_XL}href": page.image.
|
| 186 |
f"{_XL}type": "simple",
|
| 187 |
})
|
| 188 |
|
|
@@ -191,12 +191,17 @@ def generate_mets(
|
|
| 191 |
_el(f_deriv, f"{_M}FLocat", {
|
| 192 |
"LOCTYPE": "OTHER",
|
| 193 |
"OTHERLOCTYPE": "filepath",
|
| 194 |
-
f"{_XL}href": page.image.
|
| 195 |
f"{_XL}type": "simple",
|
| 196 |
})
|
| 197 |
|
| 198 |
-
# ALTO
|
| 199 |
alto_p = _alto_path(corpus_slug, page.folio_label, base_data_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
f_alto = _el(grp_alto, f"{_M}file", {"ID": f"ALTO_{sid}", "MIMETYPE": "text/xml"})
|
| 201 |
_el(f_alto, f"{_M}FLocat", {
|
| 202 |
"LOCTYPE": "OTHER",
|
|
|
|
| 182 |
f_master = _el(grp_master, f"{_M}file", {"ID": f"IMG_MASTER_{sid}", "MIMETYPE": "image/jpeg"})
|
| 183 |
_el(f_master, f"{_M}FLocat", {
|
| 184 |
"LOCTYPE": "URL",
|
| 185 |
+
f"{_XL}href": page.image.master or "",
|
| 186 |
f"{_XL}type": "simple",
|
| 187 |
})
|
| 188 |
|
|
|
|
| 191 |
_el(f_deriv, f"{_M}FLocat", {
|
| 192 |
"LOCTYPE": "OTHER",
|
| 193 |
"OTHERLOCTYPE": "filepath",
|
| 194 |
+
f"{_XL}href": page.image.derivative_web or "",
|
| 195 |
f"{_XL}type": "simple",
|
| 196 |
})
|
| 197 |
|
| 198 |
+
# ALTO (référence conditionnelle — warning si le fichier n'existe pas encore)
|
| 199 |
alto_p = _alto_path(corpus_slug, page.folio_label, base_data_dir)
|
| 200 |
+
if not Path(alto_p).exists():
|
| 201 |
+
logger.warning(
|
| 202 |
+
"Fichier ALTO absent β la rΓ©fΓ©rence METS sera cassΓ©e tant que l'ALTO n'est pas gΓ©nΓ©rΓ©",
|
| 203 |
+
extra={"alto_path": alto_p, "page_id": page.page_id},
|
| 204 |
+
)
|
| 205 |
f_alto = _el(grp_alto, f"{_M}file", {"ID": f"ALTO_{sid}", "MIMETYPE": "text/xml"})
|
| 206 |
_el(f_alto, f"{_M}FLocat", {
|
| 207 |
"LOCTYPE": "OTHER",
|
backend/app/services/ingest/iiif_fetcher.py
CHANGED
|
@@ -17,7 +17,6 @@ _HEADERS = {
|
|
| 17 |
"+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
|
| 18 |
),
|
| 19 |
"Accept": "image/jpeg,image/png,image/*,*/*",
|
| 20 |
-
"Referer": "https://gallica.bnf.fr/",
|
| 21 |
}
|
| 22 |
|
| 23 |
|
|
|
|
| 17 |
"+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
|
| 18 |
),
|
| 19 |
"Accept": "image/jpeg,image/png,image/*,*/*",
|
|
|
|
| 20 |
}
|
| 21 |
|
| 22 |
|
backend/app/services/job_runner.py
CHANGED
|
@@ -33,8 +33,6 @@ from app.models.job import JobModel
|
|
| 33 |
from app.models.model_config_db import ModelConfigDB
|
| 34 |
from app.schemas.corpus_profile import CorpusProfile
|
| 35 |
from app.schemas.model_config import ModelConfig, ProviderType
|
| 36 |
-
from app.services.ai.analyzer import run_primary_analysis
|
| 37 |
-
from app.services.export.alto import generate_alto, write_alto
|
| 38 |
from app.services.image.normalizer import create_derivatives, fetch_and_normalize
|
| 39 |
|
| 40 |
logger = logging.getLogger(__name__)
|
|
@@ -148,6 +146,8 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
|
|
| 148 |
)
|
| 149 |
|
| 150 |
# ββ 6. Analyse primaire IA (R05 : double stockage) βββββββββββββββββββ
|
|
|
|
|
|
|
| 151 |
page_master = run_primary_analysis(
|
| 152 |
derivative_image_path=Path(image_info.derivative_path),
|
| 153 |
corpus_profile=corpus_profile,
|
|
@@ -163,6 +163,8 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
|
|
| 163 |
)
|
| 164 |
|
| 165 |
# ββ 7. GΓ©nΓ©rer et Γ©crire l'ALTO XML ββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
| 166 |
alto_xml = generate_alto(page_master)
|
| 167 |
alto_path = (
|
| 168 |
data_dir
|
|
|
|
| 33 |
from app.models.model_config_db import ModelConfigDB
|
| 34 |
from app.schemas.corpus_profile import CorpusProfile
|
| 35 |
from app.schemas.model_config import ModelConfig, ProviderType
|
|
|
|
|
|
|
| 36 |
from app.services.image.normalizer import create_derivatives, fetch_and_normalize
|
| 37 |
|
| 38 |
logger = logging.getLogger(__name__)
|
|
|
|
| 146 |
)
|
| 147 |
|
| 148 |
# ββ 6. Analyse primaire IA (R05 : double stockage) βββββββββββββββββββ
|
| 149 |
+
from app.services.ai.analyzer import run_primary_analysis
|
| 150 |
+
|
| 151 |
page_master = run_primary_analysis(
|
| 152 |
derivative_image_path=Path(image_info.derivative_path),
|
| 153 |
corpus_profile=corpus_profile,
|
|
|
|
| 163 |
)
|
| 164 |
|
| 165 |
# ββ 7. GΓ©nΓ©rer et Γ©crire l'ALTO XML ββββββββββββββββββββββββββββββββββ
|
| 166 |
+
from app.services.export.alto import generate_alto, write_alto
|
| 167 |
+
|
| 168 |
alto_xml = generate_alto(page_master)
|
| 169 |
alto_path = (
|
| 170 |
data_dir
|
backend/pyproject.toml
CHANGED
|
@@ -11,6 +11,7 @@ dependencies = [
|
|
| 11 |
"fastapi>=0.111",
|
| 12 |
"uvicorn[standard]>=0.29",
|
| 13 |
"pydantic>=2.7",
|
|
|
|
| 14 |
"sqlalchemy>=2.0",
|
| 15 |
"aiosqlite>=0.20",
|
| 16 |
"google-genai>=1.0",
|
|
|
|
| 11 |
"fastapi>=0.111",
|
| 12 |
"uvicorn[standard]>=0.29",
|
| 13 |
"pydantic>=2.7",
|
| 14 |
+
"pydantic-settings>=2.0",
|
| 15 |
"sqlalchemy>=2.0",
|
| 16 |
"aiosqlite>=0.20",
|
| 17 |
"google-genai>=1.0",
|
backend/tests/conftest_api.py
CHANGED
|
@@ -51,10 +51,11 @@ async def async_client(db_session: AsyncSession):
|
|
| 51 |
|
| 52 |
app.dependency_overrides[get_db] = _override_get_db
|
| 53 |
# Les background tasks (execute_corpus_job, execute_page_job) créent leur
|
| 54 |
-
# propre session via async_session_factory. On les neutralise
|
| 55 |
-
# qu'elles tentent de se connecter Γ la
|
| 56 |
-
|
| 57 |
-
|
|
|
|
| 58 |
async with AsyncClient(
|
| 59 |
transport=ASGITransport(app=app), base_url="http://test"
|
| 60 |
) as client:
|
|
|
|
| 51 |
|
| 52 |
app.dependency_overrides[get_db] = _override_get_db
|
| 53 |
# Les background tasks (execute_corpus_job, execute_page_job) créent leur
|
| 54 |
+
# propre session via async_session_factory. On les neutralise en mockant
|
| 55 |
+
# les modules sources pour éviter qu'elles tentent de se connecter à la
|
| 56 |
+
# BDD réelle pendant les tests d'API.
|
| 57 |
+
with patch("app.services.corpus_runner.execute_corpus_job", AsyncMock(return_value={"total": 0, "done": 0, "failed": 0})), \
|
| 58 |
+
patch("app.services.job_runner.execute_page_job", AsyncMock(return_value=None)):
|
| 59 |
async with AsyncClient(
|
| 60 |
transport=ASGITransport(app=app), base_url="http://test"
|
| 61 |
) as client:
|
backend/tests/test_ai_analyzer.py
CHANGED
|
@@ -3,7 +3,7 @@ Tests du pipeline d'analyse IA :
|
|
| 3 |
- prompt_loader : chargement + rendu des templates
|
| 4 |
- client_factory : construction du genai.Client selon le provider
|
| 5 |
- response_parser: parsing JSON brut β layout + OCRResult
|
| 6 |
-
- master_writer : Γ©criture
|
| 7 |
- analyzer : run_primary_analysis (end-to-end mockΓ©)
|
| 8 |
"""
|
| 9 |
# 1. stdlib
|
|
@@ -31,7 +31,7 @@ from app.schemas.model_config import ModelConfig, ProviderType
|
|
| 31 |
from app.schemas.page_master import OCRResult, PageMaster
|
| 32 |
from app.services.ai.analyzer import run_primary_analysis
|
| 33 |
from app.services.ai.client_factory import build_client
|
| 34 |
-
from app.services.ai.master_writer import
|
| 35 |
from app.services.ai.prompt_loader import load_and_render_prompt
|
| 36 |
from app.services.ai.response_parser import ParseError, parse_ai_response
|
| 37 |
|
|
@@ -390,35 +390,35 @@ def test_parse_empty_regions_list():
|
|
| 390 |
|
| 391 |
|
| 392 |
# ---------------------------------------------------------------------------
|
| 393 |
-
# Tests β
|
| 394 |
# ---------------------------------------------------------------------------
|
| 395 |
|
| 396 |
-
def
|
| 397 |
-
out = tmp_path / "page" / "
|
| 398 |
-
|
| 399 |
|
| 400 |
assert out.exists()
|
| 401 |
|
| 402 |
|
| 403 |
-
def
|
| 404 |
-
out = tmp_path / "
|
| 405 |
-
|
| 406 |
|
| 407 |
content = json.loads(out.read_text(encoding="utf-8"))
|
| 408 |
assert "response_text" in content
|
| 409 |
assert content["response_text"] == '{"not": "valid json from AI"}'
|
| 410 |
|
| 411 |
|
| 412 |
-
def
|
| 413 |
-
out = tmp_path / "deep" / "nested" / "dir" / "
|
| 414 |
-
|
| 415 |
assert out.exists()
|
| 416 |
|
| 417 |
|
| 418 |
-
def
|
| 419 |
-
"""MΓͺme si le texte brut est invalide,
|
| 420 |
-
out = tmp_path / "
|
| 421 |
-
|
| 422 |
|
| 423 |
content = json.loads(out.read_text(encoding="utf-8"))
|
| 424 |
assert content["response_text"] == "this is not json at all"
|
|
@@ -432,7 +432,7 @@ def _make_page_master() -> PageMaster:
|
|
| 432 |
folio_label="0001r",
|
| 433 |
sequence=1,
|
| 434 |
image={
|
| 435 |
-
"
|
| 436 |
"derivative_web": "/data/deriv.jpg",
|
| 437 |
"thumbnail": "/data/thumb.jpg",
|
| 438 |
"width": 1500,
|
|
@@ -440,10 +440,11 @@ def _make_page_master() -> PageMaster:
|
|
| 440 |
},
|
| 441 |
layout={"regions": []},
|
| 442 |
processing={
|
|
|
|
| 443 |
"model_id": "gemini-2.0-flash",
|
| 444 |
"model_display_name": "Gemini 2.0 Flash",
|
| 445 |
"prompt_version": "prompts/medieval-illuminated/primary_v1.txt",
|
| 446 |
-
"raw_response_path": "/data/
|
| 447 |
"processed_at": datetime.now(tz=timezone.utc),
|
| 448 |
},
|
| 449 |
)
|
|
@@ -568,12 +569,12 @@ def test_run_primary_analysis_files_created(tmp_path):
|
|
| 568 |
)
|
| 569 |
|
| 570 |
page_dir = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r"
|
| 571 |
-
assert (page_dir / "
|
| 572 |
assert (page_dir / "master.json").exists()
|
| 573 |
|
| 574 |
|
| 575 |
def test_run_primary_analysis_raw_written_before_parse(tmp_path):
|
| 576 |
-
"""
|
| 577 |
prompt_rel = "prompts/medieval-illuminated/primary_v1.txt"
|
| 578 |
_setup_prompt_file(tmp_path, prompt_rel)
|
| 579 |
deriv_path = _setup_derivative(tmp_path)
|
|
@@ -596,8 +597,8 @@ def test_run_primary_analysis_raw_written_before_parse(tmp_path):
|
|
| 596 |
project_root=tmp_path,
|
| 597 |
)
|
| 598 |
|
| 599 |
-
#
|
| 600 |
-
raw_path = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r" / "
|
| 601 |
assert raw_path.exists()
|
| 602 |
|
| 603 |
# master.json N'existe PAS (parsing a échoué)
|
|
@@ -663,9 +664,9 @@ def test_run_primary_analysis_image_dict(tmp_path):
|
|
| 663 |
project_root=tmp_path,
|
| 664 |
)
|
| 665 |
|
| 666 |
-
assert result.image
|
| 667 |
-
assert result.image
|
| 668 |
-
assert result.image
|
| 669 |
|
| 670 |
|
| 671 |
def test_run_primary_analysis_regions_in_layout(tmp_path):
|
|
|
|
| 3 |
- prompt_loader : chargement + rendu des templates
|
| 4 |
- client_factory : construction du genai.Client selon le provider
|
| 5 |
- response_parser: parsing JSON brut β layout + OCRResult
|
| 6 |
+
- master_writer : écriture ai_raw.json et master.json
|
| 7 |
- analyzer : run_primary_analysis (end-to-end mockΓ©)
|
| 8 |
"""
|
| 9 |
# 1. stdlib
|
|
|
|
| 31 |
from app.schemas.page_master import OCRResult, PageMaster
|
| 32 |
from app.services.ai.analyzer import run_primary_analysis
|
| 33 |
from app.services.ai.client_factory import build_client
|
| 34 |
+
from app.services.ai.master_writer import write_ai_raw, write_master_json
|
| 35 |
from app.services.ai.prompt_loader import load_and_render_prompt
|
| 36 |
from app.services.ai.response_parser import ParseError, parse_ai_response
|
| 37 |
|
|
|
|
| 390 |
|
| 391 |
|
| 392 |
# ---------------------------------------------------------------------------
|
| 393 |
+
# Tests β write_ai_raw / write_master_json
|
| 394 |
# ---------------------------------------------------------------------------
|
| 395 |
|
| 396 |
+
def test_write_ai_raw_creates_file(tmp_path):
|
| 397 |
+
out = tmp_path / "page" / "ai_raw.json"
|
| 398 |
+
write_ai_raw("raw AI text here", out)
|
| 399 |
|
| 400 |
assert out.exists()
|
| 401 |
|
| 402 |
|
| 403 |
+
def test_write_ai_raw_valid_json(tmp_path):
|
| 404 |
+
out = tmp_path / "ai_raw.json"
|
| 405 |
+
write_ai_raw('{"not": "valid json from AI"}', out)
|
| 406 |
|
| 407 |
content = json.loads(out.read_text(encoding="utf-8"))
|
| 408 |
assert "response_text" in content
|
| 409 |
assert content["response_text"] == '{"not": "valid json from AI"}'
|
| 410 |
|
| 411 |
|
| 412 |
+
def test_write_ai_raw_creates_parent_dirs(tmp_path):
|
| 413 |
+
out = tmp_path / "deep" / "nested" / "dir" / "ai_raw.json"
|
| 414 |
+
write_ai_raw("text", out)
|
| 415 |
assert out.exists()
|
| 416 |
|
| 417 |
|
| 418 |
+
def test_write_ai_raw_with_non_json_text(tmp_path):
|
| 419 |
+
"""Même si le texte brut est invalide, ai_raw.json est créé."""
|
| 420 |
+
out = tmp_path / "ai_raw.json"
|
| 421 |
+
write_ai_raw("this is not json at all", out)
|
| 422 |
|
| 423 |
content = json.loads(out.read_text(encoding="utf-8"))
|
| 424 |
assert content["response_text"] == "this is not json at all"
|
|
|
|
| 432 |
folio_label="0001r",
|
| 433 |
sequence=1,
|
| 434 |
image={
|
| 435 |
+
"master": "https://example.com/img.jpg",
|
| 436 |
"derivative_web": "/data/deriv.jpg",
|
| 437 |
"thumbnail": "/data/thumb.jpg",
|
| 438 |
"width": 1500,
|
|
|
|
| 440 |
},
|
| 441 |
layout={"regions": []},
|
| 442 |
processing={
|
| 443 |
+
"provider": "google_ai_studio",
|
| 444 |
"model_id": "gemini-2.0-flash",
|
| 445 |
"model_display_name": "Gemini 2.0 Flash",
|
| 446 |
"prompt_version": "prompts/medieval-illuminated/primary_v1.txt",
|
| 447 |
+
"raw_response_path": "/data/ai_raw.json",
|
| 448 |
"processed_at": datetime.now(tz=timezone.utc),
|
| 449 |
},
|
| 450 |
)
|
|
|
|
| 569 |
)
|
| 570 |
|
| 571 |
page_dir = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r"
|
| 572 |
+
assert (page_dir / "ai_raw.json").exists()
|
| 573 |
assert (page_dir / "master.json").exists()
|
| 574 |
|
| 575 |
|
| 576 |
def test_run_primary_analysis_raw_written_before_parse(tmp_path):
|
| 577 |
+
"""ai_raw.json est écrit AVANT que le parsing échoue (R05)."""
|
| 578 |
prompt_rel = "prompts/medieval-illuminated/primary_v1.txt"
|
| 579 |
_setup_prompt_file(tmp_path, prompt_rel)
|
| 580 |
deriv_path = _setup_derivative(tmp_path)
|
|
|
|
| 597 |
project_root=tmp_path,
|
| 598 |
)
|
| 599 |
|
| 600 |
+
# ai_raw.json existe malgré l'échec de parsing
|
| 601 |
+
raw_path = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r" / "ai_raw.json"
|
| 602 |
assert raw_path.exists()
|
| 603 |
|
| 604 |
# master.json N'existe PAS (parsing a échoué)
|
|
|
|
| 664 |
project_root=tmp_path,
|
| 665 |
)
|
| 666 |
|
| 667 |
+
assert result.image.master == image_info.original_url
|
| 668 |
+
assert result.image.width == image_info.derivative_width
|
| 669 |
+
assert result.image.height == image_info.derivative_height
|
| 670 |
|
| 671 |
|
| 672 |
def test_run_primary_analysis_regions_in_layout(tmp_path):
|
backend/tests/test_api_corrections.py
CHANGED
|
@@ -75,7 +75,7 @@ def _make_master(
|
|
| 75 |
"manuscript_id": "ms-test",
|
| 76 |
"folio_label": "f001r",
|
| 77 |
"sequence": 1,
|
| 78 |
-
"image": {"
|
| 79 |
"layout": {"regions": []},
|
| 80 |
"ocr": {
|
| 81 |
"diplomatic_text": "Incipit liber primus",
|
|
@@ -238,13 +238,13 @@ async def test_corrections_archives_old_version(async_client, db_session, monkey
|
|
| 238 |
ms = await _create_manuscript(db_session, corpus.id)
|
| 239 |
page = await _create_page(db_session, ms.id)
|
| 240 |
|
| 241 |
-
|
| 242 |
|
| 243 |
monkeypatch.setattr(Path, "exists", lambda self: True)
|
| 244 |
monkeypatch.setattr(Path, "read_text", lambda self, **kw: _make_master(page.id, version=1))
|
| 245 |
|
| 246 |
def _capture_write(self: Path, content: str, **kw: object) -> None:
|
| 247 |
-
|
| 248 |
|
| 249 |
monkeypatch.setattr(Path, "write_text", _capture_write)
|
| 250 |
|
|
@@ -254,10 +254,17 @@ async def test_corrections_archives_old_version(async_client, db_session, monkey
|
|
| 254 |
)
|
| 255 |
|
| 256 |
# Deux écritures attendues : master_v1.json (archive) + master.json (nouveau)
|
|
|
|
| 257 |
assert len(written_paths) >= 2
|
| 258 |
assert any("master_v1.json" in p for p in written_paths)
|
| 259 |
assert any("master.json" in p and "master_v" not in p for p in written_paths)
|
| 260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
|
| 262 |
@pytest.mark.asyncio
|
| 263 |
async def test_corrections_multiple_fields(async_client, db_session, monkeypatch):
|
|
|
|
| 75 |
"manuscript_id": "ms-test",
|
| 76 |
"folio_label": "f001r",
|
| 77 |
"sequence": 1,
|
| 78 |
+
"image": {"master": "https://example.com/f.jpg", "width": 1500, "height": 2000},
|
| 79 |
"layout": {"regions": []},
|
| 80 |
"ocr": {
|
| 81 |
"diplomatic_text": "Incipit liber primus",
|
|
|
|
| 238 |
ms = await _create_manuscript(db_session, corpus.id)
|
| 239 |
page = await _create_page(db_session, ms.id)
|
| 240 |
|
| 241 |
+
written_data: dict[str, str] = {}
|
| 242 |
|
| 243 |
monkeypatch.setattr(Path, "exists", lambda self: True)
|
| 244 |
monkeypatch.setattr(Path, "read_text", lambda self, **kw: _make_master(page.id, version=1))
|
| 245 |
|
| 246 |
def _capture_write(self: Path, content: str, **kw: object) -> None:
|
| 247 |
+
written_data[str(self)] = content
|
| 248 |
|
| 249 |
monkeypatch.setattr(Path, "write_text", _capture_write)
|
| 250 |
|
|
|
|
| 254 |
)
|
| 255 |
|
| 256 |
# Deux écritures attendues : master_v1.json (archive) + master.json (nouveau)
|
| 257 |
+
written_paths = list(written_data.keys())
|
| 258 |
assert len(written_paths) >= 2
|
| 259 |
assert any("master_v1.json" in p for p in written_paths)
|
| 260 |
assert any("master.json" in p and "master_v" not in p for p in written_paths)
|
| 261 |
|
| 262 |
+
# Vérifier que l'archive contient bien la version originale (v1)
|
| 263 |
+
import json as _json
|
| 264 |
+
archive_path = next(p for p in written_paths if "master_v1.json" in p)
|
| 265 |
+
archive_data = _json.loads(written_data[archive_path])
|
| 266 |
+
assert archive_data["editorial"]["version"] == 1
|
| 267 |
+
|
| 268 |
|
| 269 |
@pytest.mark.asyncio
|
| 270 |
async def test_corrections_multiple_fields(async_client, db_session, monkeypatch):
|
backend/tests/test_api_export.py
CHANGED
|
@@ -83,7 +83,7 @@ def _make_master_json(page_id: str, folio_label: str, sequence: int) -> str:
|
|
| 83 |
"folio_label": folio_label,
|
| 84 |
"sequence": sequence,
|
| 85 |
"image": {
|
| 86 |
-
"
|
| 87 |
"derivative_web": f"/data/deriv/{page_id}.jpg",
|
| 88 |
"thumbnail": f"/data/thumb/{page_id}.jpg",
|
| 89 |
"width": 1500,
|
|
|
|
| 83 |
"folio_label": folio_label,
|
| 84 |
"sequence": sequence,
|
| 85 |
"image": {
|
| 86 |
+
"master": f"https://example.com/{page_id}.jpg",
|
| 87 |
"derivative_web": f"/data/deriv/{page_id}.jpg",
|
| 88 |
"thumbnail": f"/data/thumb/{page_id}.jpg",
|
| 89 |
"width": 1500,
|
backend/tests/test_api_ingest.py
CHANGED
|
@@ -457,6 +457,15 @@ async def test_reingest_manifest_skips_existing_pages(async_client, db_session,
|
|
| 457 |
assert data2["pages_created"] == 0
|
| 458 |
assert data2["pages_skipped"] == 2
|
| 459 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
|
| 461 |
@pytest.mark.asyncio
|
| 462 |
async def test_reingest_images_skips_existing_pages(async_client, db_session):
|
|
|
|
| 457 |
assert data2["pages_created"] == 0
|
| 458 |
assert data2["pages_skipped"] == 2
|
| 459 |
|
| 460 |
+
# Vérifier que la BDD n'a bien que 2 pages (pas de doublons)
|
| 461 |
+
from sqlalchemy import select as sa_select
|
| 462 |
+
from app.models.corpus import PageModel
|
| 463 |
+
page_result = await db_session.execute(
|
| 464 |
+
sa_select(PageModel).where(PageModel.manuscript_id == data1["manuscript_id"])
|
| 465 |
+
)
|
| 466 |
+
pages_in_db = list(page_result.scalars().all())
|
| 467 |
+
assert len(pages_in_db) == 2
|
| 468 |
+
|
| 469 |
|
| 470 |
@pytest.mark.asyncio
|
| 471 |
async def test_reingest_images_skips_existing_pages(async_client, db_session):
|
backend/tests/test_api_models.py
CHANGED
|
@@ -94,7 +94,7 @@ async def test_get_models_endpoint_removed(async_client):
|
|
| 94 |
@pytest.mark.asyncio
|
| 95 |
async def test_refresh_models_ok(async_client, monkeypatch):
|
| 96 |
monkeypatch.setattr(
|
| 97 |
-
|
| 98 |
)
|
| 99 |
response = await async_client.post("/api/v1/models/refresh")
|
| 100 |
assert response.status_code == 200
|
|
@@ -103,7 +103,7 @@ async def test_refresh_models_ok(async_client, monkeypatch):
|
|
| 103 |
@pytest.mark.asyncio
|
| 104 |
async def test_refresh_models_has_timestamp(async_client, monkeypatch):
|
| 105 |
monkeypatch.setattr(
|
| 106 |
-
|
| 107 |
)
|
| 108 |
data = (await async_client.post("/api/v1/models/refresh")).json()
|
| 109 |
assert "refreshed_at" in data
|
|
@@ -113,7 +113,7 @@ async def test_refresh_models_has_timestamp(async_client, monkeypatch):
|
|
| 113 |
@pytest.mark.asyncio
|
| 114 |
async def test_refresh_models_count(async_client, monkeypatch):
|
| 115 |
monkeypatch.setattr(
|
| 116 |
-
|
| 117 |
)
|
| 118 |
data = (await async_client.post("/api/v1/models/refresh")).json()
|
| 119 |
assert data["count"] == 2
|
|
@@ -123,7 +123,7 @@ async def test_refresh_models_count(async_client, monkeypatch):
|
|
| 123 |
@pytest.mark.asyncio
|
| 124 |
async def test_refresh_models_structure(async_client, monkeypatch):
|
| 125 |
monkeypatch.setattr(
|
| 126 |
-
|
| 127 |
)
|
| 128 |
data = (await async_client.post("/api/v1/models/refresh")).json()
|
| 129 |
assert "models" in data
|
|
|
|
| 94 |
@pytest.mark.asyncio
|
| 95 |
async def test_refresh_models_ok(async_client, monkeypatch):
|
| 96 |
monkeypatch.setattr(
|
| 97 |
+
"app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
|
| 98 |
)
|
| 99 |
response = await async_client.post("/api/v1/models/refresh")
|
| 100 |
assert response.status_code == 200
|
|
|
|
| 103 |
@pytest.mark.asyncio
|
| 104 |
async def test_refresh_models_has_timestamp(async_client, monkeypatch):
|
| 105 |
monkeypatch.setattr(
|
| 106 |
+
"app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
|
| 107 |
)
|
| 108 |
data = (await async_client.post("/api/v1/models/refresh")).json()
|
| 109 |
assert "refreshed_at" in data
|
|
|
|
| 113 |
@pytest.mark.asyncio
|
| 114 |
async def test_refresh_models_count(async_client, monkeypatch):
|
| 115 |
monkeypatch.setattr(
|
| 116 |
+
"app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
|
| 117 |
)
|
| 118 |
data = (await async_client.post("/api/v1/models/refresh")).json()
|
| 119 |
assert data["count"] == 2
|
|
|
|
| 123 |
@pytest.mark.asyncio
|
| 124 |
async def test_refresh_models_structure(async_client, monkeypatch):
|
| 125 |
monkeypatch.setattr(
|
| 126 |
+
"app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
|
| 127 |
)
|
| 128 |
data = (await async_client.post("/api/v1/models/refresh")).json()
|
| 129 |
assert "models" in data
|
backend/tests/test_api_pages.py
CHANGED
|
@@ -87,7 +87,7 @@ def _make_master_json(page_id: str, corpus_profile: str = "medieval-illuminated"
|
|
| 87 |
"folio_label": "f001r",
|
| 88 |
"sequence": 1,
|
| 89 |
"image": {
|
| 90 |
-
"
|
| 91 |
"derivative_web": "/data/deriv/f001r.jpg",
|
| 92 |
"thumbnail": "/data/thumb/f001r.jpg",
|
| 93 |
"width": 1500,
|
|
|
|
| 87 |
"folio_label": "f001r",
|
| 88 |
"sequence": 1,
|
| 89 |
"image": {
|
| 90 |
+
"master": "https://example.com/f001r.jpg",
|
| 91 |
"derivative_web": "/data/deriv/f001r.jpg",
|
| 92 |
"thumbnail": "/data/thumb/f001r.jpg",
|
| 93 |
"width": 1500,
|
backend/tests/test_api_providers.py
CHANGED
|
@@ -90,7 +90,7 @@ _MOCK_MISTRAL_MODELS = [
|
|
| 90 |
|
| 91 |
@pytest.mark.asyncio
|
| 92 |
async def test_list_providers_returns_list(async_client, monkeypatch):
|
| 93 |
-
monkeypatch.setattr(
|
| 94 |
resp = await async_client.get("/api/v1/providers")
|
| 95 |
assert resp.status_code == 200
|
| 96 |
assert isinstance(resp.json(), list)
|
|
@@ -98,14 +98,14 @@ async def test_list_providers_returns_list(async_client, monkeypatch):
|
|
| 98 |
|
| 99 |
@pytest.mark.asyncio
|
| 100 |
async def test_list_providers_count(async_client, monkeypatch):
|
| 101 |
-
monkeypatch.setattr(
|
| 102 |
data = (await async_client.get("/api/v1/providers")).json()
|
| 103 |
assert len(data) == 4 # 4 providers connus
|
| 104 |
|
| 105 |
|
| 106 |
@pytest.mark.asyncio
|
| 107 |
async def test_list_providers_fields(async_client, monkeypatch):
|
| 108 |
-
monkeypatch.setattr(
|
| 109 |
data = (await async_client.get("/api/v1/providers")).json()
|
| 110 |
p = data[0]
|
| 111 |
assert "provider_type" in p
|
|
@@ -116,7 +116,7 @@ async def test_list_providers_fields(async_client, monkeypatch):
|
|
| 116 |
|
| 117 |
@pytest.mark.asyncio
|
| 118 |
async def test_list_providers_all_unavailable(async_client, monkeypatch):
|
| 119 |
-
monkeypatch.setattr(
|
| 120 |
data = (await async_client.get("/api/v1/providers")).json()
|
| 121 |
assert all(not p["available"] for p in data)
|
| 122 |
assert all(p["model_count"] == 0 for p in data)
|
|
@@ -124,7 +124,7 @@ async def test_list_providers_all_unavailable(async_client, monkeypatch):
|
|
| 124 |
|
| 125 |
@pytest.mark.asyncio
|
| 126 |
async def test_list_providers_google_available(async_client, monkeypatch):
|
| 127 |
-
monkeypatch.setattr(
|
| 128 |
data = (await async_client.get("/api/v1/providers")).json()
|
| 129 |
google = next(p for p in data if p["provider_type"] == "google_ai_studio")
|
| 130 |
assert google["available"] is True
|
|
@@ -133,7 +133,7 @@ async def test_list_providers_google_available(async_client, monkeypatch):
|
|
| 133 |
|
| 134 |
@pytest.mark.asyncio
|
| 135 |
async def test_list_providers_mistral_available(async_client, monkeypatch):
|
| 136 |
-
monkeypatch.setattr(
|
| 137 |
data = (await async_client.get("/api/v1/providers")).json()
|
| 138 |
mistral = next(p for p in data if p["provider_type"] == "mistral")
|
| 139 |
assert mistral["available"] is True
|
|
@@ -143,7 +143,7 @@ async def test_list_providers_mistral_available(async_client, monkeypatch):
|
|
| 143 |
@pytest.mark.asyncio
|
| 144 |
async def test_list_providers_includes_mistral_type(async_client, monkeypatch):
|
| 145 |
"""Mistral est toujours dans la liste même si indisponible."""
|
| 146 |
-
monkeypatch.setattr(
|
| 147 |
data = (await async_client.get("/api/v1/providers")).json()
|
| 148 |
types_ = [p["provider_type"] for p in data]
|
| 149 |
assert "mistral" in types_
|
|
@@ -156,7 +156,7 @@ async def test_list_providers_includes_mistral_type(async_client, monkeypatch):
|
|
| 156 |
@pytest.mark.asyncio
|
| 157 |
async def test_get_provider_models_google(async_client, monkeypatch):
|
| 158 |
monkeypatch.setattr(
|
| 159 |
-
|
| 160 |
)
|
| 161 |
resp = await async_client.get("/api/v1/providers/google_ai_studio/models")
|
| 162 |
assert resp.status_code == 200
|
|
@@ -166,7 +166,7 @@ async def test_get_provider_models_google(async_client, monkeypatch):
|
|
| 166 |
@pytest.mark.asyncio
|
| 167 |
async def test_get_provider_models_mistral(async_client, monkeypatch):
|
| 168 |
monkeypatch.setattr(
|
| 169 |
-
|
| 170 |
)
|
| 171 |
resp = await async_client.get("/api/v1/providers/mistral/models")
|
| 172 |
assert resp.status_code == 200
|
|
@@ -189,7 +189,7 @@ async def test_get_provider_models_not_configured(async_client, monkeypatch):
|
|
| 189 |
def _raise(ptype):
|
| 190 |
raise RuntimeError("Variable d'environnement manquante : MISTRAL_API_KEY")
|
| 191 |
|
| 192 |
-
monkeypatch.setattr(
|
| 193 |
resp = await async_client.get("/api/v1/providers/mistral/models")
|
| 194 |
assert resp.status_code == 503
|
| 195 |
|
|
@@ -197,7 +197,7 @@ async def test_get_provider_models_not_configured(async_client, monkeypatch):
|
|
| 197 |
@pytest.mark.asyncio
|
| 198 |
async def test_get_provider_models_fields(async_client, monkeypatch):
|
| 199 |
monkeypatch.setattr(
|
| 200 |
-
|
| 201 |
)
|
| 202 |
data = (await async_client.get("/api/v1/providers/mistral/models")).json()
|
| 203 |
m = data[0]
|
|
|
|
| 90 |
|
| 91 |
@pytest.mark.asyncio
|
| 92 |
async def test_list_providers_returns_list(async_client, monkeypatch):
|
| 93 |
+
monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
|
| 94 |
resp = await async_client.get("/api/v1/providers")
|
| 95 |
assert resp.status_code == 200
|
| 96 |
assert isinstance(resp.json(), list)
|
|
|
|
| 98 |
|
| 99 |
@pytest.mark.asyncio
|
| 100 |
async def test_list_providers_count(async_client, monkeypatch):
|
| 101 |
+
monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
|
| 102 |
data = (await async_client.get("/api/v1/providers")).json()
|
| 103 |
assert len(data) == 4 # 4 providers connus
|
| 104 |
|
| 105 |
|
| 106 |
@pytest.mark.asyncio
|
| 107 |
async def test_list_providers_fields(async_client, monkeypatch):
|
| 108 |
+
monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
|
| 109 |
data = (await async_client.get("/api/v1/providers")).json()
|
| 110 |
p = data[0]
|
| 111 |
assert "provider_type" in p
|
|
|
|
| 116 |
|
| 117 |
@pytest.mark.asyncio
|
| 118 |
async def test_list_providers_all_unavailable(async_client, monkeypatch):
|
| 119 |
+
monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
|
| 120 |
data = (await async_client.get("/api/v1/providers")).json()
|
| 121 |
assert all(not p["available"] for p in data)
|
| 122 |
assert all(p["model_count"] == 0 for p in data)
|
|
|
|
| 124 |
|
| 125 |
@pytest.mark.asyncio
|
| 126 |
async def test_list_providers_google_available(async_client, monkeypatch):
|
| 127 |
+
monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_GOOGLE_ONLY)
|
| 128 |
data = (await async_client.get("/api/v1/providers")).json()
|
| 129 |
google = next(p for p in data if p["provider_type"] == "google_ai_studio")
|
| 130 |
assert google["available"] is True
|
|
|
|
| 133 |
|
| 134 |
@pytest.mark.asyncio
|
| 135 |
async def test_list_providers_mistral_available(async_client, monkeypatch):
|
| 136 |
+
monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_GOOGLE_AND_MISTRAL)
|
| 137 |
data = (await async_client.get("/api/v1/providers")).json()
|
| 138 |
mistral = next(p for p in data if p["provider_type"] == "mistral")
|
| 139 |
assert mistral["available"] is True
|
|
|
|
| 143 |
@pytest.mark.asyncio
|
| 144 |
async def test_list_providers_includes_mistral_type(async_client, monkeypatch):
|
| 145 |
"""Mistral est toujours dans la liste même si indisponible."""
|
| 146 |
+
monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
|
| 147 |
data = (await async_client.get("/api/v1/providers")).json()
|
| 148 |
types_ = [p["provider_type"] for p in data]
|
| 149 |
assert "mistral" in types_
|
|
|
|
| 156 |
@pytest.mark.asyncio
|
| 157 |
async def test_get_provider_models_google(async_client, monkeypatch):
|
| 158 |
monkeypatch.setattr(
|
| 159 |
+
"app.services.ai.model_registry.list_models_for_provider", lambda ptype: _MOCK_GOOGLE_MODELS
|
| 160 |
)
|
| 161 |
resp = await async_client.get("/api/v1/providers/google_ai_studio/models")
|
| 162 |
assert resp.status_code == 200
|
|
|
|
| 166 |
@pytest.mark.asyncio
|
| 167 |
async def test_get_provider_models_mistral(async_client, monkeypatch):
|
| 168 |
monkeypatch.setattr(
|
| 169 |
+
"app.services.ai.model_registry.list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
|
| 170 |
)
|
| 171 |
resp = await async_client.get("/api/v1/providers/mistral/models")
|
| 172 |
assert resp.status_code == 200
|
|
|
|
| 189 |
def _raise(ptype):
|
| 190 |
raise RuntimeError("Variable d'environnement manquante : MISTRAL_API_KEY")
|
| 191 |
|
| 192 |
+
monkeypatch.setattr("app.services.ai.model_registry.list_models_for_provider", _raise)
|
| 193 |
resp = await async_client.get("/api/v1/providers/mistral/models")
|
| 194 |
assert resp.status_code == 503
|
| 195 |
|
|
|
|
| 197 |
@pytest.mark.asyncio
|
| 198 |
async def test_get_provider_models_fields(async_client, monkeypatch):
|
| 199 |
monkeypatch.setattr(
|
| 200 |
+
"app.services.ai.model_registry.list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
|
| 201 |
)
|
| 202 |
data = (await async_client.get("/api/v1/providers/mistral/models")).json()
|
| 203 |
m = data[0]
|
backend/tests/test_api_search.py
CHANGED
|
@@ -33,7 +33,7 @@ def _make_master(page_id: str, diplomatic_text: str = "", translation_fr: str =
|
|
| 33 |
"manuscript_id": "ms-test",
|
| 34 |
"folio_label": "f001r",
|
| 35 |
"sequence": 1,
|
| 36 |
-
"image": {"
|
| 37 |
"layout": {"regions": []},
|
| 38 |
"ocr": {
|
| 39 |
"diplomatic_text": diplomatic_text,
|
|
|
|
| 33 |
"manuscript_id": "ms-test",
|
| 34 |
"folio_label": "f001r",
|
| 35 |
"sequence": 1,
|
| 36 |
+
"image": {"master": "https://example.com/f.jpg", "width": 1500, "height": 2000},
|
| 37 |
"layout": {"regions": []},
|
| 38 |
"ocr": {
|
| 39 |
"diplomatic_text": diplomatic_text,
|
backend/tests/test_export_alto.py
CHANGED
|
@@ -52,10 +52,11 @@ def _make_master(
|
|
| 52 |
processing = None
|
| 53 |
if with_processing:
|
| 54 |
processing = ProcessingInfo(
|
|
|
|
| 55 |
model_id="gemini-2.0-flash",
|
| 56 |
model_display_name="Gemini 2.0 Flash",
|
| 57 |
prompt_version="prompts/medieval-illuminated/primary_v1.txt",
|
| 58 |
-
raw_response_path="/data/
|
| 59 |
processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
|
| 60 |
)
|
| 61 |
return PageMaster(
|
|
@@ -65,7 +66,7 @@ def _make_master(
|
|
| 65 |
folio_label="0001r",
|
| 66 |
sequence=sequence,
|
| 67 |
image={
|
| 68 |
-
"
|
| 69 |
"derivative_web": "/data/deriv.jpg",
|
| 70 |
"thumbnail": "/data/thumb.jpg",
|
| 71 |
"width": width,
|
|
|
|
| 52 |
processing = None
|
| 53 |
if with_processing:
|
| 54 |
processing = ProcessingInfo(
|
| 55 |
+
provider="google_ai_studio",
|
| 56 |
model_id="gemini-2.0-flash",
|
| 57 |
model_display_name="Gemini 2.0 Flash",
|
| 58 |
prompt_version="prompts/medieval-illuminated/primary_v1.txt",
|
| 59 |
+
raw_response_path="/data/ai_raw.json",
|
| 60 |
processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
|
| 61 |
)
|
| 62 |
return PageMaster(
|
|
|
|
| 66 |
folio_label="0001r",
|
| 67 |
sequence=sequence,
|
| 68 |
image={
|
| 69 |
+
"master": "https://example.com/img.jpg",
|
| 70 |
"derivative_web": "/data/deriv.jpg",
|
| 71 |
"thumbnail": "/data/thumb.jpg",
|
| 72 |
"width": width,
|
backend/tests/test_export_iiif.py
CHANGED
|
@@ -53,7 +53,7 @@ def _make_page(
|
|
| 53 |
folio_label=folio_label,
|
| 54 |
sequence=sequence,
|
| 55 |
image={
|
| 56 |
-
"
|
| 57 |
"derivative_web": f"/data/deriv/{folio_label}.jpg",
|
| 58 |
"thumbnail": f"/data/thumb/{folio_label}.jpg",
|
| 59 |
"width": width,
|
|
@@ -193,11 +193,11 @@ def test_manifest_label_uses_language_key(simple_manifest):
|
|
| 193 |
|
| 194 |
|
| 195 |
def test_manifest_label_without_language_uses_none():
|
| 196 |
-
"""Sans champ language, la clé de label est '
|
| 197 |
pages = [_make_page("ms-0001r", "0001r", 1)]
|
| 198 |
-
meta = _base_meta() # pas de language
|
| 199 |
manifest = generate_manifest(pages, meta, _BASE_URL)
|
| 200 |
-
assert "
|
| 201 |
|
| 202 |
|
| 203 |
def test_manifest_label_fr(chroniques_pages, chroniques_meta):
|
|
@@ -272,7 +272,7 @@ def test_canvas_order_respects_sequence():
|
|
| 272 |
_make_page("ms-f002r", "f002r", 2),
|
| 273 |
]
|
| 274 |
manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
|
| 275 |
-
labels = [c["label"]["
|
| 276 |
assert labels == ["Folio f001r", "Folio f002r", "Folio f003r"]
|
| 277 |
|
| 278 |
|
|
@@ -283,7 +283,7 @@ def test_canvas_order_large_sequence():
|
|
| 283 |
random.shuffle(pages)
|
| 284 |
manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
|
| 285 |
sequences_in_label = [
|
| 286 |
-
int(c["label"]["
|
| 287 |
for c in manifest["items"]
|
| 288 |
]
|
| 289 |
assert sequences_in_label == list(range(1, 11))
|
|
@@ -344,7 +344,7 @@ def test_canvas_width_matches_image(beatus_pages, beatus_meta):
|
|
| 344 |
# Trouve la page correspondante
|
| 345 |
page_id = canvas["id"].split("/canvas/")[-1]
|
| 346 |
page = next(p for p in beatus_pages if p.page_id == page_id)
|
| 347 |
-
assert canvas["width"] == page.image
|
| 348 |
|
| 349 |
|
| 350 |
def test_canvas_height_matches_image(beatus_pages, beatus_meta):
|
|
@@ -352,7 +352,7 @@ def test_canvas_height_matches_image(beatus_pages, beatus_meta):
|
|
| 352 |
for canvas in manifest["items"]:
|
| 353 |
page_id = canvas["id"].split("/canvas/")[-1]
|
| 354 |
page = next(p for p in beatus_pages if p.page_id == page_id)
|
| 355 |
-
assert canvas["height"] == page.image
|
| 356 |
|
| 357 |
|
| 358 |
def test_canvas_dimensions_beatus_hr():
|
|
@@ -447,7 +447,7 @@ def test_annotation_body_id_is_original_url(beatus_pages, beatus_meta):
|
|
| 447 |
page_id = canvas["id"].split("/canvas/")[-1]
|
| 448 |
page = next(p for p in beatus_pages if p.page_id == page_id)
|
| 449 |
body = canvas["items"][0]["items"][0]["body"]
|
| 450 |
-
assert body["id"] == page.image
|
| 451 |
|
| 452 |
|
| 453 |
def test_annotation_body_contains_gallica_url(beatus_pages, beatus_meta):
|
|
@@ -480,7 +480,10 @@ def test_base_url_trailing_slash_stripped():
|
|
| 480 |
"""Un base_url avec slash final ne génère pas de double slash dans les IDs."""
|
| 481 |
pages = [_make_page("ms-0001r", "0001r", 1)]
|
| 482 |
manifest = generate_manifest(pages, _base_meta(), "https://example.com/")
|
| 483 |
-
|
|
|
|
|
|
|
|
|
|
| 484 |
|
| 485 |
|
| 486 |
# ---------------------------------------------------------------------------
|
|
|
|
| 53 |
folio_label=folio_label,
|
| 54 |
sequence=sequence,
|
| 55 |
image={
|
| 56 |
+
"master": original_url or f"https://example.com/{folio_label}.jpg",
|
| 57 |
"derivative_web": f"/data/deriv/{folio_label}.jpg",
|
| 58 |
"thumbnail": f"/data/thumb/{folio_label}.jpg",
|
| 59 |
"width": width,
|
|
|
|
| 193 |
|
| 194 |
|
| 195 |
def test_manifest_label_without_language_uses_none():
|
| 196 |
+
"""Sans champ language, la clé de label est 'en' (défaut IIIF-compliant)."""
|
| 197 |
pages = [_make_page("ms-0001r", "0001r", 1)]
|
| 198 |
+
meta = _base_meta() # pas de language → défaut "en"
|
| 199 |
manifest = generate_manifest(pages, meta, _BASE_URL)
|
| 200 |
+
assert "en" in manifest["label"]
|
| 201 |
|
| 202 |
|
| 203 |
def test_manifest_label_fr(chroniques_pages, chroniques_meta):
|
|
|
|
| 272 |
_make_page("ms-f002r", "f002r", 2),
|
| 273 |
]
|
| 274 |
manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
|
| 275 |
+
labels = [c["label"]["en"][0] for c in manifest["items"]]
|
| 276 |
assert labels == ["Folio f001r", "Folio f002r", "Folio f003r"]
|
| 277 |
|
| 278 |
|
|
|
|
| 283 |
random.shuffle(pages)
|
| 284 |
manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
|
| 285 |
sequences_in_label = [
|
| 286 |
+
int(c["label"]["en"][0].replace("Folio f", "").replace("r", ""))
|
| 287 |
for c in manifest["items"]
|
| 288 |
]
|
| 289 |
assert sequences_in_label == list(range(1, 11))
|
|
|
|
| 344 |
# Trouve la page correspondante
|
| 345 |
page_id = canvas["id"].split("/canvas/")[-1]
|
| 346 |
page = next(p for p in beatus_pages if p.page_id == page_id)
|
| 347 |
+
assert canvas["width"] == page.image.width
|
| 348 |
|
| 349 |
|
| 350 |
def test_canvas_height_matches_image(beatus_pages, beatus_meta):
|
|
|
|
| 352 |
for canvas in manifest["items"]:
|
| 353 |
page_id = canvas["id"].split("/canvas/")[-1]
|
| 354 |
page = next(p for p in beatus_pages if p.page_id == page_id)
|
| 355 |
+
assert canvas["height"] == page.image.height
|
| 356 |
|
| 357 |
|
| 358 |
def test_canvas_dimensions_beatus_hr():
|
|
|
|
| 447 |
page_id = canvas["id"].split("/canvas/")[-1]
|
| 448 |
page = next(p for p in beatus_pages if p.page_id == page_id)
|
| 449 |
body = canvas["items"][0]["items"][0]["body"]
|
| 450 |
+
assert body["id"] == page.image.master
|
| 451 |
|
| 452 |
|
| 453 |
def test_annotation_body_contains_gallica_url(beatus_pages, beatus_meta):
|
|
|
|
| 480 |
"""Un base_url avec slash final ne génère pas de double slash dans les IDs."""
|
| 481 |
pages = [_make_page("ms-0001r", "0001r", 1)]
|
| 482 |
manifest = generate_manifest(pages, _base_meta(), "https://example.com/")
|
| 483 |
+
manifest_id = manifest["id"]
|
| 484 |
+
# Retirer le protocole puis vérifier qu'il n'y a pas de double slash
|
| 485 |
+
without_protocol = manifest_id.split("://", 1)[1]
|
| 486 |
+
assert "//" not in without_protocol
|
| 487 |
|
| 488 |
|
| 489 |
# ---------------------------------------------------------------------------
|
backend/tests/test_export_mets.py
CHANGED
|
@@ -66,10 +66,11 @@ def _make_page(
|
|
| 66 |
processing = None
|
| 67 |
if with_processing:
|
| 68 |
processing = ProcessingInfo(
|
|
|
|
| 69 |
model_id="gemini-2.0-flash",
|
| 70 |
model_display_name="Gemini 2.0 Flash",
|
| 71 |
prompt_version="prompts/medieval-illuminated/primary_v1.txt",
|
| 72 |
-
raw_response_path=f"/data/corpora/test/pages/{folio_label}/
|
| 73 |
processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
|
| 74 |
)
|
| 75 |
ocr = OCRResult(diplomatic_text=ocr_text, language="la", confidence=0.90) if ocr_text else None
|
|
@@ -80,7 +81,7 @@ def _make_page(
|
|
| 80 |
folio_label=folio_label,
|
| 81 |
sequence=sequence,
|
| 82 |
image={
|
| 83 |
-
"
|
| 84 |
"derivative_web": derivative_web or f"/data/deriv/{folio_label}.jpg",
|
| 85 |
"thumbnail": f"/data/thumb/{folio_label}.jpg",
|
| 86 |
"width": 1500,
|
|
@@ -194,7 +195,9 @@ def test_generate_mets_namespace(beatus_pages, beatus_meta):
|
|
| 194 |
|
| 195 |
def test_generate_mets_objid(beatus_pages, beatus_meta):
|
| 196 |
root = _parse(generate_mets(beatus_pages, beatus_meta))
|
| 197 |
-
|
|
|
|
|
|
|
| 198 |
|
| 199 |
|
| 200 |
def test_generate_mets_label(beatus_pages, beatus_meta):
|
|
|
|
| 66 |
processing = None
|
| 67 |
if with_processing:
|
| 68 |
processing = ProcessingInfo(
|
| 69 |
+
provider="google_ai_studio",
|
| 70 |
model_id="gemini-2.0-flash",
|
| 71 |
model_display_name="Gemini 2.0 Flash",
|
| 72 |
prompt_version="prompts/medieval-illuminated/primary_v1.txt",
|
| 73 |
+
raw_response_path=f"/data/corpora/test/pages/{folio_label}/ai_raw.json",
|
| 74 |
processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
|
| 75 |
)
|
| 76 |
ocr = OCRResult(diplomatic_text=ocr_text, language="la", confidence=0.90) if ocr_text else None
|
|
|
|
| 81 |
folio_label=folio_label,
|
| 82 |
sequence=sequence,
|
| 83 |
image={
|
| 84 |
+
"master": original_url or f"https://example.com/{folio_label}.jpg",
|
| 85 |
"derivative_web": derivative_web or f"/data/deriv/{folio_label}.jpg",
|
| 86 |
"thumbnail": f"/data/thumb/{folio_label}.jpg",
|
| 87 |
"width": 1500,
|
|
|
|
| 195 |
|
| 196 |
def test_generate_mets_objid(beatus_pages, beatus_meta):
|
| 197 |
root = _parse(generate_mets(beatus_pages, beatus_meta))
|
| 198 |
+
objid = root.get("OBJID")
|
| 199 |
+
assert objid is not None, "OBJID attribute absent du root mets"
|
| 200 |
+
assert objid == "BnF-Latin-8878"
|
| 201 |
|
| 202 |
|
| 203 |
def test_generate_mets_label(beatus_pages, beatus_meta):
|
backend/tests/test_image_pipeline.py
CHANGED
|
@@ -278,7 +278,6 @@ def test_fetch_iiif_image_success():
|
|
| 278 |
"+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
|
| 279 |
),
|
| 280 |
"Accept": "image/jpeg,image/png,image/*,*/*",
|
| 281 |
-
"Referer": "https://gallica.bnf.fr/",
|
| 282 |
},
|
| 283 |
follow_redirects=True,
|
| 284 |
timeout=60.0,
|
|
|
|
| 278 |
"+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
|
| 279 |
),
|
| 280 |
"Accept": "image/jpeg,image/png,image/*,*/*",
|
|
|
|
| 281 |
},
|
| 282 |
follow_redirects=True,
|
| 283 |
timeout=60.0,
|
backend/tests/test_job_runner.py
CHANGED
|
@@ -142,16 +142,24 @@ def _page_master(page_id: str, ms_id: str) -> PageMaster:
|
|
| 142 |
|
| 143 |
|
| 144 |
def _apply_success_mocks(monkeypatch, page_id: str, ms_id: str) -> None:
|
| 145 |
-
"""Applique les mocks IO pour un pipeline réussi.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
monkeypatch.setattr(
|
| 147 |
job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
|
| 148 |
)
|
| 149 |
monkeypatch.setattr(
|
| 150 |
-
|
| 151 |
lambda **kw: _page_master(page_id, ms_id),
|
| 152 |
)
|
| 153 |
-
monkeypatch.setattr(
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
|
| 157 |
# ---------------------------------------------------------------------------
|
|
@@ -274,7 +282,7 @@ async def test_no_image_path_job_failed(db, setup_with_model, monkeypatch):
|
|
| 274 |
s["page"].image_master_path = None
|
| 275 |
await db.commit()
|
| 276 |
monkeypatch.setattr(
|
| 277 |
-
|
| 278 |
lambda **kw: _page_master(s["page"].id, s["ms"].id),
|
| 279 |
)
|
| 280 |
|
|
@@ -291,7 +299,7 @@ async def test_no_image_path_page_error(db, setup_with_model, monkeypatch):
|
|
| 291 |
s["page"].image_master_path = None
|
| 292 |
await db.commit()
|
| 293 |
monkeypatch.setattr(
|
| 294 |
-
|
| 295 |
lambda **kw: _page_master(s["page"].id, s["ms"].id),
|
| 296 |
)
|
| 297 |
|
|
@@ -343,7 +351,7 @@ async def test_primary_analysis_fails_job_failed(db, setup_with_model, monkeypat
|
|
| 343 |
job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
|
| 344 |
)
|
| 345 |
monkeypatch.setattr(
|
| 346 |
-
|
| 347 |
lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
|
| 348 |
)
|
| 349 |
|
|
@@ -361,7 +369,7 @@ async def test_primary_analysis_fails_page_error(db, setup_with_model, monkeypat
|
|
| 361 |
job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
|
| 362 |
)
|
| 363 |
monkeypatch.setattr(
|
| 364 |
-
|
| 365 |
lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
|
| 366 |
)
|
| 367 |
|
|
@@ -379,7 +387,7 @@ async def test_primary_analysis_error_message_stored(db, setup_with_model, monke
|
|
| 379 |
job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
|
| 380 |
)
|
| 381 |
monkeypatch.setattr(
|
| 382 |
-
|
| 383 |
lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
|
| 384 |
)
|
| 385 |
|
|
@@ -401,12 +409,14 @@ async def test_write_alto_fails_job_failed(db, setup_with_model, monkeypatch):
|
|
| 401 |
job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
|
| 402 |
)
|
| 403 |
monkeypatch.setattr(
|
| 404 |
-
|
| 405 |
lambda **kw: _page_master(s["page"].id, s["ms"].id),
|
| 406 |
)
|
| 407 |
-
monkeypatch.setattr(job_runner_module, "generate_alto", lambda pm: "<alto/>")
|
| 408 |
monkeypatch.setattr(
|
| 409 |
-
|
|
|
|
|
|
|
|
|
|
| 410 |
lambda xml, path: (_ for _ in ()).throw(OSError("disk full")),
|
| 411 |
)
|
| 412 |
|
|
@@ -424,12 +434,14 @@ async def test_write_alto_fails_page_error(db, setup_with_model, monkeypatch):
|
|
| 424 |
job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
|
| 425 |
)
|
| 426 |
monkeypatch.setattr(
|
| 427 |
-
|
| 428 |
lambda **kw: _page_master(s["page"].id, s["ms"].id),
|
| 429 |
)
|
| 430 |
-
monkeypatch.setattr(job_runner_module, "generate_alto", lambda pm: "<alto/>")
|
| 431 |
monkeypatch.setattr(
|
| 432 |
-
|
|
|
|
|
|
|
|
|
|
| 433 |
lambda xml, path: (_ for _ in ()).throw(OSError("disk full")),
|
| 434 |
)
|
| 435 |
|
|
@@ -534,7 +546,7 @@ async def test_corpus_runner_calls_execute_per_job(monkeypatch):
|
|
| 534 |
return _FakeSession()
|
| 535 |
|
| 536 |
monkeypatch.setattr(corpus_runner_module, "async_session_factory", _mock_factory)
|
| 537 |
-
monkeypatch.setattr(
|
| 538 |
|
| 539 |
await execute_corpus_job("corpus-xyz")
|
| 540 |
|
|
|
|
| 142 |
|
| 143 |
|
| 144 |
def _apply_success_mocks(monkeypatch, page_id: str, ms_id: str) -> None:
|
| 145 |
+
"""Applique les mocks IO pour un pipeline réussi.
|
| 146 |
+
|
| 147 |
+
Les imports sont différés dans job_runner (lazy imports). On patche donc
|
| 148 |
+
les modules sources pour que le import dans la fonction cible récupère le mock.
|
| 149 |
+
"""
|
| 150 |
monkeypatch.setattr(
|
| 151 |
job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
|
| 152 |
)
|
| 153 |
monkeypatch.setattr(
|
| 154 |
+
"app.services.ai.analyzer.run_primary_analysis",
|
| 155 |
lambda **kw: _page_master(page_id, ms_id),
|
| 156 |
)
|
| 157 |
+
monkeypatch.setattr(
|
| 158 |
+
"app.services.export.alto.generate_alto", lambda pm: "<alto/>"
|
| 159 |
+
)
|
| 160 |
+
monkeypatch.setattr(
|
| 161 |
+
"app.services.export.alto.write_alto", lambda xml, path: None
|
| 162 |
+
)
|
| 163 |
|
| 164 |
|
| 165 |
# ---------------------------------------------------------------------------
|
|
|
|
| 282 |
s["page"].image_master_path = None
|
| 283 |
await db.commit()
|
| 284 |
monkeypatch.setattr(
|
| 285 |
+
"app.services.ai.analyzer.run_primary_analysis",
|
| 286 |
lambda **kw: _page_master(s["page"].id, s["ms"].id),
|
| 287 |
)
|
| 288 |
|
|
|
|
| 299 |
s["page"].image_master_path = None
|
| 300 |
await db.commit()
|
| 301 |
monkeypatch.setattr(
|
| 302 |
+
"app.services.ai.analyzer.run_primary_analysis",
|
| 303 |
lambda **kw: _page_master(s["page"].id, s["ms"].id),
|
| 304 |
)
|
| 305 |
|
|
|
|
| 351 |
job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
|
| 352 |
)
|
| 353 |
monkeypatch.setattr(
|
| 354 |
+
"app.services.ai.analyzer.run_primary_analysis",
|
| 355 |
lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
|
| 356 |
)
|
| 357 |
|
|
|
|
| 369 |
job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
|
| 370 |
)
|
| 371 |
monkeypatch.setattr(
|
| 372 |
+
"app.services.ai.analyzer.run_primary_analysis",
|
| 373 |
lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
|
| 374 |
)
|
| 375 |
|
|
|
|
| 387 |
job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
|
| 388 |
)
|
| 389 |
monkeypatch.setattr(
|
| 390 |
+
"app.services.ai.analyzer.run_primary_analysis",
|
| 391 |
lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
|
| 392 |
)
|
| 393 |
|
|
|
|
| 409 |
job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
|
| 410 |
)
|
| 411 |
monkeypatch.setattr(
|
| 412 |
+
"app.services.ai.analyzer.run_primary_analysis",
|
| 413 |
lambda **kw: _page_master(s["page"].id, s["ms"].id),
|
| 414 |
)
|
|
|
|
| 415 |
monkeypatch.setattr(
|
| 416 |
+
"app.services.export.alto.generate_alto", lambda pm: "<alto/>"
|
| 417 |
+
)
|
| 418 |
+
monkeypatch.setattr(
|
| 419 |
+
"app.services.export.alto.write_alto",
|
| 420 |
lambda xml, path: (_ for _ in ()).throw(OSError("disk full")),
|
| 421 |
)
|
| 422 |
|
|
|
|
| 434 |
job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
|
| 435 |
)
|
| 436 |
monkeypatch.setattr(
|
| 437 |
+
"app.services.ai.analyzer.run_primary_analysis",
|
| 438 |
lambda **kw: _page_master(s["page"].id, s["ms"].id),
|
| 439 |
)
|
|
|
|
| 440 |
monkeypatch.setattr(
|
| 441 |
+
"app.services.export.alto.generate_alto", lambda pm: "<alto/>"
|
| 442 |
+
)
|
| 443 |
+
monkeypatch.setattr(
|
| 444 |
+
"app.services.export.alto.write_alto",
|
| 445 |
lambda xml, path: (_ for _ in ()).throw(OSError("disk full")),
|
| 446 |
)
|
| 447 |
|
|
|
|
| 546 |
return _FakeSession()
|
| 547 |
|
| 548 |
monkeypatch.setattr(corpus_runner_module, "async_session_factory", _mock_factory)
|
| 549 |
+
monkeypatch.setattr("app.services.job_runner.execute_page_job", _mock_execute)
|
| 550 |
|
| 551 |
await execute_corpus_job("corpus-xyz")
|
| 552 |
|
backend/tests/test_security.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests de sécurité — Sprint F1.
|
| 3 |
+
|
| 4 |
+
Vérifie que toutes les vulnérabilités identifiées sont corrigées :
|
| 5 |
+
- Path traversal sur profiles, slug, folio_label, frontend serving
|
| 6 |
+
- SSRF sur manifest_url
|
| 7 |
+
- Validation des entrées (taille, format)
|
| 8 |
+
"""
|
| 9 |
+
# 1. stdlib
|
| 10 |
+
import pytest
|
| 11 |
+
|
| 12 |
+
# 2. third-party — fixtures API
|
| 13 |
+
from tests.conftest_api import async_client, db_session # noqa: F401
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# ---------------------------------------------------------------------------
|
| 17 |
+
# Path traversal — profiles
|
| 18 |
+
# ---------------------------------------------------------------------------
|
| 19 |
+
|
| 20 |
+
@pytest.mark.asyncio
|
| 21 |
+
async def test_profile_path_traversal_dotdot(async_client):
|
| 22 |
+
"""Un profile_id contenant '..' doit être rejeté (400)."""
|
| 23 |
+
resp = await async_client.get("/api/v1/profiles/..passwd")
|
| 24 |
+
assert resp.status_code == 400
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@pytest.mark.asyncio
|
| 28 |
+
async def test_profile_path_traversal_slash(async_client):
|
| 29 |
+
"""Un profile_id avec un slash (même encodé) doit être rejeté (400 ou 404)."""
|
| 30 |
+
# FastAPI normalise les chemins, donc un slash dans l'ID ne sera pas transmis.
|
| 31 |
+
# On teste avec un ID contenant des caractères spéciaux interdits.
|
| 32 |
+
resp = await async_client.get("/api/v1/profiles/UPPER_CASE")
|
| 33 |
+
assert resp.status_code == 400
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@pytest.mark.asyncio
|
| 37 |
+
async def test_profile_path_traversal_special_chars(async_client):
|
| 38 |
+
"""Un profile_id avec des caractères spéciaux doit être rejeté."""
|
| 39 |
+
resp = await async_client.get("/api/v1/profiles/test@profile")
|
| 40 |
+
assert resp.status_code == 400
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@pytest.mark.asyncio
|
| 44 |
+
async def test_profile_valid_id_not_found(async_client):
|
| 45 |
+
"""Un profile_id valide mais inexistant retourne 404 (pas 400)."""
|
| 46 |
+
resp = await async_client.get("/api/v1/profiles/does-not-exist")
|
| 47 |
+
assert resp.status_code == 404
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# ---------------------------------------------------------------------------
|
| 51 |
+
# Path traversal — corpus slug
|
| 52 |
+
# ---------------------------------------------------------------------------
|
| 53 |
+
|
| 54 |
+
@pytest.mark.asyncio
|
| 55 |
+
async def test_corpus_slug_path_traversal(async_client):
|
| 56 |
+
"""Un slug avec ../ doit être rejeté par la validation Pydantic."""
|
| 57 |
+
resp = await async_client.post("/api/v1/corpora", json={
|
| 58 |
+
"slug": "../../malicious",
|
| 59 |
+
"title": "Test",
|
| 60 |
+
"profile_id": "medieval-illuminated",
|
| 61 |
+
})
|
| 62 |
+
assert resp.status_code == 422
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@pytest.mark.asyncio
|
| 66 |
+
async def test_corpus_slug_with_spaces(async_client):
|
| 67 |
+
"""Un slug avec des espaces doit être rejeté."""
|
| 68 |
+
resp = await async_client.post("/api/v1/corpora", json={
|
| 69 |
+
"slug": "my corpus",
|
| 70 |
+
"title": "Test",
|
| 71 |
+
"profile_id": "medieval-illuminated",
|
| 72 |
+
})
|
| 73 |
+
assert resp.status_code == 422
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
@pytest.mark.asyncio
|
| 77 |
+
async def test_corpus_slug_uppercase(async_client):
|
| 78 |
+
"""Un slug avec des majuscules doit être rejeté (lowercase only)."""
|
| 79 |
+
resp = await async_client.post("/api/v1/corpora", json={
|
| 80 |
+
"slug": "MyCorpus",
|
| 81 |
+
"title": "Test",
|
| 82 |
+
"profile_id": "medieval-illuminated",
|
| 83 |
+
})
|
| 84 |
+
assert resp.status_code == 422
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
@pytest.mark.asyncio
|
| 88 |
+
async def test_corpus_slug_valid(async_client):
|
| 89 |
+
"""Un slug valide doit être accepté."""
|
| 90 |
+
resp = await async_client.post("/api/v1/corpora", json={
|
| 91 |
+
"slug": "my-corpus-01",
|
| 92 |
+
"title": "Test",
|
| 93 |
+
"profile_id": "medieval-illuminated",
|
| 94 |
+
})
|
| 95 |
+
assert resp.status_code == 201
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
@pytest.mark.asyncio
|
| 99 |
+
async def test_corpus_slug_empty(async_client):
|
| 100 |
+
"""Un slug vide doit être rejeté."""
|
| 101 |
+
resp = await async_client.post("/api/v1/corpora", json={
|
| 102 |
+
"slug": "",
|
| 103 |
+
"title": "Test",
|
| 104 |
+
"profile_id": "medieval-illuminated",
|
| 105 |
+
})
|
| 106 |
+
assert resp.status_code == 422
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
@pytest.mark.asyncio
|
| 110 |
+
async def test_corpus_title_too_long(async_client):
|
| 111 |
+
"""Un titre trop long (>256 chars) doit être rejeté."""
|
| 112 |
+
resp = await async_client.post("/api/v1/corpora", json={
|
| 113 |
+
"slug": "test-long",
|
| 114 |
+
"title": "x" * 300,
|
| 115 |
+
"profile_id": "medieval-illuminated",
|
| 116 |
+
})
|
| 117 |
+
assert resp.status_code == 422
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# ---------------------------------------------------------------------------
|
| 121 |
+
# SSRF — manifest_url
|
| 122 |
+
# ---------------------------------------------------------------------------
|
| 123 |
+
|
| 124 |
+
@pytest.mark.asyncio
|
| 125 |
+
async def test_ssrf_localhost(async_client):
|
| 126 |
+
"""Un manifest_url pointant vers localhost doit être rejeté."""
|
| 127 |
+
# Créer un corpus d'abord
|
| 128 |
+
create = await async_client.post("/api/v1/corpora", json={
|
| 129 |
+
"slug": "ssrf-test", "title": "SSRF", "profile_id": "test",
|
| 130 |
+
})
|
| 131 |
+
cid = create.json()["id"]
|
| 132 |
+
|
| 133 |
+
resp = await async_client.post(f"/api/v1/corpora/{cid}/ingest/iiif-manifest", json={
|
| 134 |
+
"manifest_url": "http://localhost:8000/secret",
|
| 135 |
+
})
|
| 136 |
+
assert resp.status_code == 400
|
| 137 |
+
assert "interdit" in resp.json()["detail"].lower() or "localhost" in resp.json()["detail"].lower()
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
@pytest.mark.asyncio
|
| 141 |
+
async def test_ssrf_metadata_ip(async_client):
|
| 142 |
+
"""Un manifest_url vers 169.254.x.x (cloud metadata) doit être rejeté."""
|
| 143 |
+
create = await async_client.post("/api/v1/corpora", json={
|
| 144 |
+
"slug": "ssrf-meta", "title": "SSRF", "profile_id": "test",
|
| 145 |
+
})
|
| 146 |
+
cid = create.json()["id"]
|
| 147 |
+
|
| 148 |
+
resp = await async_client.post(f"/api/v1/corpora/{cid}/ingest/iiif-manifest", json={
|
| 149 |
+
"manifest_url": "http://169.254.169.254/latest/meta-data/",
|
| 150 |
+
})
|
| 151 |
+
assert resp.status_code == 400
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
@pytest.mark.asyncio
|
| 155 |
+
async def test_ssrf_file_scheme(async_client):
|
| 156 |
+
"""Un manifest_url avec file:// doit être rejeté."""
|
| 157 |
+
create = await async_client.post("/api/v1/corpora", json={
|
| 158 |
+
"slug": "ssrf-file", "title": "SSRF", "profile_id": "test",
|
| 159 |
+
})
|
| 160 |
+
cid = create.json()["id"]
|
| 161 |
+
|
| 162 |
+
resp = await async_client.post(f"/api/v1/corpora/{cid}/ingest/iiif-manifest", json={
|
| 163 |
+
"manifest_url": "file:///etc/passwd",
|
| 164 |
+
})
|
| 165 |
+
assert resp.status_code == 400
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
# ---------------------------------------------------------------------------
|
| 169 |
+
# Input validation — search
|
| 170 |
+
# ---------------------------------------------------------------------------
|
| 171 |
+
|
| 172 |
+
@pytest.mark.asyncio
|
| 173 |
+
async def test_search_query_too_long(async_client):
|
| 174 |
+
"""Une requête de recherche >500 chars doit être rejetée."""
|
| 175 |
+
resp = await async_client.get("/api/v1/search", params={"q": "x" * 501})
|
| 176 |
+
assert resp.status_code == 422
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
@pytest.mark.asyncio
|
| 180 |
+
async def test_search_query_max_length_ok(async_client):
|
| 181 |
+
"""Une requête de recherche de 500 chars doit être acceptée (0 résultat)."""
|
| 182 |
+
resp = await async_client.get("/api/v1/search", params={"q": "x" * 500})
|
| 183 |
+
assert resp.status_code == 200
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# ---------------------------------------------------------------------------
|
| 187 |
+
# Input validation — model selection
|
| 188 |
+
# ---------------------------------------------------------------------------
|
| 189 |
+
|
| 190 |
+
@pytest.mark.asyncio
|
| 191 |
+
async def test_model_id_too_long(async_client):
|
| 192 |
+
"""Un model_id >256 chars doit être rejeté."""
|
| 193 |
+
create = await async_client.post("/api/v1/corpora", json={
|
| 194 |
+
"slug": "model-test", "title": "T", "profile_id": "test",
|
| 195 |
+
})
|
| 196 |
+
cid = create.json()["id"]
|
| 197 |
+
|
| 198 |
+
resp = await async_client.put(f"/api/v1/corpora/{cid}/model", json={
|
| 199 |
+
"model_id": "x" * 300,
|
| 200 |
+
"provider_type": "google_ai_studio",
|
| 201 |
+
})
|
| 202 |
+
assert resp.status_code == 422
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
# ---------------------------------------------------------------------------
|
| 206 |
+
# Input validation — corrections
|
| 207 |
+
# ---------------------------------------------------------------------------
|
| 208 |
+
|
| 209 |
+
@pytest.mark.asyncio
|
| 210 |
+
async def test_corrections_restore_negative_version(async_client):
|
| 211 |
+
"""restore_to_version < 1 doit être rejeté."""
|
| 212 |
+
resp = await async_client.post("/api/v1/pages/fake-page/corrections", json={
|
| 213 |
+
"restore_to_version": 0,
|
| 214 |
+
})
|
| 215 |
+
assert resp.status_code == 422
|
frontend/src/App.tsx
CHANGED
|
@@ -42,6 +42,7 @@ export default function App() {
|
|
| 42 |
onOpenManuscript={(manuscriptId, profileId) =>
|
| 43 |
setView({ name: 'reader', manuscriptId, profileId })
|
| 44 |
}
|
|
|
|
| 45 |
onAdmin={() => setView({ name: 'admin' })}
|
| 46 |
/>
|
| 47 |
)
|
|
|
|
| 42 |
onOpenManuscript={(manuscriptId, profileId) =>
|
| 43 |
setView({ name: 'reader', manuscriptId, profileId })
|
| 44 |
}
|
| 45 |
+
onOpenPage={(pageId) => setView({ name: 'editor', pageId })}
|
| 46 |
onAdmin={() => setView({ name: 'admin' })}
|
| 47 |
/>
|
| 48 |
)
|
frontend/src/lib/api.ts
CHANGED
|
@@ -1,5 +1,13 @@
|
|
| 1 |
const BASE_URL: string = import.meta.env.VITE_API_URL ?? ''
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
// ββ Types βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
|
| 5 |
export interface ProviderInfo {
|
|
|
|
| 1 |
const BASE_URL: string = import.meta.env.VITE_API_URL ?? ''
|
| 2 |
|
| 3 |
+
if (!BASE_URL && import.meta.env.PROD) {
|
| 4 |
+
console.warn(
|
| 5 |
+
'[Scriptorium] VITE_API_URL non défini en production. ' +
|
| 6 |
+
'Les appels API utiliseront des chemins relatifs, ce qui peut échouer ' +
|
| 7 |
+
'si le frontend n\'est pas servi par le même domaine que le backend.'
|
| 8 |
+
)
|
| 9 |
+
}
|
| 10 |
+
|
| 11 |
// ββ Types βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 12 |
|
| 13 |
export interface ProviderInfo {
|
frontend/src/pages/Editor.tsx
CHANGED
|
@@ -119,7 +119,7 @@ export default function Editor({ pageId, onBack }: Props) {
|
|
| 119 |
return <div className="p-8 text-red-600">Erreur : {error}</div>
|
| 120 |
}
|
| 121 |
|
| 122 |
-
const imageUrl = master ?
|
| 123 |
const regions = master?.layout?.regions ?? []
|
| 124 |
|
| 125 |
return (
|
|
|
|
| 119 |
return <div className="p-8 text-red-600">Erreur : {error}</div>
|
| 120 |
}
|
| 121 |
|
| 122 |
+
const imageUrl = master?.image?.derivative_web ?? master?.image?.master ?? ''
|
| 123 |
const regions = master?.layout?.regions ?? []
|
| 124 |
|
| 125 |
return (
|
frontend/src/pages/Home.tsx
CHANGED
|
@@ -10,10 +10,11 @@ import {
|
|
| 10 |
|
| 11 |
interface Props {
|
| 12 |
onOpenManuscript: (manuscriptId: string, profileId: string) => void
|
|
|
|
| 13 |
onAdmin: () => void
|
| 14 |
}
|
| 15 |
|
| 16 |
-
export default function Home({ onOpenManuscript, onAdmin }: Props) {
|
| 17 |
const [corpora, setCorpora] = useState<Corpus[]>([])
|
| 18 |
const [loading, setLoading] = useState(true)
|
| 19 |
const [error, setError] = useState<string | null>(null)
|
|
@@ -73,7 +74,7 @@ export default function Home({ onOpenManuscript, onAdmin }: Props) {
|
|
| 73 |
</p>
|
| 74 |
</div>
|
| 75 |
<div className="flex items-center gap-4">
|
| 76 |
-
<SearchBar />
|
| 77 |
<AdminNav onClick={onAdmin} />
|
| 78 |
</div>
|
| 79 |
</header>
|
|
|
|
| 10 |
|
| 11 |
interface Props {
|
| 12 |
onOpenManuscript: (manuscriptId: string, profileId: string) => void
|
| 13 |
+
onOpenPage?: (pageId: string) => void
|
| 14 |
onAdmin: () => void
|
| 15 |
}
|
| 16 |
|
| 17 |
+
export default function Home({ onOpenManuscript, onOpenPage, onAdmin }: Props) {
|
| 18 |
const [corpora, setCorpora] = useState<Corpus[]>([])
|
| 19 |
const [loading, setLoading] = useState(true)
|
| 20 |
const [error, setError] = useState<string | null>(null)
|
|
|
|
| 74 |
</p>
|
| 75 |
</div>
|
| 76 |
<div className="flex items-center gap-4">
|
| 77 |
+
<SearchBar onSelectResult={onOpenPage ? (r) => onOpenPage(r.page_id) : undefined} />
|
| 78 |
<AdminNav onClick={onAdmin} />
|
| 79 |
</div>
|
| 80 |
</header>
|
infra/Dockerfile
DELETED
|
@@ -1,71 +0,0 @@
|
|
| 1 |
-
# Scriptorium AI — image de production (multi-stage)
|
| 2 |
-
# Ce fichier est la copie exacte de Dockerfile (racine).
|
| 3 |
-
# Build depuis la racine du dépôt :
|
| 4 |
-
# docker build -f infra/Dockerfile -t scriptorium-ai .
|
| 5 |
-
#
|
| 6 |
-
# Structure attendue dans l'image :
|
| 7 |
-
# /app/backend/app/ β source Python (importable via PYTHONPATH)
|
| 8 |
-
# /app/profiles/ β profils JSON
|
| 9 |
-
# /app/prompts/ β templates de prompts
|
| 10 |
-
# /app/static/ β frontend React buildΓ©
|
| 11 |
-
# /app/data/ β créé vide ; Γ monter en volume pour les artefacts
|
| 12 |
-
|
| 13 |
-
# ββ Stage 1 : build du frontend React ββββββββββββββββββββββββββββββββββββββββ
|
| 14 |
-
FROM node:20-slim AS frontend-builder
|
| 15 |
-
|
| 16 |
-
WORKDIR /frontend
|
| 17 |
-
|
| 18 |
-
# Installer les dépendances (cache layer séparé)
|
| 19 |
-
COPY frontend/package.json ./
|
| 20 |
-
RUN npm install
|
| 21 |
-
|
| 22 |
-
# Copier les sources et builder
|
| 23 |
-
COPY frontend/ ./
|
| 24 |
-
RUN npm run build
|
| 25 |
-
|
| 26 |
-
# ββ Stage 2 : image Python finale ββββββββββββββββββββββββββββββββββββββββββββ
|
| 27 |
-
FROM python:3.11-slim
|
| 28 |
-
|
| 29 |
-
WORKDIR /app
|
| 30 |
-
|
| 31 |
-
# ββ DΓ©pendances Python βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 32 |
-
# On copie uniquement pyproject.toml pour exploiter le cache de layers Docker.
|
| 33 |
-
# Un stub app/__init__.py satisfait setuptools (discover packages) sans avoir
|
| 34 |
-
# besoin de copier tout le code source à ce stade.
|
| 35 |
-
COPY backend/pyproject.toml /tmp/build/
|
| 36 |
-
RUN mkdir -p /tmp/build/app \
|
| 37 |
-
&& touch /tmp/build/app/__init__.py \
|
| 38 |
-
&& pip install --no-cache-dir --upgrade /tmp/build/ \
|
| 39 |
-
&& rm -rf /tmp/build
|
| 40 |
-
|
| 41 |
-
# ── Layer dédié mistralai — invalide le cache HF si v0.x est présent ─────
|
| 42 |
-
# Layer séparé de l'install principal pour forcer la mise à jour même si
|
| 43 |
-
# HuggingFace réutilise le layer pyproject.toml depuis un build antérieur.
|
| 44 |
-
RUN pip install --no-cache-dir 'mistralai>=1.0,<2.0'
|
| 45 |
-
|
| 46 |
-
# ββ Code source backend ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 47 |
-
COPY backend/app ./backend/app
|
| 48 |
-
COPY profiles/ ./profiles/
|
| 49 |
-
COPY prompts/ ./prompts/
|
| 50 |
-
|
| 51 |
-
# ββ Frontend buildΓ© ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 52 |
-
COPY --from=frontend-builder /frontend/dist ./static
|
| 53 |
-
|
| 54 |
-
# ββ RΓ©pertoire des artefacts (vide dans l'image ; montΓ© en volume) βββββββββ
|
| 55 |
-
RUN mkdir -p /app/data
|
| 56 |
-
|
| 57 |
-
# ββ Secrets Google AI : JAMAIS dans l'image (R06) βββββββββββββββββββββββββ
|
| 58 |
-
# Passer au runtime via -e ou docker-compose environment :
|
| 59 |
-
# AI_PROVIDER, GOOGLE_AI_STUDIO_API_KEY, GOOGLE_AI_API_KEY,
|
| 60 |
-
# GOOGLE_VERTEX_PROJECT, GOOGLE_VERTEX_LOCATION
|
| 61 |
-
|
| 62 |
-
# PYTHONPATH permet l'import `app.main:app` depuis /app/backend/app/
|
| 63 |
-
ENV PYTHONPATH=/app/backend
|
| 64 |
-
ENV PROFILES_DIR=/app/profiles
|
| 65 |
-
ENV PROMPTS_DIR=/app/prompts
|
| 66 |
-
ENV DATA_DIR=/app/data
|
| 67 |
-
|
| 68 |
-
EXPOSE 7860
|
| 69 |
-
|
| 70 |
-
# 1 worker au MVP — pas de Gunicorn, pas de multiprocessing
|
| 71 |
-
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|