maribakulj commited on
Commit
0209857
·
unverified ·
2 Parent(s): 7a648f535a94af

Merge pull request #30 from maribakulj/claude/code-review-analysis-qDhlH

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. Dockerfile +1 -1
  2. backend/app/api/v1/corpora.py +12 -8
  3. backend/app/api/v1/export.py +10 -4
  4. backend/app/api/v1/ingest.py +80 -11
  5. backend/app/api/v1/jobs.py +6 -2
  6. backend/app/api/v1/models_api.py +10 -9
  7. backend/app/api/v1/pages.py +6 -6
  8. backend/app/api/v1/profiles.py +25 -8
  9. backend/app/api/v1/search.py +29 -24
  10. backend/app/config.py +10 -24
  11. backend/app/main.py +5 -4
  12. backend/app/models/corpus.py +7 -2
  13. backend/app/models/job.py +5 -2
  14. backend/app/schemas/page_master.py +24 -5
  15. backend/app/services/ai/__init__.py +23 -11
  16. backend/app/services/ai/analyzer.py +22 -15
  17. backend/app/services/ai/master_writer.py +24 -24
  18. backend/app/services/ai/model_registry.py +18 -5
  19. backend/app/services/ai/prompt_loader.py +6 -0
  20. backend/app/services/ai/provider_google_ai.py +11 -4
  21. backend/app/services/ai/provider_mistral.py +16 -8
  22. backend/app/services/ai/provider_vertex_key.py +0 -3
  23. backend/app/services/ai/provider_vertex_sa.py +11 -4
  24. backend/app/services/corpus_runner.py +2 -1
  25. backend/app/services/export/alto.py +5 -9
  26. backend/app/services/export/iiif.py +5 -5
  27. backend/app/services/export/mets.py +8 -3
  28. backend/app/services/ingest/iiif_fetcher.py +0 -1
  29. backend/app/services/job_runner.py +4 -2
  30. backend/pyproject.toml +1 -0
  31. backend/tests/conftest_api.py +5 -4
  32. backend/tests/test_ai_analyzer.py +26 -25
  33. backend/tests/test_api_corrections.py +10 -3
  34. backend/tests/test_api_export.py +1 -1
  35. backend/tests/test_api_ingest.py +9 -0
  36. backend/tests/test_api_models.py +4 -4
  37. backend/tests/test_api_pages.py +1 -1
  38. backend/tests/test_api_providers.py +11 -11
  39. backend/tests/test_api_search.py +1 -1
  40. backend/tests/test_export_alto.py +3 -2
  41. backend/tests/test_export_iiif.py +13 -10
  42. backend/tests/test_export_mets.py +6 -3
  43. backend/tests/test_image_pipeline.py +0 -1
  44. backend/tests/test_job_runner.py +28 -16
  45. backend/tests/test_security.py +215 -0
  46. frontend/src/App.tsx +1 -0
  47. frontend/src/lib/api.ts +8 -0
  48. frontend/src/pages/Editor.tsx +1 -1
  49. frontend/src/pages/Home.tsx +3 -2
  50. infra/Dockerfile +0 -71
Dockerfile CHANGED
@@ -1,6 +1,6 @@
1
  # Scriptorium AI — image de production (multi-stage)
2
  # Ce fichier est utilisé par HuggingFace Spaces (SDK docker, détection automatique).
3
- # Il doit rester synchronisé avec infra/Dockerfile.
4
  #
5
  # Build depuis la racine du dépôt :
6
  # docker build -t scriptorium-ai .
 
1
  # Scriptorium AI — image de production (multi-stage)
2
  # Ce fichier est utilisé par HuggingFace Spaces (SDK docker, détection automatique).
3
+ # Source unique — le fichier infra/Dockerfile a été supprimé pour éviter la divergence.
4
  #
5
  # Build depuis la racine du dépôt :
6
  # docker build -t scriptorium-ai .
backend/app/api/v1/corpora.py CHANGED
@@ -14,8 +14,8 @@ import uuid
14
  from datetime import datetime, timezone
15
 
16
  # 2. third-party
17
- from fastapi import APIRouter, Depends, HTTPException
18
- from pydantic import BaseModel, ConfigDict
19
  from sqlalchemy import select
20
  from sqlalchemy.ext.asyncio import AsyncSession
21
 
@@ -29,9 +29,9 @@ router = APIRouter(prefix="/corpora", tags=["corpora"])
29
  # ── Schémas de requête / réponse ─────────────────────────────────────────────
30
 
31
  class CorpusCreate(BaseModel):
32
- slug: str
33
- title: str
34
- profile_id: str
35
 
36
 
37
  class CorpusResponse(BaseModel):
@@ -59,9 +59,13 @@ class ManuscriptResponse(BaseModel):
59
  # ── Endpoints ────────────────────────────────────────────────────────────────
60
 
61
  @router.get("", response_model=list[CorpusResponse])
62
- async def list_corpora(db: AsyncSession = Depends(get_db)) -> list[CorpusModel]:
63
- """Retourne tous les corpus enregistrés."""
64
- result = await db.execute(select(CorpusModel))
 
 
 
 
65
  return list(result.scalars().all())
66
 
67
 
 
14
  from datetime import datetime, timezone
15
 
16
  # 2. third-party
17
+ from fastapi import APIRouter, Depends, HTTPException, Query
18
+ from pydantic import BaseModel, ConfigDict, Field
19
  from sqlalchemy import select
20
  from sqlalchemy.ext.asyncio import AsyncSession
21
 
 
29
  # ── Schémas de requête / réponse ─────────────────────────────────────────────
30
 
31
  class CorpusCreate(BaseModel):
32
+ slug: str = Field(..., pattern=r"^[a-z0-9][a-z0-9_-]{0,63}$")
33
+ title: str = Field(..., min_length=1, max_length=256)
34
+ profile_id: str = Field(..., pattern=r"^[a-z0-9][a-z0-9_-]*$")
35
 
36
 
37
  class CorpusResponse(BaseModel):
 
59
  # ── Endpoints ────────────────────────────────────────────────────────────────
60
 
61
  @router.get("", response_model=list[CorpusResponse])
62
+ async def list_corpora(
63
+ db: AsyncSession = Depends(get_db),
64
+ skip: int = Query(0, ge=0, description="Nombre d'éléments à sauter"),
65
+ limit: int = Query(100, ge=1, le=1000, description="Nombre maximum d'éléments"),
66
+ ) -> list[CorpusModel]:
67
+ """Retourne les corpus enregistrés (paginé)."""
68
+ result = await db.execute(select(CorpusModel).offset(skip).limit(limit))
69
  return list(result.scalars().all())
70
 
71
 
backend/app/api/v1/export.py CHANGED
@@ -10,6 +10,7 @@ Règle (R02) : toutes les sorties sont générées depuis les PageMasters
10
  (master.json), jamais depuis les réponses brutes de l'IA.
11
  """
12
  # 1. stdlib
 
13
  import io
14
  import json
15
  import logging
@@ -66,7 +67,7 @@ async def _load_manuscript_with_masters(
66
 
67
  masters: list[PageMaster] = []
68
  for page in pages:
69
- master = _read_master_json(corpus.slug, page.id)
70
  if master is not None:
71
  masters.append(master)
72
 
@@ -79,8 +80,8 @@ async def _load_manuscript_with_masters(
79
  return manuscript, corpus, masters
80
 
81
 
82
- def _read_master_json(corpus_slug: str, page_id: str) -> PageMaster | None:
83
- """Lit le master.json d'une page depuis data/. Retourne None si absent."""
84
  path = (
85
  _config_module.settings.data_dir
86
  / "corpora"
@@ -95,6 +96,11 @@ def _read_master_json(corpus_slug: str, page_id: str) -> PageMaster | None:
95
  return PageMaster.model_validate(raw)
96
 
97
 
 
 
 
 
 
98
  def _build_manuscript_meta(
99
  manuscript: ManuscriptModel, corpus: CorpusModel
100
  ) -> dict:
@@ -154,7 +160,7 @@ async def get_alto(page_id: str, db: AsyncSession = Depends(get_db)) -> Response
154
  manuscript = await db.get(ManuscriptModel, page.manuscript_id)
155
  corpus = await db.get(CorpusModel, manuscript.corpus_id)
156
 
157
- master = _read_master_json(corpus.slug, page_id)
158
  if master is None:
159
  raise HTTPException(
160
  status_code=404,
 
10
  (master.json), jamais depuis les réponses brutes de l'IA.
11
  """
12
  # 1. stdlib
13
+ import asyncio
14
  import io
15
  import json
16
  import logging
 
67
 
68
  masters: list[PageMaster] = []
69
  for page in pages:
70
+ master = await _read_master_json(corpus.slug, page.id)
71
  if master is not None:
72
  masters.append(master)
73
 
 
80
  return manuscript, corpus, masters
81
 
82
 
83
+ def _read_master_json_sync(corpus_slug: str, page_id: str) -> PageMaster | None:
84
+ """Lit le master.json d'une page depuis data/. Retourne None si absent (bloquant)."""
85
  path = (
86
  _config_module.settings.data_dir
87
  / "corpora"
 
96
  return PageMaster.model_validate(raw)
97
 
98
 
99
+ async def _read_master_json(corpus_slug: str, page_id: str) -> PageMaster | None:
100
+ """Version async — délègue la lecture au threadpool."""
101
+ return await asyncio.to_thread(_read_master_json_sync, corpus_slug, page_id)
102
+
103
+
104
  def _build_manuscript_meta(
105
  manuscript: ManuscriptModel, corpus: CorpusModel
106
  ) -> dict:
 
160
  manuscript = await db.get(ManuscriptModel, page.manuscript_id)
161
  corpus = await db.get(CorpusModel, manuscript.corpus_id)
162
 
163
+ master = await _read_master_json(corpus.slug, page_id)
164
  if master is None:
165
  raise HTTPException(
166
  status_code=404,
backend/app/api/v1/ingest.py CHANGED
@@ -11,13 +11,14 @@ Règle : ingestion = création des PageModel en BDD uniquement.
11
  """
12
  # 1. stdlib
13
  import logging
 
14
  import uuid
15
  from pathlib import Path
16
 
17
  # 2. third-party
18
  import httpx
19
  from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
20
- from pydantic import BaseModel
21
  from sqlalchemy import func, select
22
  from sqlalchemy.ext.asyncio import AsyncSession
23
 
@@ -30,6 +31,28 @@ logger = logging.getLogger(__name__)
30
 
31
  router = APIRouter(tags=["ingestion"])
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  # ── Schémas ───────────────────────────────────────────────────────────────────
35
 
@@ -38,8 +61,8 @@ class IIIFManifestRequest(BaseModel):
38
 
39
 
40
  class IIIFImagesRequest(BaseModel):
41
- urls: list[str]
42
- folio_labels: list[str]
43
 
44
 
45
  class IngestResponse(BaseModel):
@@ -144,11 +167,31 @@ _MANIFEST_HEADERS = {
144
  }
145
 
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  async def _fetch_json_manifest(url: str) -> dict:
148
- """Télécharge un manifest IIIF. Fonction isolée pour faciliter les tests."""
 
149
  async with httpx.AsyncClient() as client:
150
  resp = await client.get(url, headers=_MANIFEST_HEADERS, follow_redirects=True, timeout=30.0)
151
  resp.raise_for_status()
 
 
152
  return resp.json()
153
 
154
 
@@ -202,16 +245,33 @@ async def ingest_files(
202
  seq = await _next_sequence(db, ms.id)
203
 
204
  # Collect labels and detect duplicates
205
- labels = [Path(f.filename or f"file_{i}").stem for i, f in enumerate(files)]
206
  dupes = _find_duplicate_labels(labels)
207
 
208
  created: list[PageModel] = []
 
209
  skipped = 0
210
  for i, upload in enumerate(files):
211
- filename = Path(upload.filename or f"file_{i}").name
 
 
 
 
 
 
 
 
212
  folio_label = labels[i]
213
  page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
214
 
 
 
 
 
 
 
 
 
215
  master_dir = (
216
  _config_module.settings.data_dir
217
  / "corpora"
@@ -221,8 +281,8 @@ async def ingest_files(
221
  )
222
  master_dir.mkdir(parents=True, exist_ok=True)
223
  master_path = master_dir / filename
224
- content = await upload.read()
225
  master_path.write_bytes(content)
 
226
 
227
  page = await _create_page(
228
  db, ms.id, page_id, folio_label, seq + i,
@@ -234,7 +294,13 @@ async def ingest_files(
234
  created.append(page)
235
 
236
  ms.total_pages = (ms.total_pages or 0) + len(created)
237
- await db.commit()
 
 
 
 
 
 
238
 
239
  logger.info(
240
  "Fichiers ingérés",
@@ -260,6 +326,8 @@ async def ingest_iiif_manifest(
260
 
261
  try:
262
  manifest = await _fetch_json_manifest(body.manifest_url)
 
 
263
  except httpx.HTTPStatusError as exc:
264
  raise HTTPException(
265
  status_code=502,
@@ -302,7 +370,7 @@ async def ingest_iiif_manifest(
302
  seq = await _next_sequence(db, ms.id)
303
 
304
  # Collect labels and detect duplicates
305
- labels = [_extract_canvas_label(canvas, i) for i, canvas in enumerate(canvases)]
306
  dupes = _find_duplicate_labels(labels)
307
 
308
  created: list[PageModel] = []
@@ -358,11 +426,12 @@ async def ingest_iiif_images(
358
  ms = await _get_or_create_manuscript(db, corpus_id)
359
  seq = await _next_sequence(db, ms.id)
360
 
361
- dupes = _find_duplicate_labels(body.folio_labels)
 
362
 
363
  created: list[PageModel] = []
364
  skipped = 0
365
- for i, (url, folio_label) in enumerate(zip(body.urls, body.folio_labels)):
366
  page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
367
  page = await _create_page(
368
  db, ms.id, page_id, folio_label, seq + i,
 
11
  """
12
  # 1. stdlib
13
  import logging
14
+ import re
15
  import uuid
16
  from pathlib import Path
17
 
18
  # 2. third-party
19
  import httpx
20
  from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
21
+ from pydantic import BaseModel, Field
22
  from sqlalchemy import func, select
23
  from sqlalchemy.ext.asyncio import AsyncSession
24
 
 
31
 
32
  router = APIRouter(tags=["ingestion"])
33
 
34
+ # ── Constantes de sécurité ────────────────────────────────────────────────────
35
+
36
+ _SAFE_LABEL_RE = re.compile(r"^[\w\-\.]+$")
37
+ _MAX_UPLOAD_BYTES = 100 * 1024 * 1024 # 100 Mo par fichier
38
+ _ALLOWED_MIME_PREFIXES = ("image/",)
39
+
40
+
41
+ def _sanitize_label(label: str) -> str:
42
+ """Nettoie un folio_label : garde uniquement alphanum, -, _, ."""
43
+ clean = Path(label).name # retire tout chemin
44
+ if not _SAFE_LABEL_RE.match(clean) or not clean:
45
+ clean = re.sub(r"[^\w\-\.]", "_", clean) or "page"
46
+ return clean
47
+
48
+
49
+ def _sanitize_filename(name: str) -> str:
50
+ """Nettoie un nom de fichier uploadé : garde uniquement le basename sûr."""
51
+ clean = Path(name).name
52
+ if not _SAFE_LABEL_RE.match(clean) or not clean:
53
+ clean = f"{uuid.uuid4().hex[:12]}.bin"
54
+ return clean
55
+
56
 
57
  # ── Schémas ───────────────────────────────────────────────────────────────────
58
 
 
61
 
62
 
63
  class IIIFImagesRequest(BaseModel):
64
+ urls: list[str] = Field(..., max_length=5000)
65
+ folio_labels: list[str] = Field(..., max_length=5000)
66
 
67
 
68
  class IngestResponse(BaseModel):
 
167
  }
168
 
169
 
170
+ _MAX_MANIFEST_BYTES = 10 * 1024 * 1024 # 10 Mo max pour un manifest JSON
171
+
172
+
173
+ def _validate_url(url: str) -> None:
174
+ """Rejette les URLs non-HTTP et les cibles réseau privé (SSRF)."""
175
+ from urllib.parse import urlparse
176
+
177
+ parsed = urlparse(url)
178
+ if parsed.scheme not in ("http", "https"):
179
+ raise ValueError(f"Schéma non autorisé : {parsed.scheme!r}")
180
+ host = (parsed.hostname or "").lower()
181
+ # Bloquer les adresses privées / locales
182
+ blocked = ("localhost", "127.0.0.1", "0.0.0.0", "[::1]", "metadata.google.internal")
183
+ if host in blocked or host.startswith("169.254.") or host.startswith("10.") or host.startswith("192.168."):
184
+ raise ValueError(f"Hôte interdit : {host}")
185
+
186
+
187
  async def _fetch_json_manifest(url: str) -> dict:
188
+ """Télécharge un manifest IIIF avec protections SSRF + taille max."""
189
+ _validate_url(url)
190
  async with httpx.AsyncClient() as client:
191
  resp = await client.get(url, headers=_MANIFEST_HEADERS, follow_redirects=True, timeout=30.0)
192
  resp.raise_for_status()
193
+ if len(resp.content) > _MAX_MANIFEST_BYTES:
194
+ raise ValueError(f"Manifest trop volumineux ({len(resp.content)} octets)")
195
  return resp.json()
196
 
197
 
 
245
  seq = await _next_sequence(db, ms.id)
246
 
247
  # Collect labels and detect duplicates
248
+ labels = [_sanitize_label(Path(f.filename or f"file_{i}").stem) for i, f in enumerate(files)]
249
  dupes = _find_duplicate_labels(labels)
250
 
251
  created: list[PageModel] = []
252
+ written_files: list[Path] = []
253
  skipped = 0
254
  for i, upload in enumerate(files):
255
+ # Validation MIME type
256
+ ctype = upload.content_type or ""
257
+ if not any(ctype.startswith(p) for p in _ALLOWED_MIME_PREFIXES):
258
+ raise HTTPException(
259
+ status_code=422,
260
+ detail=f"Type MIME non autorisé : {ctype!r}. Seules les images sont acceptées.",
261
+ )
262
+
263
+ filename = _sanitize_filename(upload.filename or f"file_{i}.bin")
264
  folio_label = labels[i]
265
  page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
266
 
267
+ content = await upload.read()
268
+ # Validation taille
269
+ if len(content) > _MAX_UPLOAD_BYTES:
270
+ raise HTTPException(
271
+ status_code=413,
272
+ detail=f"Fichier trop volumineux ({len(content)} octets). Maximum : {_MAX_UPLOAD_BYTES}.",
273
+ )
274
+
275
  master_dir = (
276
  _config_module.settings.data_dir
277
  / "corpora"
 
281
  )
282
  master_dir.mkdir(parents=True, exist_ok=True)
283
  master_path = master_dir / filename
 
284
  master_path.write_bytes(content)
285
+ written_files.append(master_path)
286
 
287
  page = await _create_page(
288
  db, ms.id, page_id, folio_label, seq + i,
 
294
  created.append(page)
295
 
296
  ms.total_pages = (ms.total_pages or 0) + len(created)
297
+ try:
298
+ await db.commit()
299
+ except Exception:
300
+ # Nettoyage des fichiers orphelins si le commit BDD échoue
301
+ for f in written_files:
302
+ f.unlink(missing_ok=True)
303
+ raise
304
 
305
  logger.info(
306
  "Fichiers ingérés",
 
326
 
327
  try:
328
  manifest = await _fetch_json_manifest(body.manifest_url)
329
+ except ValueError as exc:
330
+ raise HTTPException(status_code=400, detail=str(exc))
331
  except httpx.HTTPStatusError as exc:
332
  raise HTTPException(
333
  status_code=502,
 
370
  seq = await _next_sequence(db, ms.id)
371
 
372
  # Collect labels and detect duplicates
373
+ labels = [_sanitize_label(_extract_canvas_label(canvas, i)) for i, canvas in enumerate(canvases)]
374
  dupes = _find_duplicate_labels(labels)
375
 
376
  created: list[PageModel] = []
 
426
  ms = await _get_or_create_manuscript(db, corpus_id)
427
  seq = await _next_sequence(db, ms.id)
428
 
429
+ sanitized_labels = [_sanitize_label(lbl) for lbl in body.folio_labels]
430
+ dupes = _find_duplicate_labels(sanitized_labels)
431
 
432
  created: list[PageModel] = []
433
  skipped = 0
434
+ for i, (url, folio_label) in enumerate(zip(body.urls, sanitized_labels)):
435
  page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
436
  page = await _create_page(
437
  db, ms.id, page_id, folio_label, seq + i,
backend/app/api/v1/jobs.py CHANGED
@@ -22,8 +22,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
22
  from app.models.corpus import CorpusModel, ManuscriptModel, PageModel
23
  from app.models.database import get_db
24
  from app.models.job import JobModel
25
- from app.services.corpus_runner import execute_corpus_job
26
- from app.services.job_runner import execute_page_job
27
 
28
  router = APIRouter(tags=["jobs"])
29
 
@@ -101,6 +99,8 @@ async def run_corpus(
101
  await db.commit()
102
 
103
  # Lancer le pipeline en arrière-plan (après envoi de la réponse)
 
 
104
  background_tasks.add_task(execute_corpus_job, corpus_id)
105
 
106
  return CorpusRunResponse(
@@ -135,6 +135,8 @@ async def run_page(
135
  await db.refresh(job)
136
 
137
  # Lancer le pipeline en arrière-plan (après envoi de la réponse)
 
 
138
  background_tasks.add_task(execute_page_job, job.id)
139
 
140
  return job
@@ -175,6 +177,8 @@ async def retry_job(
175
  await db.refresh(job)
176
 
177
  # Relancer le pipeline
 
 
178
  background_tasks.add_task(execute_page_job, job.id)
179
 
180
  return job
 
22
  from app.models.corpus import CorpusModel, ManuscriptModel, PageModel
23
  from app.models.database import get_db
24
  from app.models.job import JobModel
 
 
25
 
26
  router = APIRouter(tags=["jobs"])
27
 
 
99
  await db.commit()
100
 
101
  # Lancer le pipeline en arrière-plan (après envoi de la réponse)
102
+ from app.services.corpus_runner import execute_corpus_job
103
+
104
  background_tasks.add_task(execute_corpus_job, corpus_id)
105
 
106
  return CorpusRunResponse(
 
135
  await db.refresh(job)
136
 
137
  # Lancer le pipeline en arrière-plan (après envoi de la réponse)
138
+ from app.services.job_runner import execute_page_job
139
+
140
  background_tasks.add_task(execute_page_job, job.id)
141
 
142
  return job
 
177
  await db.refresh(job)
178
 
179
  # Relancer le pipeline
180
+ from app.services.job_runner import execute_page_job
181
+
182
  background_tasks.add_task(execute_page_job, job.id)
183
 
184
  return job
backend/app/api/v1/models_api.py CHANGED
@@ -17,7 +17,7 @@ from datetime import datetime, timezone
17
 
18
  # 2. third-party
19
  from fastapi import APIRouter, Depends, HTTPException
20
- from pydantic import BaseModel, ConfigDict
21
  from sqlalchemy.ext.asyncio import AsyncSession
22
 
23
  # 3. local
@@ -25,11 +25,6 @@ from app.models.corpus import CorpusModel
25
  from app.models.database import get_db
26
  from app.models.model_config_db import ModelConfigDB
27
  from app.schemas.model_config import ProviderType
28
- from app.services.ai.model_registry import (
29
- get_available_providers,
30
- list_all_models,
31
- list_models_for_provider,
32
- )
33
 
34
  logger = logging.getLogger(__name__)
35
 
@@ -47,9 +42,9 @@ class ProviderInfo(BaseModel):
47
 
48
 
49
  class ModelSelectRequest(BaseModel):
50
- model_id: str
51
- provider_type: str
52
- display_name: str = ""
53
 
54
 
55
  class ModelConfigResponse(BaseModel):
@@ -77,6 +72,8 @@ async def list_providers() -> list[dict]:
77
  Un provider est disponible si la variable d'environnement correspondante
78
  est présente dans les secrets HuggingFace. Aucune clé n'est exposée.
79
  """
 
 
80
  return get_available_providers()
81
 
82
 
@@ -91,6 +88,8 @@ async def get_provider_models(provider_type: str) -> list[dict]:
91
  detail=f"Provider inconnu : {provider_type}. "
92
  f"Valeurs acceptées : {[p.value for p in ProviderType]}",
93
  )
 
 
94
  try:
95
  models = list_models_for_provider(ptype)
96
  except RuntimeError as exc:
@@ -104,6 +103,8 @@ async def get_provider_models(provider_type: str) -> list[dict]:
104
  @router.post("/models/refresh", response_model=ModelsRefreshResponse)
105
  async def refresh_models() -> ModelsRefreshResponse:
106
  """Force la mise à jour de la liste agrégée de tous les modèles disponibles."""
 
 
107
  models = list_all_models()
108
  return ModelsRefreshResponse(
109
  models=[m.model_dump() for m in models],
 
17
 
18
  # 2. third-party
19
  from fastapi import APIRouter, Depends, HTTPException
20
+ from pydantic import BaseModel, ConfigDict, Field
21
  from sqlalchemy.ext.asyncio import AsyncSession
22
 
23
  # 3. local
 
25
  from app.models.database import get_db
26
  from app.models.model_config_db import ModelConfigDB
27
  from app.schemas.model_config import ProviderType
 
 
 
 
 
28
 
29
  logger = logging.getLogger(__name__)
30
 
 
42
 
43
 
44
  class ModelSelectRequest(BaseModel):
45
+ model_id: str = Field(..., min_length=1, max_length=256)
46
+ provider_type: str = Field(..., min_length=1, max_length=64)
47
+ display_name: str = Field("", max_length=256)
48
 
49
 
50
  class ModelConfigResponse(BaseModel):
 
72
  Un provider est disponible si la variable d'environnement correspondante
73
  est prΓ©sente dans les secrets HuggingFace. Aucune clΓ© n'est exposΓ©e.
74
  """
75
+ from app.services.ai.model_registry import get_available_providers
76
+
77
  return get_available_providers()
78
 
79
 
 
88
  detail=f"Provider inconnu : {provider_type}. "
89
  f"Valeurs acceptées : {[p.value for p in ProviderType]}",
90
  )
91
+ from app.services.ai.model_registry import list_models_for_provider
92
+
93
  try:
94
  models = list_models_for_provider(ptype)
95
  except RuntimeError as exc:
 
103
  @router.post("/models/refresh", response_model=ModelsRefreshResponse)
104
  async def refresh_models() -> ModelsRefreshResponse:
105
  """Force la mise à jour de la liste agrégée de tous les modèles disponibles."""
106
+ from app.services.ai.model_registry import list_all_models
107
+
108
  models = list_all_models()
109
  return ModelsRefreshResponse(
110
  models=[m.model_dump() for m in models],
backend/app/api/v1/pages.py CHANGED
@@ -18,7 +18,7 @@ from typing import Any
18
 
19
  # 2. third-party
20
  from fastapi import APIRouter, Depends, HTTPException
21
- from pydantic import BaseModel, ConfigDict, ValidationError
22
  from sqlalchemy.ext.asyncio import AsyncSession
23
 
24
  # 3. local
@@ -42,12 +42,12 @@ class CorrectionsRequest(BaseModel):
42
  indiquée est restaurée (avec incrémentation de editorial.version).
43
  """
44
 
45
- ocr_diplomatic_text: str | None = None
46
- editorial_status: str | None = None
47
- commentary_public: str | None = None
48
- commentary_scholarly: str | None = None
49
  region_validations: dict[str, str] | None = None
50
- restore_to_version: int | None = None
51
 
52
 
53
  class VersionInfo(BaseModel):
 
18
 
19
  # 2. third-party
20
  from fastapi import APIRouter, Depends, HTTPException
21
+ from pydantic import BaseModel, ConfigDict, Field, ValidationError
22
  from sqlalchemy.ext.asyncio import AsyncSession
23
 
24
  # 3. local
 
42
  indiquée est restaurée (avec incrémentation de editorial.version).
43
  """
44
 
45
+ ocr_diplomatic_text: str | None = Field(None, max_length=500_000)
46
+ editorial_status: str | None = Field(None, max_length=50)
47
+ commentary_public: str | None = Field(None, max_length=100_000)
48
+ commentary_scholarly: str | None = Field(None, max_length=100_000)
49
  region_validations: dict[str, str] | None = None
50
+ restore_to_version: int | None = Field(None, ge=1)
51
 
52
 
53
  class VersionInfo(BaseModel):
backend/app/api/v1/profiles.py CHANGED
@@ -8,8 +8,10 @@ Les profils sont des fichiers JSON dans profiles/ (racine du dépôt).
8
  Ils sont validés par CorpusProfile avant d'être retournés.
9
  """
10
  # 1. stdlib
 
11
  import json
12
  import logging
 
13
  from pathlib import Path
14
 
15
  # 2. third-party
@@ -49,21 +51,36 @@ async def list_profiles() -> list[dict]:
49
  if not settings.profiles_dir.is_dir():
50
  logger.warning("profiles_dir introuvable : %s", settings.profiles_dir)
51
  return []
52
- profiles = []
53
- for path in sorted(settings.profiles_dir.glob("*.json")):
54
- profile = _load_profile(path)
55
- if profile is not None:
56
- profiles.append(profile.model_dump())
57
- return profiles
 
 
 
 
 
 
 
58
 
59
 
60
  @router.get("/{profile_id}", response_model=dict)
61
  async def get_profile(profile_id: str) -> dict:
62
  """Retourne un profil par son id (nom du fichier sans extension)."""
 
 
63
  path = settings.profiles_dir / f"{profile_id}.json"
64
- if not path.exists():
 
 
 
 
 
 
 
65
  raise HTTPException(status_code=404, detail="Profil introuvable")
66
- profile = _load_profile(path)
67
  if profile is None:
68
  raise HTTPException(status_code=422, detail="Profil invalide")
69
  return profile.model_dump()
 
8
  Ils sont validés par CorpusProfile avant d'être retournés.
9
  """
10
  # 1. stdlib
11
+ import asyncio
12
  import json
13
  import logging
14
+ import re
15
  from pathlib import Path
16
 
17
  # 2. third-party
 
51
  if not settings.profiles_dir.is_dir():
52
  logger.warning("profiles_dir introuvable : %s", settings.profiles_dir)
53
  return []
54
+
55
+ def _scan_profiles() -> list[dict]:
56
+ result = []
57
+ for path in sorted(settings.profiles_dir.glob("*.json")):
58
+ profile = _load_profile(path)
59
+ if profile is not None:
60
+ result.append(profile.model_dump())
61
+ return result
62
+
63
+ return await asyncio.to_thread(_scan_profiles)
64
+
65
+
66
+ _SAFE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9_-]*$")
67
 
68
 
69
  @router.get("/{profile_id}", response_model=dict)
70
  async def get_profile(profile_id: str) -> dict:
71
  """Retourne un profil par son id (nom du fichier sans extension)."""
72
+ if not _SAFE_ID_RE.match(profile_id):
73
+ raise HTTPException(status_code=400, detail="profile_id invalide")
74
  path = settings.profiles_dir / f"{profile_id}.json"
75
+
76
+ def _read() -> CorpusProfile | None:
77
+ if not path.exists():
78
+ return None
79
+ return _load_profile(path)
80
+
81
+ profile = await asyncio.to_thread(_read)
82
+ if profile is None and not path.exists():
83
  raise HTTPException(status_code=404, detail="Profil introuvable")
 
84
  if profile is None:
85
  raise HTTPException(status_code=422, detail="Profil invalide")
86
  return profile.model_dump()
backend/app/api/v1/search.py CHANGED
@@ -7,6 +7,7 @@ ImplΓ©mentation MVP : scan des fichiers master.json (pas d'index externe).
7
  Insensible à la casse et aux accents (unicodedata NFD + ASCII).
8
  """
9
  # 1. stdlib
 
10
  import json
11
  import logging
12
  import unicodedata
@@ -95,7 +96,8 @@ def _score_master(data: dict, query_normalized: str) -> tuple[int, str]:
95
 
96
  @router.get("/search", response_model=list[SearchResult])
97
  async def search_pages(
98
- q: str = Query(..., min_length=2, description="Requête de recherche (min. 2 caractères)"),
 
99
  ) -> list[SearchResult]:
100
  """Recherche plein texte dans les master.json de tous les corpus.
101
 
@@ -106,29 +108,32 @@ async def search_pages(
106
  query_normalized = _normalize(q.strip())
107
  data_dir = _config_module.settings.data_dir
108
 
109
- results: list[SearchResult] = []
110
-
111
- for master_path in data_dir.glob("corpora/*/pages/*/master.json"):
112
- try:
113
- raw: dict = json.loads(master_path.read_text(encoding="utf-8"))
114
- except (json.JSONDecodeError, OSError):
115
- continue
116
-
117
- score, excerpt = _score_master(raw, query_normalized)
118
- if score == 0:
119
- continue
120
-
121
- results.append(
122
- SearchResult(
123
- page_id=raw.get("page_id", ""),
124
- folio_label=raw.get("folio_label", ""),
125
- manuscript_id=raw.get("manuscript_id", ""),
126
- excerpt=excerpt,
127
- score=score,
128
- corpus_profile=raw.get("corpus_profile", ""),
 
 
129
  )
130
- )
 
131
 
132
- results.sort(key=lambda r: r.score, reverse=True)
133
  logger.info("Recherche exécutée", extra={"q": q, "results": len(results)})
134
- return results
 
7
  Insensible à la casse et aux accents (unicodedata NFD + ASCII).
8
  """
9
  # 1. stdlib
10
+ import asyncio
11
  import json
12
  import logging
13
  import unicodedata
 
96
 
97
  @router.get("/search", response_model=list[SearchResult])
98
  async def search_pages(
99
+ q: str = Query(..., min_length=2, max_length=500, description="Requête de recherche (2–500 caractères)"),
100
+ limit: int = Query(200, ge=1, le=2000, description="Nombre maximum de résultats"),
101
  ) -> list[SearchResult]:
102
  """Recherche plein texte dans les master.json de tous les corpus.
103
 
 
108
  query_normalized = _normalize(q.strip())
109
  data_dir = _config_module.settings.data_dir
110
 
111
+ def _scan() -> list[SearchResult]:
112
+ """Scan bloquant exécuté dans un thread dédié."""
113
+ hits: list[SearchResult] = []
114
+ for master_path in data_dir.glob("corpora/*/pages/*/master.json"):
115
+ try:
116
+ raw: dict = json.loads(master_path.read_text(encoding="utf-8"))
117
+ except (json.JSONDecodeError, OSError):
118
+ continue
119
+
120
+ score, excerpt = _score_master(raw, query_normalized)
121
+ if score == 0:
122
+ continue
123
+
124
+ hits.append(
125
+ SearchResult(
126
+ page_id=raw.get("page_id", ""),
127
+ folio_label=raw.get("folio_label", ""),
128
+ manuscript_id=raw.get("manuscript_id", ""),
129
+ excerpt=excerpt,
130
+ score=score,
131
+ corpus_profile=raw.get("corpus_profile", ""),
132
+ )
133
  )
134
+ hits.sort(key=lambda r: r.score, reverse=True)
135
+ return hits
136
 
137
+ results = await asyncio.to_thread(_scan)
138
  logger.info("Recherche exécutée", extra={"q": q, "results": len(results)})
139
+ return results[:limit]
backend/app/config.py CHANGED
@@ -1,17 +1,17 @@
1
  """
2
  Configuration globale de la plateforme, chargée depuis les variables d'environnement.
3
 
4
- Équivalent fonctionnel de pydantic-settings sans dépendance externe :
5
- - les valeurs sont lues depuis os.environ au moment de l'instanciation
6
  - l'objet `settings` est importé partout dans l'application
7
  - dans les tests : monkeypatch.setattr(config, "settings", ...) pour surcharger
8
  """
9
  # 1. stdlib
10
- import os
11
  from pathlib import Path
12
 
13
  # 2. third-party
14
- from pydantic import BaseModel, ConfigDict
 
15
 
16
  # Racine du dépôt — résolue depuis l'emplacement absolu de ce fichier.
17
  # config.py se trouve dans backend/app/ ; 3 parents remontent à la racine.
@@ -19,14 +19,17 @@ from pydantic import BaseModel, ConfigDict
19
  _REPO_ROOT = Path(__file__).resolve().parent.parent.parent
20
 
21
 
22
- class Settings(BaseModel):
23
  """ParamΓ¨tres d'application lus depuis les variables d'environnement.
24
 
25
  Toutes les clΓ©s API sont optionnelles (None si non configurΓ©es).
26
  Elles ne sont jamais loguΓ©es ni exportΓ©es (R06).
27
  """
28
 
29
- model_config = ConfigDict(frozen=False)
 
 
 
30
 
31
  # ── Serveur ──────────────────────────────────────────────────────────────
32
  base_url: str = "http://localhost:8000"
@@ -50,21 +53,4 @@ class Settings(BaseModel):
50
  mistral_api_key: str | None = None
51
 
52
 
53
- def _load_settings() -> Settings:
54
- """Lit les variables d'environnement et construit l'objet Settings."""
55
- return Settings(
56
- base_url=os.getenv("BASE_URL", "http://localhost:8000"),
57
- data_dir=Path(os.getenv("DATA_DIR", "data")),
58
- profiles_dir=Path(os.getenv("PROFILES_DIR", str(_REPO_ROOT / "profiles"))),
59
- prompts_dir=Path(os.getenv("PROMPTS_DIR", str(_REPO_ROOT / "prompts"))),
60
- database_url=os.getenv(
61
- "DATABASE_URL", "sqlite+aiosqlite:///./scriptorium.db"
62
- ),
63
- google_ai_studio_api_key=os.getenv("GOOGLE_AI_STUDIO_API_KEY"),
64
- vertex_api_key=os.getenv("VERTEX_API_KEY"),
65
- vertex_service_account_json=os.getenv("VERTEX_SERVICE_ACCOUNT_JSON"),
66
- mistral_api_key=os.getenv("MISTRAL_API_KEY"),
67
- )
68
-
69
-
70
- settings: Settings = _load_settings()
 
1
  """
2
  Configuration globale de la plateforme, chargΓ©e depuis les variables d'environnement.
3
 
4
+ Utilise pydantic-settings (CLAUDE.md Β§2, Β§7) :
5
+ - les valeurs sont lues depuis os.environ / fichier .env au moment de l'instanciation
6
  - l'objet `settings` est importΓ© partout dans l'application
7
  - dans les tests : monkeypatch.setattr(config, "settings", ...) pour surcharger
8
  """
9
  # 1. stdlib
 
10
  from pathlib import Path
11
 
12
  # 2. third-party
13
+ from pydantic import ConfigDict
14
+ from pydantic_settings import BaseSettings
15
 
16
  # Racine du dΓ©pΓ΄t β€” rΓ©solue depuis l'emplacement absolu de ce fichier.
17
  # config.py se trouve dans backend/app/ ; 3 parents remontent Γ  la racine.
 
19
  _REPO_ROOT = Path(__file__).resolve().parent.parent.parent
20
 
21
 
22
+ class Settings(BaseSettings):
23
  """ParamΓ¨tres d'application lus depuis les variables d'environnement.
24
 
25
  Toutes les clΓ©s API sont optionnelles (None si non configurΓ©es).
26
  Elles ne sont jamais loguΓ©es ni exportΓ©es (R06).
27
  """
28
 
29
+ model_config = ConfigDict(
30
+ env_file=".env",
31
+ extra="ignore",
32
+ )
33
 
34
  # ── Serveur ──────────────────────────────────────────────────────────────
35
  base_url: str = "http://localhost:8000"
 
53
  mistral_api_key: str | None = None
54
 
55
 
56
+ settings: Settings = Settings()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/main.py CHANGED
@@ -65,11 +65,11 @@ app = FastAPI(
65
  lifespan=lifespan,
66
  )
67
 
68
- # ── CORS (dev : tous les origines autorisΓ©s) ──────────────────────────────────
69
  app.add_middleware(
70
  CORSMiddleware,
71
  allow_origins=["*"],
72
- allow_credentials=True,
73
  allow_methods=["*"],
74
  allow_headers=["*"],
75
  )
@@ -97,8 +97,9 @@ async def serve_frontend(full_path: str) -> FileResponse | RedirectResponse:
97
  if full_path.startswith("api/"):
98
  raise HTTPException(status_code=404, detail=f"Endpoint not found: /{full_path}")
99
  if _STATIC_DIR.is_dir():
100
- candidate = _STATIC_DIR / full_path
101
- if candidate.is_file():
 
102
  return FileResponse(candidate)
103
  index = _STATIC_DIR / "index.html"
104
  if index.exists():
 
65
  lifespan=lifespan,
66
  )
67
 
68
+ # ── CORS (dev : toutes les origines autorisΓ©es, sans credentials) ──────────────
69
  app.add_middleware(
70
  CORSMiddleware,
71
  allow_origins=["*"],
72
+ allow_credentials=False,
73
  allow_methods=["*"],
74
  allow_headers=["*"],
75
  )
 
97
  if full_path.startswith("api/"):
98
  raise HTTPException(status_code=404, detail=f"Endpoint not found: /{full_path}")
99
  if _STATIC_DIR.is_dir():
100
+ candidate = (_STATIC_DIR / full_path).resolve()
101
+ # EmpΓͺcher le path traversal : le fichier rΓ©solu doit Γͺtre sous _STATIC_DIR
102
+ if candidate.is_file() and str(candidate).startswith(str(_STATIC_DIR.resolve())):
103
  return FileResponse(candidate)
104
  index = _STATIC_DIR / "index.html"
105
  if index.exists():
backend/app/models/corpus.py CHANGED
@@ -6,6 +6,7 @@ Ils NE se substituent PAS aux schΓ©mas Pydantic (source canonique des types).
6
  """
7
  # 1. stdlib
8
  from datetime import datetime, timezone
 
9
 
10
  # 2. third-party
11
  from sqlalchemy import DateTime, Float, ForeignKey, Integer, String, Text
@@ -24,8 +25,12 @@ class CorpusModel(Base):
24
  slug: Mapped[str] = mapped_column(String, unique=True, nullable=False, index=True)
25
  title: Mapped[str] = mapped_column(String, nullable=False)
26
  profile_id: Mapped[str] = mapped_column(String, nullable=False)
27
- created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
28
- updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
 
 
 
 
29
 
30
  manuscripts: Mapped[list["ManuscriptModel"]] = relationship(
31
  back_populates="corpus", cascade="all, delete-orphan"
 
6
  """
7
  # 1. stdlib
8
  from datetime import datetime, timezone
9
+ from functools import partial
10
 
11
  # 2. third-party
12
  from sqlalchemy import DateTime, Float, ForeignKey, Integer, String, Text
 
25
  slug: Mapped[str] = mapped_column(String, unique=True, nullable=False, index=True)
26
  title: Mapped[str] = mapped_column(String, nullable=False)
27
  profile_id: Mapped[str] = mapped_column(String, nullable=False)
28
+ created_at: Mapped[datetime] = mapped_column(
29
+ DateTime, nullable=False, default=partial(datetime.now, tz=timezone.utc)
30
+ )
31
+ updated_at: Mapped[datetime] = mapped_column(
32
+ DateTime, nullable=False, default=partial(datetime.now, tz=timezone.utc)
33
+ )
34
 
35
  manuscripts: Mapped[list["ManuscriptModel"]] = relationship(
36
  back_populates="corpus", cascade="all, delete-orphan"
backend/app/models/job.py CHANGED
@@ -10,7 +10,8 @@ Cycle de vie :
10
  β†˜ failed
11
  """
12
  # 1. stdlib
13
- from datetime import datetime
 
14
 
15
  # 2. third-party
16
  from sqlalchemy import DateTime, ForeignKey, String, Text
@@ -37,4 +38,6 @@ class JobModel(Base):
37
  started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
38
  finished_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
39
  error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
40
- created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
 
 
 
10
  β†˜ failed
11
  """
12
  # 1. stdlib
13
+ from datetime import datetime, timezone
14
+ from functools import partial
15
 
16
  # 2. third-party
17
  from sqlalchemy import DateTime, ForeignKey, String, Text
 
38
  started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
39
  finished_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
40
  error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
41
+ created_at: Mapped[datetime] = mapped_column(
42
+ DateTime, nullable=False, default=partial(datetime.now, tz=timezone.utc)
43
+ )
backend/app/schemas/page_master.py CHANGED
@@ -29,14 +29,25 @@ class Region(BaseModel):
29
 
30
  @field_validator("bbox")
31
  @classmethod
32
- def bbox_must_be_positive(cls, v: list[int]) -> list[int]:
33
  if any(x < 0 for x in v):
34
- raise ValueError("bbox values must be >= 0")
35
  if v[2] <= 0 or v[3] <= 0:
36
- raise ValueError("bbox width and height must be > 0")
37
  return v
38
 
39
 
 
 
 
 
 
 
 
 
 
 
 
40
  class OCRResult(BaseModel):
41
  diplomatic_text: str = ""
42
  blocks: list[dict] = []
@@ -51,6 +62,13 @@ class Translation(BaseModel):
51
  en: str = ""
52
 
53
 
 
 
 
 
 
 
 
54
  class CommentaryClaim(BaseModel):
55
  claim: str
56
  evidence_region_ids: list[str] = []
@@ -64,6 +82,7 @@ class Commentary(BaseModel):
64
 
65
 
66
  class ProcessingInfo(BaseModel):
 
67
  model_id: str
68
  model_display_name: str
69
  prompt_version: str
@@ -96,11 +115,11 @@ class PageMaster(BaseModel):
96
  folio_label: str
97
  sequence: int
98
 
99
- image: dict
100
  layout: dict
101
  ocr: OCRResult | None = None
102
  translation: Translation | None = None
103
- summary: dict | None = None
104
  commentary: Commentary | None = None
105
  extensions: dict[str, Any] = {}
106
 
 
29
 
30
  @field_validator("bbox")
31
  @classmethod
32
+ def bbox_must_be_valid(cls, v: list[int]) -> list[int]:
33
  if any(x < 0 for x in v):
34
+ raise ValueError("bbox: toutes les valeurs doivent Γͺtre >= 0")
35
  if v[2] <= 0 or v[3] <= 0:
36
+ raise ValueError("bbox: width et height doivent Γͺtre > 0")
37
  return v
38
 
39
 
40
+ class ImageInfo(BaseModel):
41
+ """MΓ©tadonnΓ©es image β€” CLAUDE.md Β§4.2."""
42
+
43
+ master: str
44
+ derivative_web: str | None = None
45
+ thumbnail: str | None = None
46
+ iiif_base: str | None = None
47
+ width: int
48
+ height: int
49
+
50
+
51
  class OCRResult(BaseModel):
52
  diplomatic_text: str = ""
53
  blocks: list[dict] = []
 
62
  en: str = ""
63
 
64
 
65
+ class Summary(BaseModel):
66
+ """RΓ©sumΓ© β€” CLAUDE.md Β§4.2."""
67
+
68
+ short: str = ""
69
+ detailed: str = ""
70
+
71
+
72
  class CommentaryClaim(BaseModel):
73
  claim: str
74
  evidence_region_ids: list[str] = []
 
82
 
83
 
84
  class ProcessingInfo(BaseModel):
85
+ provider: str
86
  model_id: str
87
  model_display_name: str
88
  prompt_version: str
 
115
  folio_label: str
116
  sequence: int
117
 
118
+ image: ImageInfo
119
  layout: dict
120
  ocr: OCRResult | None = None
121
  translation: Translation | None = None
122
+ summary: Summary | None = None
123
  commentary: Commentary | None = None
124
  extensions: dict[str, Any] = {}
125
 
backend/app/services/ai/__init__.py CHANGED
@@ -1,19 +1,31 @@
1
  """
2
  Services AI β€” providers Google AI, registre de modΓ¨les, et analyse IA.
 
 
 
 
3
  """
4
- from app.services.ai.analyzer import run_primary_analysis
5
- from app.services.ai.client_factory import build_client
6
- from app.services.ai.model_registry import build_model_config, list_all_models
7
- from app.services.ai.prompt_loader import load_and_render_prompt
8
- from app.services.ai.provider_google_ai import GoogleAIProvider
9
- from app.services.ai.provider_vertex_key import VertexAPIKeyProvider
10
- from app.services.ai.provider_vertex_sa import VertexServiceAccountProvider
11
- from app.services.ai.response_parser import ParseError, parse_ai_response
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  __all__ = [
14
- "GoogleAIProvider",
15
- "VertexAPIKeyProvider",
16
- "VertexServiceAccountProvider",
17
  "list_all_models",
18
  "build_model_config",
19
  "build_client",
 
1
  """
2
  Services AI β€” providers Google AI, registre de modΓ¨les, et analyse IA.
3
+
4
+ Les imports de providers sont diffΓ©rΓ©s (lazy) pour Γ©viter de charger les SDK
5
+ tiers (google-genai, mistralai) au dΓ©marrage. Cela permet Γ  l'application
6
+ de fonctionner mΓͺme si un SDK n'est pas installΓ©.
7
  """
8
+
9
+
10
+ def __getattr__(name: str):
11
+ """Import paresseux — les symboles sont résolus au premier accès."""
12
+ _lazy_map = {
13
+ "run_primary_analysis": "app.services.ai.analyzer",
14
+ "build_client": "app.services.ai.client_factory",
15
+ "build_model_config": "app.services.ai.model_registry",
16
+ "list_all_models": "app.services.ai.model_registry",
17
+ "load_and_render_prompt": "app.services.ai.prompt_loader",
18
+ "parse_ai_response": "app.services.ai.response_parser",
19
+ "ParseError": "app.services.ai.response_parser",
20
+ }
21
+ if name in _lazy_map:
22
+ import importlib
23
+ module = importlib.import_module(_lazy_map[name])
24
+ return getattr(module, name)
25
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
26
+
27
 
28
  __all__ = [
 
 
 
29
  "list_all_models",
30
  "build_model_config",
31
  "build_client",
backend/app/services/ai/analyzer.py CHANGED
@@ -13,8 +13,8 @@ from pathlib import Path
13
  from app.schemas.corpus_profile import CorpusProfile
14
  from app.schemas.image import ImageDerivativeInfo
15
  from app.schemas.model_config import ModelConfig
16
- from app.schemas.page_master import EditorialInfo, EditorialStatus, PageMaster, ProcessingInfo
17
- from app.services.ai.master_writer import write_gemini_raw, write_master_json
18
  from app.services.ai.model_registry import get_provider
19
  from app.services.ai.prompt_loader import load_and_render_prompt
20
  from app.services.ai.response_parser import ParseError, parse_ai_response # noqa: F401
@@ -37,7 +37,7 @@ def run_primary_analysis(
37
  ) -> PageMaster:
38
  """Analyse primaire d'un folio : charge le prompt, appelle l'IA, Γ©crit les fichiers.
39
 
40
- Respecte R05 : gemini_raw.json est toujours Γ©crit en premier, mΓͺme en cas
41
  d'erreur de parsing. master.json n'est Γ©crit QUE si le parsing a rΓ©ussi.
42
 
43
  Le provider est sΓ©lectionnΓ© dynamiquement depuis model_config.provider ;
@@ -57,7 +57,7 @@ def run_primary_analysis(
57
  project_root: racine du projet (pour rΓ©soudre les chemins des prompts).
58
 
59
  Returns:
60
- PageMaster validΓ© (gemini_raw.json et master.json Γ©crits sur disque).
61
 
62
  Raises:
63
  ParseError: si la rΓ©ponse IA n'est pas un JSON valide.
@@ -66,7 +66,7 @@ def run_primary_analysis(
66
  """
67
  # ── Chemins de sortie ───────────────────────────────────────────────────
68
  page_dir = base_data_dir / "corpora" / corpus_slug / "pages" / folio_label
69
- raw_path = page_dir / "gemini_raw.json"
70
  master_path = page_dir / "master.json"
71
 
72
  # ── 1. Chargement et rendu du prompt (R04) ──────────────────────────────
@@ -76,6 +76,7 @@ def run_primary_analysis(
76
  context = {
77
  "profile_label": corpus_profile.label,
78
  "language_hints": ", ".join(corpus_profile.language_hints),
 
79
  "script_type": corpus_profile.script_type.value,
80
  }
81
  prompt_text = load_and_render_prompt(prompt_abs_path, context)
@@ -85,7 +86,12 @@ def run_primary_analysis(
85
  )
86
 
87
  # ── 2. Chargement de l'image dΓ©rivΓ©e ────────────────────────────────────
88
- jpeg_bytes = derivative_image_path.read_bytes()
 
 
 
 
 
89
 
90
  # ── 3. Appel IA via le provider sΓ©lectionnΓ© ─────────────────────────────
91
  provider = get_provider(model_config.provider)
@@ -104,8 +110,8 @@ def run_primary_analysis(
104
  model_id=model_config.selected_model_id,
105
  )
106
 
107
- # ── 4. Γ‰criture gemini_raw.json TOUJOURS EN PREMIER (R05) ───────────────
108
- write_gemini_raw(raw_text, raw_path)
109
 
110
  # ── 5. Parsing + validation (ParseError si JSON invalide) ───────────────
111
  layout, ocr = parse_ai_response(raw_text)
@@ -118,16 +124,17 @@ def run_primary_analysis(
118
  manuscript_id=manuscript_id,
119
  folio_label=folio_label,
120
  sequence=sequence,
121
- image={
122
- "original_url": image_info.original_url,
123
- "derivative_web": image_info.derivative_path,
124
- "thumbnail": image_info.thumbnail_path,
125
- "width": image_info.derivative_width,
126
- "height": image_info.derivative_height,
127
- },
128
  layout=layout,
129
  ocr=ocr,
130
  processing=ProcessingInfo(
 
131
  model_id=model_config.selected_model_id,
132
  model_display_name=model_config.selected_model_display_name,
133
  prompt_version=prompt_rel_path,
 
13
  from app.schemas.corpus_profile import CorpusProfile
14
  from app.schemas.image import ImageDerivativeInfo
15
  from app.schemas.model_config import ModelConfig
16
+ from app.schemas.page_master import EditorialInfo, EditorialStatus, ImageInfo, PageMaster, ProcessingInfo
17
+ from app.services.ai.master_writer import write_ai_raw, write_master_json
18
  from app.services.ai.model_registry import get_provider
19
  from app.services.ai.prompt_loader import load_and_render_prompt
20
  from app.services.ai.response_parser import ParseError, parse_ai_response # noqa: F401
 
37
  ) -> PageMaster:
38
  """Analyse primaire d'un folio : charge le prompt, appelle l'IA, Γ©crit les fichiers.
39
 
40
+ Respecte R05 : ai_raw.json est toujours Γ©crit en premier, mΓͺme en cas
41
  d'erreur de parsing. master.json n'est Γ©crit QUE si le parsing a rΓ©ussi.
42
 
43
  Le provider est sΓ©lectionnΓ© dynamiquement depuis model_config.provider ;
 
57
  project_root: racine du projet (pour rΓ©soudre les chemins des prompts).
58
 
59
  Returns:
60
+ PageMaster validΓ© (ai_raw.json et master.json Γ©crits sur disque).
61
 
62
  Raises:
63
  ParseError: si la rΓ©ponse IA n'est pas un JSON valide.
 
66
  """
67
  # ── Chemins de sortie ───────────────────────────────────────────────────
68
  page_dir = base_data_dir / "corpora" / corpus_slug / "pages" / folio_label
69
+ raw_path = page_dir / "ai_raw.json"
70
  master_path = page_dir / "master.json"
71
 
72
  # ── 1. Chargement et rendu du prompt (R04) ──────────────────────────────
 
76
  context = {
77
  "profile_label": corpus_profile.label,
78
  "language_hints": ", ".join(corpus_profile.language_hints),
79
+ "primary_language": corpus_profile.language_hints[0] if corpus_profile.language_hints else "la",
80
  "script_type": corpus_profile.script_type.value,
81
  }
82
  prompt_text = load_and_render_prompt(prompt_abs_path, context)
 
86
  )
87
 
88
  # ── 2. Chargement de l'image dΓ©rivΓ©e ────────────────────────────────────
89
+ if not derivative_image_path.exists():
90
+ raise FileNotFoundError(f"Image dΓ©rivΓ©e introuvable : {derivative_image_path}")
91
+ try:
92
+ jpeg_bytes = derivative_image_path.read_bytes()
93
+ except OSError as exc:
94
+ raise RuntimeError(f"Erreur lecture image {derivative_image_path} : {exc}") from exc
95
 
96
  # ── 3. Appel IA via le provider sΓ©lectionnΓ© ─────────────────────────────
97
  provider = get_provider(model_config.provider)
 
110
  model_id=model_config.selected_model_id,
111
  )
112
 
113
+ # ── 4. Γ‰criture ai_raw.json TOUJOURS EN PREMIER (R05) ─────────────────
114
+ write_ai_raw(raw_text, raw_path)
115
 
116
  # ── 5. Parsing + validation (ParseError si JSON invalide) ───────────────
117
  layout, ocr = parse_ai_response(raw_text)
 
124
  manuscript_id=manuscript_id,
125
  folio_label=folio_label,
126
  sequence=sequence,
127
+ image=ImageInfo(
128
+ master=image_info.original_url,
129
+ derivative_web=image_info.derivative_path,
130
+ thumbnail=image_info.thumbnail_path,
131
+ width=image_info.derivative_width,
132
+ height=image_info.derivative_height,
133
+ ),
134
  layout=layout,
135
  ocr=ocr,
136
  processing=ProcessingInfo(
137
+ provider=model_config.provider.value if hasattr(model_config.provider, "value") else str(model_config.provider),
138
  model_id=model_config.selected_model_id,
139
  model_display_name=model_config.selected_model_display_name,
140
  prompt_version=prompt_rel_path,
backend/app/services/ai/master_writer.py CHANGED
@@ -1,8 +1,8 @@
1
  """
2
- Γ‰criture des fichiers gemini_raw.json et master.json (R02, R05).
3
 
4
  RΓ¨gle R05 non nΓ©gociable :
5
- 1. gemini_raw.json est TOUJOURS Γ©crit en premier.
6
  2. master.json n'est Γ©crit QUE si le parsing et la validation Pydantic ont rΓ©ussi.
7
  """
8
  # 1. stdlib
@@ -16,24 +16,24 @@ from app.schemas.page_master import PageMaster
16
  logger = logging.getLogger(__name__)
17
 
18
 
19
- def write_gemini_raw(raw_text: str, output_path: Path) -> None:
20
- """Γ‰crit la rΓ©ponse brute de l'IA dans gemini_raw.json (R05).
21
 
22
  Toujours appelΓ© AVANT toute tentative de parsing.
23
  Le contenu est enveloppΓ© dans un objet JSON pour garantir un fichier valide,
24
  mΓͺme si la rΓ©ponse IA n'est pas du JSON.
25
-
26
- Args:
27
- raw_text: texte brut retournΓ© par l'API Google AI.
28
- output_path: chemin complet du fichier de sortie (gemini_raw.json).
29
  """
30
- output_path.parent.mkdir(parents=True, exist_ok=True)
31
- payload = {"response_text": raw_text}
32
- output_path.write_text(
33
- json.dumps(payload, ensure_ascii=False, indent=2),
34
- encoding="utf-8",
35
- )
36
- logger.info("gemini_raw.json Γ©crit", extra={"path": str(output_path)})
 
 
 
 
37
 
38
 
39
  def write_master_json(page_master: PageMaster, output_path: Path) -> None:
@@ -41,14 +41,14 @@ def write_master_json(page_master: PageMaster, output_path: Path) -> None:
41
 
42
  N'est appelΓ© QUE si le parsing et la validation Pydantic ont rΓ©ussi.
43
  CrΓ©e les dossiers parents si nΓ©cessaire.
44
-
45
- Args:
46
- page_master: instance PageMaster validΓ©e par Pydantic.
47
- output_path: chemin complet du fichier de sortie (master.json).
48
  """
49
- output_path.parent.mkdir(parents=True, exist_ok=True)
50
- output_path.write_text(
51
- page_master.model_dump_json(indent=2),
52
- encoding="utf-8",
53
- )
 
 
 
 
54
  logger.info("master.json Γ©crit", extra={"path": str(output_path)})
 
1
  """
2
+ Γ‰criture des fichiers ai_raw.json et master.json (R02, R05).
3
 
4
  RΓ¨gle R05 non nΓ©gociable :
5
+ 1. ai_raw.json est TOUJOURS Γ©crit en premier.
6
  2. master.json n'est Γ©crit QUE si le parsing et la validation Pydantic ont rΓ©ussi.
7
  """
8
  # 1. stdlib
 
16
  logger = logging.getLogger(__name__)
17
 
18
 
19
+ def write_ai_raw(raw_text: str, output_path: Path) -> None:
20
+ """Γ‰crit la rΓ©ponse brute de l'IA dans ai_raw.json (R05).
21
 
22
  Toujours appelΓ© AVANT toute tentative de parsing.
23
  Le contenu est enveloppΓ© dans un objet JSON pour garantir un fichier valide,
24
  mΓͺme si la rΓ©ponse IA n'est pas du JSON.
 
 
 
 
25
  """
26
+ try:
27
+ output_path.parent.mkdir(parents=True, exist_ok=True)
28
+ payload = {"response_text": raw_text}
29
+ output_path.write_text(
30
+ json.dumps(payload, ensure_ascii=False, indent=2),
31
+ encoding="utf-8",
32
+ )
33
+ except OSError as exc:
34
+ logger.error("Γ‰criture ai_raw.json Γ©chouΓ©e", extra={"path": str(output_path), "error": str(exc)})
35
+ raise
36
+ logger.info("ai_raw.json Γ©crit", extra={"path": str(output_path)})
37
 
38
 
39
  def write_master_json(page_master: PageMaster, output_path: Path) -> None:
 
41
 
42
  N'est appelΓ© QUE si le parsing et la validation Pydantic ont rΓ©ussi.
43
  CrΓ©e les dossiers parents si nΓ©cessaire.
 
 
 
 
44
  """
45
+ try:
46
+ output_path.parent.mkdir(parents=True, exist_ok=True)
47
+ output_path.write_text(
48
+ page_master.model_dump_json(indent=2),
49
+ encoding="utf-8",
50
+ )
51
+ except OSError as exc:
52
+ logger.error("Γ‰criture master.json Γ©chouΓ©e", extra={"path": str(output_path), "error": str(exc)})
53
+ raise
54
  logger.info("master.json Γ©crit", extra={"path": str(output_path)})
backend/app/services/ai/model_registry.py CHANGED
@@ -1,5 +1,8 @@
1
  """
2
  Registre agrΓ©gΓ© des modΓ¨les disponibles tous providers confondus.
 
 
 
3
  """
4
  # 1. stdlib
5
  import logging
@@ -8,10 +11,6 @@ from datetime import datetime, timezone
8
  # 2. local
9
  from app.schemas.model_config import ModelConfig, ModelInfo, ProviderType
10
  from app.services.ai.base import AIProvider
11
- from app.services.ai.provider_google_ai import GoogleAIProvider
12
- from app.services.ai.provider_mistral import MistralProvider
13
- from app.services.ai.provider_vertex_key import VertexAPIKeyProvider
14
- from app.services.ai.provider_vertex_sa import VertexServiceAccountProvider
15
 
16
  logger = logging.getLogger(__name__)
17
 
@@ -24,13 +23,27 @@ _PROVIDER_DISPLAY_NAMES: dict[ProviderType, str] = {
24
  }
25
 
26
 
 
 
 
27
  def _build_providers() -> list[AIProvider]:
28
- return [
 
 
 
 
 
 
 
 
 
 
29
  GoogleAIProvider(),
30
  VertexAPIKeyProvider(),
31
  VertexServiceAccountProvider(),
32
  MistralProvider(),
33
  ]
 
34
 
35
 
36
  def get_available_providers() -> list[dict]:
 
1
  """
2
  Registre agrΓ©gΓ© des modΓ¨les disponibles tous providers confondus.
3
+
4
+ Les imports de providers sont diffΓ©rΓ©s dans _build_providers() pour Γ©viter
5
+ de charger les SDK tiers (google-genai, mistralai) au niveau module.
6
  """
7
  # 1. stdlib
8
  import logging
 
11
  # 2. local
12
  from app.schemas.model_config import ModelConfig, ModelInfo, ProviderType
13
  from app.services.ai.base import AIProvider
 
 
 
 
14
 
15
  logger = logging.getLogger(__name__)
16
 
 
23
  }
24
 
25
 
26
+ _cached_providers: list[AIProvider] | None = None
27
+
28
+
29
  def _build_providers() -> list[AIProvider]:
30
+ """Construit la liste des providers β€” imports diffΓ©rΓ©s, rΓ©sultat mis en cache."""
31
+ global _cached_providers
32
+ if _cached_providers is not None:
33
+ return _cached_providers
34
+
35
+ from app.services.ai.provider_google_ai import GoogleAIProvider
36
+ from app.services.ai.provider_mistral import MistralProvider
37
+ from app.services.ai.provider_vertex_key import VertexAPIKeyProvider
38
+ from app.services.ai.provider_vertex_sa import VertexServiceAccountProvider
39
+
40
+ _cached_providers = [
41
  GoogleAIProvider(),
42
  VertexAPIKeyProvider(),
43
  VertexServiceAccountProvider(),
44
  MistralProvider(),
45
  ]
46
+ return _cached_providers
47
 
48
 
49
  def get_available_providers() -> list[dict]:
backend/app/services/ai/prompt_loader.py CHANGED
@@ -6,6 +6,7 @@ Le code charge le fichier, substitue les variables {{nom}}, envoie Γ  l'API.
6
  """
7
  # 1. stdlib
8
  import logging
 
9
  from pathlib import Path
10
 
11
  logger = logging.getLogger(__name__)
@@ -38,6 +39,11 @@ def load_and_render_prompt(template_path: str | Path, context: dict[str, str]) -
38
  for key, value in context.items():
39
  rendered = rendered.replace("{{" + key + "}}", value)
40
 
 
 
 
 
 
41
  logger.debug(
42
  "Prompt chargΓ© et rendu",
43
  extra={"template": str(path), "variables": list(context.keys())},
 
6
  """
7
  # 1. stdlib
8
  import logging
9
+ import re
10
  from pathlib import Path
11
 
12
  logger = logging.getLogger(__name__)
 
39
  for key, value in context.items():
40
  rendered = rendered.replace("{{" + key + "}}", value)
41
 
42
+ # VΓ©rifier qu'il ne reste pas de variables non rΓ©solues (CLAUDE.md Β§8)
43
+ unresolved = re.findall(r"\{\{\w+\}\}", rendered)
44
+ if unresolved:
45
+ raise ValueError(f"Variables non rΓ©solues dans le prompt : {unresolved}")
46
+
47
  logger.debug(
48
  "Prompt chargΓ© et rendu",
49
  extra={"template": str(path), "variables": list(context.keys())},
backend/app/services/ai/provider_google_ai.py CHANGED
@@ -60,8 +60,15 @@ class GoogleAIProvider(AIProvider):
60
  raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
61
  client = genai.Client(api_key=os.environ[_ENV_KEY])
62
  image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
63
- response = client.models.generate_content(
64
- model=model_id,
65
- contents=[image_part, prompt],
66
- )
 
 
 
 
 
 
 
67
  return response.text or ""
 
60
  raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
61
  client = genai.Client(api_key=os.environ[_ENV_KEY])
62
  image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
63
+ try:
64
+ response = client.models.generate_content(
65
+ model=model_id,
66
+ contents=[image_part, prompt],
67
+ )
68
+ except Exception as exc:
69
+ logger.error(
70
+ "Appel API Google AI Studio Γ©chouΓ©",
71
+ extra={"model": model_id, "error": str(exc)},
72
+ )
73
+ raise RuntimeError(f"Erreur API Google AI Studio ({model_id}) : {exc}") from exc
74
  return response.text or ""
backend/app/services/ai/provider_mistral.py CHANGED
@@ -208,10 +208,14 @@ class MistralProvider(AIProvider):
208
  # ── Chemin 1 : OCR dΓ©diΓ© ─────────────────────────────────────────────
209
  if _is_ocr_model(model_id):
210
  logger.info("Mistral OCR : endpoint dΓ©diΓ© client.ocr.process()", extra={"model": model_id})
211
- response = client.ocr.process(
212
- model=model_id,
213
- document={"type": "image_url", "image_url": {"url": data_url}},
214
- )
 
 
 
 
215
  # OCRResponse.pages : list[OCRPageObject], chacun avec .markdown
216
  pages = getattr(response, "pages", []) or []
217
  return "\n\n".join(
@@ -233,10 +237,14 @@ class MistralProvider(AIProvider):
233
  )
234
  content = prompt
235
 
236
- response = client.chat.complete(
237
- model=model_id,
238
- messages=[{"role": "user", "content": content}],
239
- )
 
 
 
 
240
  choices = response.choices or []
241
  if not choices:
242
  return ""
 
208
  # ── Chemin 1 : OCR dΓ©diΓ© ─────────────────────────────────────────────
209
  if _is_ocr_model(model_id):
210
  logger.info("Mistral OCR : endpoint dΓ©diΓ© client.ocr.process()", extra={"model": model_id})
211
+ try:
212
+ response = client.ocr.process(
213
+ model=model_id,
214
+ document={"type": "image_url", "image_url": {"url": data_url}},
215
+ )
216
+ except Exception as exc:
217
+ logger.error("Appel Mistral OCR Γ©chouΓ©", extra={"model": model_id, "error": str(exc)})
218
+ raise RuntimeError(f"Erreur API Mistral OCR ({model_id}) : {exc}") from exc
219
  # OCRResponse.pages : list[OCRPageObject], chacun avec .markdown
220
  pages = getattr(response, "pages", []) or []
221
  return "\n\n".join(
 
237
  )
238
  content = prompt
239
 
240
+ try:
241
+ response = client.chat.complete(
242
+ model=model_id,
243
+ messages=[{"role": "user", "content": content}],
244
+ )
245
+ except Exception as exc:
246
+ logger.error("Appel Mistral chat Γ©chouΓ©", extra={"model": model_id, "error": str(exc)})
247
+ raise RuntimeError(f"Erreur API Mistral ({model_id}) : {exc}") from exc
248
  choices = response.choices or []
249
  if not choices:
250
  return ""
backend/app/services/ai/provider_vertex_key.py CHANGED
@@ -22,9 +22,6 @@ retourne toujours False afin d'Γ©viter des appels rΓ©seau vouΓ©s Γ  l'Γ©chec.
22
  import logging
23
  import os
24
 
25
- # 2. third-party
26
- from google.genai import types # noqa: F401 (conservΓ© pour import cohΓ©rence)
27
-
28
  # 3. local
29
  from app.schemas.model_config import ModelInfo, ProviderType
30
  from app.services.ai.base import AIProvider
 
22
  import logging
23
  import os
24
 
 
 
 
25
  # 3. local
26
  from app.schemas.model_config import ModelInfo, ProviderType
27
  from app.services.ai.base import AIProvider
backend/app/services/ai/provider_vertex_sa.py CHANGED
@@ -90,8 +90,15 @@ class VertexServiceAccountProvider(AIProvider):
90
  raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
91
  client = self._build_client()
92
  image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
93
- response = client.models.generate_content(
94
- model=model_id,
95
- contents=[image_part, prompt],
96
- )
 
 
 
 
 
 
 
97
  return response.text or ""
 
90
  raise RuntimeError(f"Variable d'environnement manquante : {_ENV_KEY}")
91
  client = self._build_client()
92
  image_part = types.Part.from_bytes(data=image_bytes, mime_type="image/jpeg")
93
+ try:
94
+ response = client.models.generate_content(
95
+ model=model_id,
96
+ contents=[image_part, prompt],
97
+ )
98
+ except Exception as exc:
99
+ logger.error(
100
+ "Appel API Vertex AI Γ©chouΓ©",
101
+ extra={"model": model_id, "error": str(exc)},
102
+ )
103
+ raise RuntimeError(f"Erreur API Vertex AI ({model_id}) : {exc}") from exc
104
  return response.text or ""
backend/app/services/corpus_runner.py CHANGED
@@ -17,7 +17,6 @@ from sqlalchemy import select
17
  # 3. local
18
  from app.models.database import async_session_factory
19
  from app.models.job import JobModel
20
- from app.services.job_runner import execute_page_job
21
 
22
  logger = logging.getLogger(__name__)
23
 
@@ -54,6 +53,8 @@ async def execute_corpus_job(corpus_id: str) -> dict:
54
  )
55
 
56
  # ExΓ©cution sΓ©quentielle β€” chaque job gΓ¨re sa propre session
 
 
57
  for job_id in job_ids:
58
  await execute_page_job(job_id)
59
 
 
17
  # 3. local
18
  from app.models.database import async_session_factory
19
  from app.models.job import JobModel
 
20
 
21
  logger = logging.getLogger(__name__)
22
 
 
53
  )
54
 
55
  # ExΓ©cution sΓ©quentielle β€” chaque job gΓ¨re sa propre session
56
+ from app.services.job_runner import execute_page_job
57
+
58
  for job_id in job_ids:
59
  await execute_page_job(job_id)
60
 
backend/app/services/export/alto.py CHANGED
@@ -1,7 +1,7 @@
1
  """
2
  GΓ©nΓ©rateur ALTO v4 depuis un PageMaster validΓ© (R02).
3
 
4
- Source canonique : PageMaster uniquement β€” jamais la rΓ©ponse brute gemini_raw.json.
5
  bbox [x, y, width, height] β†’ HPOS / VPOS / WIDTH / HEIGHT (correspondance directe).
6
 
7
  Mapping RegionType β†’ Γ©lΓ©ment ALTO :
@@ -82,7 +82,7 @@ def _build_text_block(
82
  text = fallback_text
83
 
84
  if not text:
85
- return # TextBlock vide β€” valide ALTO
86
 
87
  x, y, w, h = region.bbox
88
  line_el = etree.SubElement(
@@ -160,11 +160,7 @@ def generate_alto(master: PageMaster) -> str:
160
  etree.SubElement(desc, _a("MeasurementUnit")).text = "pixel"
161
 
162
  src_info = etree.SubElement(desc, _a("sourceImageInformation"))
163
- file_name = (
164
- master.image.get("original_url")
165
- or master.image.get("derivative_web")
166
- or master.page_id
167
- )
168
  etree.SubElement(src_info, _a("fileName")).text = str(file_name)
169
 
170
  if master.processing:
@@ -185,8 +181,8 @@ def generate_alto(master: PageMaster) -> str:
185
  # ── Layout ─────────────────────────────────────────────────────────────
186
  layout_el = etree.SubElement(root, _a("Layout"))
187
 
188
- width = int(master.image.get("width", 0))
189
- height = int(master.image.get("height", 0))
190
 
191
  page_id_safe = master.page_id.replace(" ", "_")
192
  page_el = etree.SubElement(
 
1
  """
2
  GΓ©nΓ©rateur ALTO v4 depuis un PageMaster validΓ© (R02).
3
 
4
+ Source canonique : PageMaster uniquement β€” jamais la rΓ©ponse brute ai_raw.json.
5
  bbox [x, y, width, height] β†’ HPOS / VPOS / WIDTH / HEIGHT (correspondance directe).
6
 
7
  Mapping RegionType β†’ Γ©lΓ©ment ALTO :
 
82
  text = fallback_text
83
 
84
  if not text:
85
+ return # TextBlock sans TextLine β€” valide ALTO, rΓ©gion visible dans le layout
86
 
87
  x, y, w, h = region.bbox
88
  line_el = etree.SubElement(
 
160
  etree.SubElement(desc, _a("MeasurementUnit")).text = "pixel"
161
 
162
  src_info = etree.SubElement(desc, _a("sourceImageInformation"))
163
+ file_name = master.image.master or master.image.derivative_web or master.page_id
 
 
 
 
164
  etree.SubElement(src_info, _a("fileName")).text = str(file_name)
165
 
166
  if master.processing:
 
181
  # ── Layout ─────────────────────────────────────────────────────────────
182
  layout_el = etree.SubElement(root, _a("Layout"))
183
 
184
+ width = master.image.width
185
+ height = master.image.height
186
 
187
  page_id_safe = master.page_id.replace(" ", "_")
188
  page_el = etree.SubElement(
backend/app/services/export/iiif.py CHANGED
@@ -74,7 +74,7 @@ def generate_manifest(
74
 
75
  manuscript_id = manuscript_meta["manuscript_id"]
76
  label = manuscript_meta["label"]
77
- language = manuscript_meta.get("language") or "none"
78
 
79
  # Pages dans l'ordre de sΓ©quence (rΓ¨gle absolue β€” structMap PHYSICAL)
80
  pages = sorted(masters, key=lambda m: m.sequence)
@@ -102,17 +102,17 @@ def generate_manifest(
102
  canvas_id = (
103
  f"{base_url}/api/v1/manuscripts/{manuscript_id}/canvas/{page.page_id}"
104
  )
105
- width = int(page.image.get("width", 0))
106
- height = int(page.image.get("height", 0))
107
 
108
  annotation_page_id = f"{canvas_id}/annotation-page/1"
109
  annotation_id = f"{canvas_id}/annotation/painting"
110
- image_url = page.image.get("original_url", "")
111
 
112
  canvas: dict = {
113
  "id": canvas_id,
114
  "type": "Canvas",
115
- "label": {"none": [f"Folio {page.folio_label}"]},
116
  "width": width,
117
  "height": height,
118
  "items": [
 
74
 
75
  manuscript_id = manuscript_meta["manuscript_id"]
76
  label = manuscript_meta["label"]
77
+ language = manuscript_meta.get("language") or "en"
78
 
79
  # Pages dans l'ordre de sΓ©quence (rΓ¨gle absolue β€” structMap PHYSICAL)
80
  pages = sorted(masters, key=lambda m: m.sequence)
 
102
  canvas_id = (
103
  f"{base_url}/api/v1/manuscripts/{manuscript_id}/canvas/{page.page_id}"
104
  )
105
+ width = page.image.width
106
+ height = page.image.height
107
 
108
  annotation_page_id = f"{canvas_id}/annotation-page/1"
109
  annotation_id = f"{canvas_id}/annotation/painting"
110
+ image_url = page.image.master or ""
111
 
112
  canvas: dict = {
113
  "id": canvas_id,
114
  "type": "Canvas",
115
+ "label": {language: [f"Folio {page.folio_label}"]},
116
  "width": width,
117
  "height": height,
118
  "items": [
backend/app/services/export/mets.py CHANGED
@@ -182,7 +182,7 @@ def generate_mets(
182
  f_master = _el(grp_master, f"{_M}file", {"ID": f"IMG_MASTER_{sid}", "MIMETYPE": "image/jpeg"})
183
  _el(f_master, f"{_M}FLocat", {
184
  "LOCTYPE": "URL",
185
- f"{_XL}href": page.image.get("original_url", ""),
186
  f"{_XL}type": "simple",
187
  })
188
 
@@ -191,12 +191,17 @@ def generate_mets(
191
  _el(f_deriv, f"{_M}FLocat", {
192
  "LOCTYPE": "OTHER",
193
  "OTHERLOCTYPE": "filepath",
194
- f"{_XL}href": page.image.get("derivative_web", ""),
195
  f"{_XL}type": "simple",
196
  })
197
 
198
- # ALTO
199
  alto_p = _alto_path(corpus_slug, page.folio_label, base_data_dir)
 
 
 
 
 
200
  f_alto = _el(grp_alto, f"{_M}file", {"ID": f"ALTO_{sid}", "MIMETYPE": "text/xml"})
201
  _el(f_alto, f"{_M}FLocat", {
202
  "LOCTYPE": "OTHER",
 
182
  f_master = _el(grp_master, f"{_M}file", {"ID": f"IMG_MASTER_{sid}", "MIMETYPE": "image/jpeg"})
183
  _el(f_master, f"{_M}FLocat", {
184
  "LOCTYPE": "URL",
185
+ f"{_XL}href": page.image.master or "",
186
  f"{_XL}type": "simple",
187
  })
188
 
 
191
  _el(f_deriv, f"{_M}FLocat", {
192
  "LOCTYPE": "OTHER",
193
  "OTHERLOCTYPE": "filepath",
194
+ f"{_XL}href": page.image.derivative_web or "",
195
  f"{_XL}type": "simple",
196
  })
197
 
198
+ # ALTO (rΓ©fΓ©rence conditionnelle β€” warning si le fichier n'existe pas encore)
199
  alto_p = _alto_path(corpus_slug, page.folio_label, base_data_dir)
200
+ if not Path(alto_p).exists():
201
+ logger.warning(
202
+ "Fichier ALTO absent β€” la rΓ©fΓ©rence METS sera cassΓ©e tant que l'ALTO n'est pas gΓ©nΓ©rΓ©",
203
+ extra={"alto_path": alto_p, "page_id": page.page_id},
204
+ )
205
  f_alto = _el(grp_alto, f"{_M}file", {"ID": f"ALTO_{sid}", "MIMETYPE": "text/xml"})
206
  _el(f_alto, f"{_M}FLocat", {
207
  "LOCTYPE": "OTHER",
backend/app/services/ingest/iiif_fetcher.py CHANGED
@@ -17,7 +17,6 @@ _HEADERS = {
17
  "+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
18
  ),
19
  "Accept": "image/jpeg,image/png,image/*,*/*",
20
- "Referer": "https://gallica.bnf.fr/",
21
  }
22
 
23
 
 
17
  "+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
18
  ),
19
  "Accept": "image/jpeg,image/png,image/*,*/*",
 
20
  }
21
 
22
 
backend/app/services/job_runner.py CHANGED
@@ -33,8 +33,6 @@ from app.models.job import JobModel
33
  from app.models.model_config_db import ModelConfigDB
34
  from app.schemas.corpus_profile import CorpusProfile
35
  from app.schemas.model_config import ModelConfig, ProviderType
36
- from app.services.ai.analyzer import run_primary_analysis
37
- from app.services.export.alto import generate_alto, write_alto
38
  from app.services.image.normalizer import create_derivatives, fetch_and_normalize
39
 
40
  logger = logging.getLogger(__name__)
@@ -148,6 +146,8 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
148
  )
149
 
150
  # ── 6. Analyse primaire IA (R05 : double stockage) ───────────────────
 
 
151
  page_master = run_primary_analysis(
152
  derivative_image_path=Path(image_info.derivative_path),
153
  corpus_profile=corpus_profile,
@@ -163,6 +163,8 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
163
  )
164
 
165
  # ── 7. GΓ©nΓ©rer et Γ©crire l'ALTO XML ──────────────────────────────────
 
 
166
  alto_xml = generate_alto(page_master)
167
  alto_path = (
168
  data_dir
 
33
  from app.models.model_config_db import ModelConfigDB
34
  from app.schemas.corpus_profile import CorpusProfile
35
  from app.schemas.model_config import ModelConfig, ProviderType
 
 
36
  from app.services.image.normalizer import create_derivatives, fetch_and_normalize
37
 
38
  logger = logging.getLogger(__name__)
 
146
  )
147
 
148
  # ── 6. Analyse primaire IA (R05 : double stockage) ───────────────────
149
+ from app.services.ai.analyzer import run_primary_analysis
150
+
151
  page_master = run_primary_analysis(
152
  derivative_image_path=Path(image_info.derivative_path),
153
  corpus_profile=corpus_profile,
 
163
  )
164
 
165
  # ── 7. GΓ©nΓ©rer et Γ©crire l'ALTO XML ──────────────────────────────────
166
+ from app.services.export.alto import generate_alto, write_alto
167
+
168
  alto_xml = generate_alto(page_master)
169
  alto_path = (
170
  data_dir
backend/pyproject.toml CHANGED
@@ -11,6 +11,7 @@ dependencies = [
11
  "fastapi>=0.111",
12
  "uvicorn[standard]>=0.29",
13
  "pydantic>=2.7",
 
14
  "sqlalchemy>=2.0",
15
  "aiosqlite>=0.20",
16
  "google-genai>=1.0",
 
11
  "fastapi>=0.111",
12
  "uvicorn[standard]>=0.29",
13
  "pydantic>=2.7",
14
+ "pydantic-settings>=2.0",
15
  "sqlalchemy>=2.0",
16
  "aiosqlite>=0.20",
17
  "google-genai>=1.0",
backend/tests/conftest_api.py CHANGED
@@ -51,10 +51,11 @@ async def async_client(db_session: AsyncSession):
51
 
52
  app.dependency_overrides[get_db] = _override_get_db
53
  # Les background tasks (execute_corpus_job, execute_page_job) crΓ©ent leur
54
- # propre session via async_session_factory. On les neutralise pour Γ©viter
55
- # qu'elles tentent de se connecter Γ  la BDD rΓ©elle pendant les tests d'API.
56
- with patch("app.api.v1.jobs.execute_corpus_job", AsyncMock(return_value=None)), \
57
- patch("app.api.v1.jobs.execute_page_job", AsyncMock(return_value=None)):
 
58
  async with AsyncClient(
59
  transport=ASGITransport(app=app), base_url="http://test"
60
  ) as client:
 
51
 
52
  app.dependency_overrides[get_db] = _override_get_db
53
  # Les background tasks (execute_corpus_job, execute_page_job) crΓ©ent leur
54
+ # propre session via async_session_factory. On les neutralise en mockant
55
+ # les modules sources pour Γ©viter qu'elles tentent de se connecter Γ  la
56
+ # BDD rΓ©elle pendant les tests d'API.
57
+ with patch("app.services.corpus_runner.execute_corpus_job", AsyncMock(return_value={"total": 0, "done": 0, "failed": 0})), \
58
+ patch("app.services.job_runner.execute_page_job", AsyncMock(return_value=None)):
59
  async with AsyncClient(
60
  transport=ASGITransport(app=app), base_url="http://test"
61
  ) as client:
backend/tests/test_ai_analyzer.py CHANGED
@@ -3,7 +3,7 @@ Tests du pipeline d'analyse IA :
3
  - prompt_loader : chargement + rendu des templates
4
  - client_factory : construction du genai.Client selon le provider
5
  - response_parser: parsing JSON brut β†’ layout + OCRResult
6
- - master_writer : Γ©criture gemini_raw.json et master.json
7
  - analyzer : run_primary_analysis (end-to-end mockΓ©)
8
  """
9
  # 1. stdlib
@@ -31,7 +31,7 @@ from app.schemas.model_config import ModelConfig, ProviderType
31
  from app.schemas.page_master import OCRResult, PageMaster
32
  from app.services.ai.analyzer import run_primary_analysis
33
  from app.services.ai.client_factory import build_client
34
- from app.services.ai.master_writer import write_gemini_raw, write_master_json
35
  from app.services.ai.prompt_loader import load_and_render_prompt
36
  from app.services.ai.response_parser import ParseError, parse_ai_response
37
 
@@ -390,35 +390,35 @@ def test_parse_empty_regions_list():
390
 
391
 
392
  # ---------------------------------------------------------------------------
393
- # Tests β€” write_gemini_raw / write_master_json
394
  # ---------------------------------------------------------------------------
395
 
396
- def test_write_gemini_raw_creates_file(tmp_path):
397
- out = tmp_path / "page" / "gemini_raw.json"
398
- write_gemini_raw("raw AI text here", out)
399
 
400
  assert out.exists()
401
 
402
 
403
- def test_write_gemini_raw_valid_json(tmp_path):
404
- out = tmp_path / "gemini_raw.json"
405
- write_gemini_raw('{"not": "valid json from AI"}', out)
406
 
407
  content = json.loads(out.read_text(encoding="utf-8"))
408
  assert "response_text" in content
409
  assert content["response_text"] == '{"not": "valid json from AI"}'
410
 
411
 
412
- def test_write_gemini_raw_creates_parent_dirs(tmp_path):
413
- out = tmp_path / "deep" / "nested" / "dir" / "gemini_raw.json"
414
- write_gemini_raw("text", out)
415
  assert out.exists()
416
 
417
 
418
- def test_write_gemini_raw_with_non_json_text(tmp_path):
419
- """MΓͺme si le texte brut est invalide, gemini_raw.json est créé."""
420
- out = tmp_path / "gemini_raw.json"
421
- write_gemini_raw("this is not json at all", out)
422
 
423
  content = json.loads(out.read_text(encoding="utf-8"))
424
  assert content["response_text"] == "this is not json at all"
@@ -432,7 +432,7 @@ def _make_page_master() -> PageMaster:
432
  folio_label="0001r",
433
  sequence=1,
434
  image={
435
- "original_url": "https://example.com/img.jpg",
436
  "derivative_web": "/data/deriv.jpg",
437
  "thumbnail": "/data/thumb.jpg",
438
  "width": 1500,
@@ -440,10 +440,11 @@ def _make_page_master() -> PageMaster:
440
  },
441
  layout={"regions": []},
442
  processing={
 
443
  "model_id": "gemini-2.0-flash",
444
  "model_display_name": "Gemini 2.0 Flash",
445
  "prompt_version": "prompts/medieval-illuminated/primary_v1.txt",
446
- "raw_response_path": "/data/gemini_raw.json",
447
  "processed_at": datetime.now(tz=timezone.utc),
448
  },
449
  )
@@ -568,12 +569,12 @@ def test_run_primary_analysis_files_created(tmp_path):
568
  )
569
 
570
  page_dir = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r"
571
- assert (page_dir / "gemini_raw.json").exists()
572
  assert (page_dir / "master.json").exists()
573
 
574
 
575
  def test_run_primary_analysis_raw_written_before_parse(tmp_path):
576
- """gemini_raw.json est Γ©crit AVANT que le parsing Γ©choue (R05)."""
577
  prompt_rel = "prompts/medieval-illuminated/primary_v1.txt"
578
  _setup_prompt_file(tmp_path, prompt_rel)
579
  deriv_path = _setup_derivative(tmp_path)
@@ -596,8 +597,8 @@ def test_run_primary_analysis_raw_written_before_parse(tmp_path):
596
  project_root=tmp_path,
597
  )
598
 
599
- # gemini_raw.json existe malgrΓ© l'Γ©chec de parsing
600
- raw_path = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r" / "gemini_raw.json"
601
  assert raw_path.exists()
602
 
603
  # master.json N'existe PAS (parsing a οΏ½οΏ½chouΓ©)
@@ -663,9 +664,9 @@ def test_run_primary_analysis_image_dict(tmp_path):
663
  project_root=tmp_path,
664
  )
665
 
666
- assert result.image["original_url"] == image_info.original_url
667
- assert result.image["width"] == image_info.derivative_width
668
- assert result.image["height"] == image_info.derivative_height
669
 
670
 
671
  def test_run_primary_analysis_regions_in_layout(tmp_path):
 
3
  - prompt_loader : chargement + rendu des templates
4
  - client_factory : construction du genai.Client selon le provider
5
  - response_parser: parsing JSON brut β†’ layout + OCRResult
6
+ - master_writer : Γ©criture ai_raw.json et master.json
7
  - analyzer : run_primary_analysis (end-to-end mockΓ©)
8
  """
9
  # 1. stdlib
 
31
  from app.schemas.page_master import OCRResult, PageMaster
32
  from app.services.ai.analyzer import run_primary_analysis
33
  from app.services.ai.client_factory import build_client
34
+ from app.services.ai.master_writer import write_ai_raw, write_master_json
35
  from app.services.ai.prompt_loader import load_and_render_prompt
36
  from app.services.ai.response_parser import ParseError, parse_ai_response
37
 
 
390
 
391
 
392
  # ---------------------------------------------------------------------------
393
+ # Tests β€” write_ai_raw / write_master_json
394
  # ---------------------------------------------------------------------------
395
 
396
+ def test_write_ai_raw_creates_file(tmp_path):
397
+ out = tmp_path / "page" / "ai_raw.json"
398
+ write_ai_raw("raw AI text here", out)
399
 
400
  assert out.exists()
401
 
402
 
403
+ def test_write_ai_raw_valid_json(tmp_path):
404
+ out = tmp_path / "ai_raw.json"
405
+ write_ai_raw('{"not": "valid json from AI"}', out)
406
 
407
  content = json.loads(out.read_text(encoding="utf-8"))
408
  assert "response_text" in content
409
  assert content["response_text"] == '{"not": "valid json from AI"}'
410
 
411
 
412
+ def test_write_ai_raw_creates_parent_dirs(tmp_path):
413
+ out = tmp_path / "deep" / "nested" / "dir" / "ai_raw.json"
414
+ write_ai_raw("text", out)
415
  assert out.exists()
416
 
417
 
418
+ def test_write_ai_raw_with_non_json_text(tmp_path):
419
+ """MΓͺme si le texte brut est invalide, ai_raw.json est créé."""
420
+ out = tmp_path / "ai_raw.json"
421
+ write_ai_raw("this is not json at all", out)
422
 
423
  content = json.loads(out.read_text(encoding="utf-8"))
424
  assert content["response_text"] == "this is not json at all"
 
432
  folio_label="0001r",
433
  sequence=1,
434
  image={
435
+ "master": "https://example.com/img.jpg",
436
  "derivative_web": "/data/deriv.jpg",
437
  "thumbnail": "/data/thumb.jpg",
438
  "width": 1500,
 
440
  },
441
  layout={"regions": []},
442
  processing={
443
+ "provider": "google_ai_studio",
444
  "model_id": "gemini-2.0-flash",
445
  "model_display_name": "Gemini 2.0 Flash",
446
  "prompt_version": "prompts/medieval-illuminated/primary_v1.txt",
447
+ "raw_response_path": "/data/ai_raw.json",
448
  "processed_at": datetime.now(tz=timezone.utc),
449
  },
450
  )
 
569
  )
570
 
571
  page_dir = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r"
572
+ assert (page_dir / "ai_raw.json").exists()
573
  assert (page_dir / "master.json").exists()
574
 
575
 
576
  def test_run_primary_analysis_raw_written_before_parse(tmp_path):
577
+ """ai_raw.json est Γ©crit AVANT que le parsing Γ©choue (R05)."""
578
  prompt_rel = "prompts/medieval-illuminated/primary_v1.txt"
579
  _setup_prompt_file(tmp_path, prompt_rel)
580
  deriv_path = _setup_derivative(tmp_path)
 
597
  project_root=tmp_path,
598
  )
599
 
600
+ # ai_raw.json existe malgrΓ© l'Γ©chec de parsing
601
+ raw_path = tmp_path / "data" / "corpora" / "test-corpus" / "pages" / "0001r" / "ai_raw.json"
602
  assert raw_path.exists()
603
 
604
  # master.json N'existe PAS (parsing a οΏ½οΏ½chouΓ©)
 
664
  project_root=tmp_path,
665
  )
666
 
667
+ assert result.image.master == image_info.original_url
668
+ assert result.image.width == image_info.derivative_width
669
+ assert result.image.height == image_info.derivative_height
670
 
671
 
672
  def test_run_primary_analysis_regions_in_layout(tmp_path):
backend/tests/test_api_corrections.py CHANGED
@@ -75,7 +75,7 @@ def _make_master(
75
  "manuscript_id": "ms-test",
76
  "folio_label": "f001r",
77
  "sequence": 1,
78
- "image": {"original_url": "https://example.com/f.jpg", "width": 1500, "height": 2000},
79
  "layout": {"regions": []},
80
  "ocr": {
81
  "diplomatic_text": "Incipit liber primus",
@@ -238,13 +238,13 @@ async def test_corrections_archives_old_version(async_client, db_session, monkey
238
  ms = await _create_manuscript(db_session, corpus.id)
239
  page = await _create_page(db_session, ms.id)
240
 
241
- written_paths: list[str] = []
242
 
243
  monkeypatch.setattr(Path, "exists", lambda self: True)
244
  monkeypatch.setattr(Path, "read_text", lambda self, **kw: _make_master(page.id, version=1))
245
 
246
  def _capture_write(self: Path, content: str, **kw: object) -> None:
247
- written_paths.append(str(self))
248
 
249
  monkeypatch.setattr(Path, "write_text", _capture_write)
250
 
@@ -254,10 +254,17 @@ async def test_corrections_archives_old_version(async_client, db_session, monkey
254
  )
255
 
256
  # Deux Γ©critures attendues : master_v1.json (archive) + master.json (nouveau)
 
257
  assert len(written_paths) >= 2
258
  assert any("master_v1.json" in p for p in written_paths)
259
  assert any("master.json" in p and "master_v" not in p for p in written_paths)
260
 
 
 
 
 
 
 
261
 
262
  @pytest.mark.asyncio
263
  async def test_corrections_multiple_fields(async_client, db_session, monkeypatch):
 
75
  "manuscript_id": "ms-test",
76
  "folio_label": "f001r",
77
  "sequence": 1,
78
+ "image": {"master": "https://example.com/f.jpg", "width": 1500, "height": 2000},
79
  "layout": {"regions": []},
80
  "ocr": {
81
  "diplomatic_text": "Incipit liber primus",
 
238
  ms = await _create_manuscript(db_session, corpus.id)
239
  page = await _create_page(db_session, ms.id)
240
 
241
+ written_data: dict[str, str] = {}
242
 
243
  monkeypatch.setattr(Path, "exists", lambda self: True)
244
  monkeypatch.setattr(Path, "read_text", lambda self, **kw: _make_master(page.id, version=1))
245
 
246
  def _capture_write(self: Path, content: str, **kw: object) -> None:
247
+ written_data[str(self)] = content
248
 
249
  monkeypatch.setattr(Path, "write_text", _capture_write)
250
 
 
254
  )
255
 
256
  # Deux Γ©critures attendues : master_v1.json (archive) + master.json (nouveau)
257
+ written_paths = list(written_data.keys())
258
  assert len(written_paths) >= 2
259
  assert any("master_v1.json" in p for p in written_paths)
260
  assert any("master.json" in p and "master_v" not in p for p in written_paths)
261
 
262
+ # VΓ©rifier que l'archive contient bien la version originale (v1)
263
+ import json as _json
264
+ archive_path = next(p for p in written_paths if "master_v1.json" in p)
265
+ archive_data = _json.loads(written_data[archive_path])
266
+ assert archive_data["editorial"]["version"] == 1
267
+
268
 
269
  @pytest.mark.asyncio
270
  async def test_corrections_multiple_fields(async_client, db_session, monkeypatch):
backend/tests/test_api_export.py CHANGED
@@ -83,7 +83,7 @@ def _make_master_json(page_id: str, folio_label: str, sequence: int) -> str:
83
  "folio_label": folio_label,
84
  "sequence": sequence,
85
  "image": {
86
- "original_url": f"https://example.com/{page_id}.jpg",
87
  "derivative_web": f"/data/deriv/{page_id}.jpg",
88
  "thumbnail": f"/data/thumb/{page_id}.jpg",
89
  "width": 1500,
 
83
  "folio_label": folio_label,
84
  "sequence": sequence,
85
  "image": {
86
+ "master": f"https://example.com/{page_id}.jpg",
87
  "derivative_web": f"/data/deriv/{page_id}.jpg",
88
  "thumbnail": f"/data/thumb/{page_id}.jpg",
89
  "width": 1500,
backend/tests/test_api_ingest.py CHANGED
@@ -457,6 +457,15 @@ async def test_reingest_manifest_skips_existing_pages(async_client, db_session,
457
  assert data2["pages_created"] == 0
458
  assert data2["pages_skipped"] == 2
459
 
 
 
 
 
 
 
 
 
 
460
 
461
  @pytest.mark.asyncio
462
  async def test_reingest_images_skips_existing_pages(async_client, db_session):
 
457
  assert data2["pages_created"] == 0
458
  assert data2["pages_skipped"] == 2
459
 
460
+ # VΓ©rifier que la BDD n'a bien que 2 pages (pas de doublons)
461
+ from sqlalchemy import select as sa_select
462
+ from app.models.corpus import PageModel
463
+ page_result = await db_session.execute(
464
+ sa_select(PageModel).where(PageModel.manuscript_id == data1["manuscript_id"])
465
+ )
466
+ pages_in_db = list(page_result.scalars().all())
467
+ assert len(pages_in_db) == 2
468
+
469
 
470
  @pytest.mark.asyncio
471
  async def test_reingest_images_skips_existing_pages(async_client, db_session):
backend/tests/test_api_models.py CHANGED
@@ -94,7 +94,7 @@ async def test_get_models_endpoint_removed(async_client):
94
  @pytest.mark.asyncio
95
  async def test_refresh_models_ok(async_client, monkeypatch):
96
  monkeypatch.setattr(
97
- models_api_module, "list_all_models", lambda: _MOCK_MODELS
98
  )
99
  response = await async_client.post("/api/v1/models/refresh")
100
  assert response.status_code == 200
@@ -103,7 +103,7 @@ async def test_refresh_models_ok(async_client, monkeypatch):
103
  @pytest.mark.asyncio
104
  async def test_refresh_models_has_timestamp(async_client, monkeypatch):
105
  monkeypatch.setattr(
106
- models_api_module, "list_all_models", lambda: _MOCK_MODELS
107
  )
108
  data = (await async_client.post("/api/v1/models/refresh")).json()
109
  assert "refreshed_at" in data
@@ -113,7 +113,7 @@ async def test_refresh_models_has_timestamp(async_client, monkeypatch):
113
  @pytest.mark.asyncio
114
  async def test_refresh_models_count(async_client, monkeypatch):
115
  monkeypatch.setattr(
116
- models_api_module, "list_all_models", lambda: _MOCK_MODELS
117
  )
118
  data = (await async_client.post("/api/v1/models/refresh")).json()
119
  assert data["count"] == 2
@@ -123,7 +123,7 @@ async def test_refresh_models_count(async_client, monkeypatch):
123
  @pytest.mark.asyncio
124
  async def test_refresh_models_structure(async_client, monkeypatch):
125
  monkeypatch.setattr(
126
- models_api_module, "list_all_models", lambda: _MOCK_MODELS
127
  )
128
  data = (await async_client.post("/api/v1/models/refresh")).json()
129
  assert "models" in data
 
94
  @pytest.mark.asyncio
95
  async def test_refresh_models_ok(async_client, monkeypatch):
96
  monkeypatch.setattr(
97
+ "app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
98
  )
99
  response = await async_client.post("/api/v1/models/refresh")
100
  assert response.status_code == 200
 
103
  @pytest.mark.asyncio
104
  async def test_refresh_models_has_timestamp(async_client, monkeypatch):
105
  monkeypatch.setattr(
106
+ "app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
107
  )
108
  data = (await async_client.post("/api/v1/models/refresh")).json()
109
  assert "refreshed_at" in data
 
113
  @pytest.mark.asyncio
114
  async def test_refresh_models_count(async_client, monkeypatch):
115
  monkeypatch.setattr(
116
+ "app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
117
  )
118
  data = (await async_client.post("/api/v1/models/refresh")).json()
119
  assert data["count"] == 2
 
123
  @pytest.mark.asyncio
124
  async def test_refresh_models_structure(async_client, monkeypatch):
125
  monkeypatch.setattr(
126
+ "app.services.ai.model_registry.list_all_models", lambda: _MOCK_MODELS
127
  )
128
  data = (await async_client.post("/api/v1/models/refresh")).json()
129
  assert "models" in data
backend/tests/test_api_pages.py CHANGED
@@ -87,7 +87,7 @@ def _make_master_json(page_id: str, corpus_profile: str = "medieval-illuminated"
87
  "folio_label": "f001r",
88
  "sequence": 1,
89
  "image": {
90
- "original_url": "https://example.com/f001r.jpg",
91
  "derivative_web": "/data/deriv/f001r.jpg",
92
  "thumbnail": "/data/thumb/f001r.jpg",
93
  "width": 1500,
 
87
  "folio_label": "f001r",
88
  "sequence": 1,
89
  "image": {
90
+ "master": "https://example.com/f001r.jpg",
91
  "derivative_web": "/data/deriv/f001r.jpg",
92
  "thumbnail": "/data/thumb/f001r.jpg",
93
  "width": 1500,
backend/tests/test_api_providers.py CHANGED
@@ -90,7 +90,7 @@ _MOCK_MISTRAL_MODELS = [
90
 
91
  @pytest.mark.asyncio
92
  async def test_list_providers_returns_list(async_client, monkeypatch):
93
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
94
  resp = await async_client.get("/api/v1/providers")
95
  assert resp.status_code == 200
96
  assert isinstance(resp.json(), list)
@@ -98,14 +98,14 @@ async def test_list_providers_returns_list(async_client, monkeypatch):
98
 
99
  @pytest.mark.asyncio
100
  async def test_list_providers_count(async_client, monkeypatch):
101
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
102
  data = (await async_client.get("/api/v1/providers")).json()
103
  assert len(data) == 4 # 4 providers connus
104
 
105
 
106
  @pytest.mark.asyncio
107
  async def test_list_providers_fields(async_client, monkeypatch):
108
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
109
  data = (await async_client.get("/api/v1/providers")).json()
110
  p = data[0]
111
  assert "provider_type" in p
@@ -116,7 +116,7 @@ async def test_list_providers_fields(async_client, monkeypatch):
116
 
117
  @pytest.mark.asyncio
118
  async def test_list_providers_all_unavailable(async_client, monkeypatch):
119
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
120
  data = (await async_client.get("/api/v1/providers")).json()
121
  assert all(not p["available"] for p in data)
122
  assert all(p["model_count"] == 0 for p in data)
@@ -124,7 +124,7 @@ async def test_list_providers_all_unavailable(async_client, monkeypatch):
124
 
125
  @pytest.mark.asyncio
126
  async def test_list_providers_google_available(async_client, monkeypatch):
127
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_GOOGLE_ONLY)
128
  data = (await async_client.get("/api/v1/providers")).json()
129
  google = next(p for p in data if p["provider_type"] == "google_ai_studio")
130
  assert google["available"] is True
@@ -133,7 +133,7 @@ async def test_list_providers_google_available(async_client, monkeypatch):
133
 
134
  @pytest.mark.asyncio
135
  async def test_list_providers_mistral_available(async_client, monkeypatch):
136
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_GOOGLE_AND_MISTRAL)
137
  data = (await async_client.get("/api/v1/providers")).json()
138
  mistral = next(p for p in data if p["provider_type"] == "mistral")
139
  assert mistral["available"] is True
@@ -143,7 +143,7 @@ async def test_list_providers_mistral_available(async_client, monkeypatch):
143
  @pytest.mark.asyncio
144
  async def test_list_providers_includes_mistral_type(async_client, monkeypatch):
145
  """Mistral est toujours dans la liste mΓͺme si indisponible."""
146
- monkeypatch.setattr(models_api_module, "get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
147
  data = (await async_client.get("/api/v1/providers")).json()
148
  types_ = [p["provider_type"] for p in data]
149
  assert "mistral" in types_
@@ -156,7 +156,7 @@ async def test_list_providers_includes_mistral_type(async_client, monkeypatch):
156
  @pytest.mark.asyncio
157
  async def test_get_provider_models_google(async_client, monkeypatch):
158
  monkeypatch.setattr(
159
- models_api_module, "list_models_for_provider", lambda ptype: _MOCK_GOOGLE_MODELS
160
  )
161
  resp = await async_client.get("/api/v1/providers/google_ai_studio/models")
162
  assert resp.status_code == 200
@@ -166,7 +166,7 @@ async def test_get_provider_models_google(async_client, monkeypatch):
166
  @pytest.mark.asyncio
167
  async def test_get_provider_models_mistral(async_client, monkeypatch):
168
  monkeypatch.setattr(
169
- models_api_module, "list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
170
  )
171
  resp = await async_client.get("/api/v1/providers/mistral/models")
172
  assert resp.status_code == 200
@@ -189,7 +189,7 @@ async def test_get_provider_models_not_configured(async_client, monkeypatch):
189
  def _raise(ptype):
190
  raise RuntimeError("Variable d'environnement manquante : MISTRAL_API_KEY")
191
 
192
- monkeypatch.setattr(models_api_module, "list_models_for_provider", _raise)
193
  resp = await async_client.get("/api/v1/providers/mistral/models")
194
  assert resp.status_code == 503
195
 
@@ -197,7 +197,7 @@ async def test_get_provider_models_not_configured(async_client, monkeypatch):
197
  @pytest.mark.asyncio
198
  async def test_get_provider_models_fields(async_client, monkeypatch):
199
  monkeypatch.setattr(
200
- models_api_module, "list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
201
  )
202
  data = (await async_client.get("/api/v1/providers/mistral/models")).json()
203
  m = data[0]
 
90
 
91
  @pytest.mark.asyncio
92
  async def test_list_providers_returns_list(async_client, monkeypatch):
93
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
94
  resp = await async_client.get("/api/v1/providers")
95
  assert resp.status_code == 200
96
  assert isinstance(resp.json(), list)
 
98
 
99
  @pytest.mark.asyncio
100
  async def test_list_providers_count(async_client, monkeypatch):
101
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
102
  data = (await async_client.get("/api/v1/providers")).json()
103
  assert len(data) == 4 # 4 providers connus
104
 
105
 
106
  @pytest.mark.asyncio
107
  async def test_list_providers_fields(async_client, monkeypatch):
108
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
109
  data = (await async_client.get("/api/v1/providers")).json()
110
  p = data[0]
111
  assert "provider_type" in p
 
116
 
117
  @pytest.mark.asyncio
118
  async def test_list_providers_all_unavailable(async_client, monkeypatch):
119
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
120
  data = (await async_client.get("/api/v1/providers")).json()
121
  assert all(not p["available"] for p in data)
122
  assert all(p["model_count"] == 0 for p in data)
 
124
 
125
  @pytest.mark.asyncio
126
  async def test_list_providers_google_available(async_client, monkeypatch):
127
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_GOOGLE_ONLY)
128
  data = (await async_client.get("/api/v1/providers")).json()
129
  google = next(p for p in data if p["provider_type"] == "google_ai_studio")
130
  assert google["available"] is True
 
133
 
134
  @pytest.mark.asyncio
135
  async def test_list_providers_mistral_available(async_client, monkeypatch):
136
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_GOOGLE_AND_MISTRAL)
137
  data = (await async_client.get("/api/v1/providers")).json()
138
  mistral = next(p for p in data if p["provider_type"] == "mistral")
139
  assert mistral["available"] is True
 
143
  @pytest.mark.asyncio
144
  async def test_list_providers_includes_mistral_type(async_client, monkeypatch):
145
  """Mistral est toujours dans la liste mΓͺme si indisponible."""
146
+ monkeypatch.setattr("app.services.ai.model_registry.get_available_providers", lambda: _PROVIDERS_ALL_UNAVAILABLE)
147
  data = (await async_client.get("/api/v1/providers")).json()
148
  types_ = [p["provider_type"] for p in data]
149
  assert "mistral" in types_
 
156
  @pytest.mark.asyncio
157
  async def test_get_provider_models_google(async_client, monkeypatch):
158
  monkeypatch.setattr(
159
+ "app.services.ai.model_registry.list_models_for_provider", lambda ptype: _MOCK_GOOGLE_MODELS
160
  )
161
  resp = await async_client.get("/api/v1/providers/google_ai_studio/models")
162
  assert resp.status_code == 200
 
166
  @pytest.mark.asyncio
167
  async def test_get_provider_models_mistral(async_client, monkeypatch):
168
  monkeypatch.setattr(
169
+ "app.services.ai.model_registry.list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
170
  )
171
  resp = await async_client.get("/api/v1/providers/mistral/models")
172
  assert resp.status_code == 200
 
189
  def _raise(ptype):
190
  raise RuntimeError("Variable d'environnement manquante : MISTRAL_API_KEY")
191
 
192
+ monkeypatch.setattr("app.services.ai.model_registry.list_models_for_provider", _raise)
193
  resp = await async_client.get("/api/v1/providers/mistral/models")
194
  assert resp.status_code == 503
195
 
 
197
  @pytest.mark.asyncio
198
  async def test_get_provider_models_fields(async_client, monkeypatch):
199
  monkeypatch.setattr(
200
+ "app.services.ai.model_registry.list_models_for_provider", lambda ptype: _MOCK_MISTRAL_MODELS
201
  )
202
  data = (await async_client.get("/api/v1/providers/mistral/models")).json()
203
  m = data[0]
backend/tests/test_api_search.py CHANGED
@@ -33,7 +33,7 @@ def _make_master(page_id: str, diplomatic_text: str = "", translation_fr: str =
33
  "manuscript_id": "ms-test",
34
  "folio_label": "f001r",
35
  "sequence": 1,
36
- "image": {"original_url": "https://example.com/f.jpg", "width": 1500, "height": 2000},
37
  "layout": {"regions": []},
38
  "ocr": {
39
  "diplomatic_text": diplomatic_text,
 
33
  "manuscript_id": "ms-test",
34
  "folio_label": "f001r",
35
  "sequence": 1,
36
+ "image": {"master": "https://example.com/f.jpg", "width": 1500, "height": 2000},
37
  "layout": {"regions": []},
38
  "ocr": {
39
  "diplomatic_text": diplomatic_text,
backend/tests/test_export_alto.py CHANGED
@@ -52,10 +52,11 @@ def _make_master(
52
  processing = None
53
  if with_processing:
54
  processing = ProcessingInfo(
 
55
  model_id="gemini-2.0-flash",
56
  model_display_name="Gemini 2.0 Flash",
57
  prompt_version="prompts/medieval-illuminated/primary_v1.txt",
58
- raw_response_path="/data/gemini_raw.json",
59
  processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
60
  )
61
  return PageMaster(
@@ -65,7 +66,7 @@ def _make_master(
65
  folio_label="0001r",
66
  sequence=sequence,
67
  image={
68
- "original_url": "https://example.com/img.jpg",
69
  "derivative_web": "/data/deriv.jpg",
70
  "thumbnail": "/data/thumb.jpg",
71
  "width": width,
 
52
  processing = None
53
  if with_processing:
54
  processing = ProcessingInfo(
55
+ provider="google_ai_studio",
56
  model_id="gemini-2.0-flash",
57
  model_display_name="Gemini 2.0 Flash",
58
  prompt_version="prompts/medieval-illuminated/primary_v1.txt",
59
+ raw_response_path="/data/ai_raw.json",
60
  processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
61
  )
62
  return PageMaster(
 
66
  folio_label="0001r",
67
  sequence=sequence,
68
  image={
69
+ "master": "https://example.com/img.jpg",
70
  "derivative_web": "/data/deriv.jpg",
71
  "thumbnail": "/data/thumb.jpg",
72
  "width": width,
backend/tests/test_export_iiif.py CHANGED
@@ -53,7 +53,7 @@ def _make_page(
53
  folio_label=folio_label,
54
  sequence=sequence,
55
  image={
56
- "original_url": original_url or f"https://example.com/{folio_label}.jpg",
57
  "derivative_web": f"/data/deriv/{folio_label}.jpg",
58
  "thumbnail": f"/data/thumb/{folio_label}.jpg",
59
  "width": width,
@@ -193,11 +193,11 @@ def test_manifest_label_uses_language_key(simple_manifest):
193
 
194
 
195
  def test_manifest_label_without_language_uses_none():
196
- """Sans champ language, la clΓ© de label est 'none'."""
197
  pages = [_make_page("ms-0001r", "0001r", 1)]
198
- meta = _base_meta() # pas de language
199
  manifest = generate_manifest(pages, meta, _BASE_URL)
200
- assert "none" in manifest["label"]
201
 
202
 
203
  def test_manifest_label_fr(chroniques_pages, chroniques_meta):
@@ -272,7 +272,7 @@ def test_canvas_order_respects_sequence():
272
  _make_page("ms-f002r", "f002r", 2),
273
  ]
274
  manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
275
- labels = [c["label"]["none"][0] for c in manifest["items"]]
276
  assert labels == ["Folio f001r", "Folio f002r", "Folio f003r"]
277
 
278
 
@@ -283,7 +283,7 @@ def test_canvas_order_large_sequence():
283
  random.shuffle(pages)
284
  manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
285
  sequences_in_label = [
286
- int(c["label"]["none"][0].replace("Folio f", "").replace("r", ""))
287
  for c in manifest["items"]
288
  ]
289
  assert sequences_in_label == list(range(1, 11))
@@ -344,7 +344,7 @@ def test_canvas_width_matches_image(beatus_pages, beatus_meta):
344
  # Trouve la page correspondante
345
  page_id = canvas["id"].split("/canvas/")[-1]
346
  page = next(p for p in beatus_pages if p.page_id == page_id)
347
- assert canvas["width"] == page.image["width"]
348
 
349
 
350
  def test_canvas_height_matches_image(beatus_pages, beatus_meta):
@@ -352,7 +352,7 @@ def test_canvas_height_matches_image(beatus_pages, beatus_meta):
352
  for canvas in manifest["items"]:
353
  page_id = canvas["id"].split("/canvas/")[-1]
354
  page = next(p for p in beatus_pages if p.page_id == page_id)
355
- assert canvas["height"] == page.image["height"]
356
 
357
 
358
  def test_canvas_dimensions_beatus_hr():
@@ -447,7 +447,7 @@ def test_annotation_body_id_is_original_url(beatus_pages, beatus_meta):
447
  page_id = canvas["id"].split("/canvas/")[-1]
448
  page = next(p for p in beatus_pages if p.page_id == page_id)
449
  body = canvas["items"][0]["items"][0]["body"]
450
- assert body["id"] == page.image["original_url"]
451
 
452
 
453
  def test_annotation_body_contains_gallica_url(beatus_pages, beatus_meta):
@@ -480,7 +480,10 @@ def test_base_url_trailing_slash_stripped():
480
  """Un base_url avec slash final ne gΓ©nΓ¨re pas de double slash dans les IDs."""
481
  pages = [_make_page("ms-0001r", "0001r", 1)]
482
  manifest = generate_manifest(pages, _base_meta(), "https://example.com/")
483
- assert "//" not in manifest["id"].replace("://", "X")
 
 
 
484
 
485
 
486
  # ---------------------------------------------------------------------------
 
53
  folio_label=folio_label,
54
  sequence=sequence,
55
  image={
56
+ "master": original_url or f"https://example.com/{folio_label}.jpg",
57
  "derivative_web": f"/data/deriv/{folio_label}.jpg",
58
  "thumbnail": f"/data/thumb/{folio_label}.jpg",
59
  "width": width,
 
193
 
194
 
195
  def test_manifest_label_without_language_uses_none():
196
+ """Sans champ language, la clΓ© de label est 'en' (dΓ©faut IIIF-compliant)."""
197
  pages = [_make_page("ms-0001r", "0001r", 1)]
198
+ meta = _base_meta() # pas de language β†’ dΓ©faut "en"
199
  manifest = generate_manifest(pages, meta, _BASE_URL)
200
+ assert "en" in manifest["label"]
201
 
202
 
203
  def test_manifest_label_fr(chroniques_pages, chroniques_meta):
 
272
  _make_page("ms-f002r", "f002r", 2),
273
  ]
274
  manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
275
+ labels = [c["label"]["en"][0] for c in manifest["items"]]
276
  assert labels == ["Folio f001r", "Folio f002r", "Folio f003r"]
277
 
278
 
 
283
  random.shuffle(pages)
284
  manifest = generate_manifest(pages, _base_meta(), _BASE_URL)
285
  sequences_in_label = [
286
+ int(c["label"]["en"][0].replace("Folio f", "").replace("r", ""))
287
  for c in manifest["items"]
288
  ]
289
  assert sequences_in_label == list(range(1, 11))
 
344
  # Trouve la page correspondante
345
  page_id = canvas["id"].split("/canvas/")[-1]
346
  page = next(p for p in beatus_pages if p.page_id == page_id)
347
+ assert canvas["width"] == page.image.width
348
 
349
 
350
  def test_canvas_height_matches_image(beatus_pages, beatus_meta):
 
352
  for canvas in manifest["items"]:
353
  page_id = canvas["id"].split("/canvas/")[-1]
354
  page = next(p for p in beatus_pages if p.page_id == page_id)
355
+ assert canvas["height"] == page.image.height
356
 
357
 
358
  def test_canvas_dimensions_beatus_hr():
 
447
  page_id = canvas["id"].split("/canvas/")[-1]
448
  page = next(p for p in beatus_pages if p.page_id == page_id)
449
  body = canvas["items"][0]["items"][0]["body"]
450
+ assert body["id"] == page.image.master
451
 
452
 
453
  def test_annotation_body_contains_gallica_url(beatus_pages, beatus_meta):
 
480
  """Un base_url avec slash final ne gΓ©nΓ¨re pas de double slash dans les IDs."""
481
  pages = [_make_page("ms-0001r", "0001r", 1)]
482
  manifest = generate_manifest(pages, _base_meta(), "https://example.com/")
483
+ manifest_id = manifest["id"]
484
+ # Retirer le protocole puis vΓ©rifier qu'il n'y a pas de double slash
485
+ without_protocol = manifest_id.split("://", 1)[1]
486
+ assert "//" not in without_protocol
487
 
488
 
489
  # ---------------------------------------------------------------------------
backend/tests/test_export_mets.py CHANGED
@@ -66,10 +66,11 @@ def _make_page(
66
  processing = None
67
  if with_processing:
68
  processing = ProcessingInfo(
 
69
  model_id="gemini-2.0-flash",
70
  model_display_name="Gemini 2.0 Flash",
71
  prompt_version="prompts/medieval-illuminated/primary_v1.txt",
72
- raw_response_path=f"/data/corpora/test/pages/{folio_label}/gemini_raw.json",
73
  processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
74
  )
75
  ocr = OCRResult(diplomatic_text=ocr_text, language="la", confidence=0.90) if ocr_text else None
@@ -80,7 +81,7 @@ def _make_page(
80
  folio_label=folio_label,
81
  sequence=sequence,
82
  image={
83
- "original_url": original_url or f"https://example.com/{folio_label}.jpg",
84
  "derivative_web": derivative_web or f"/data/deriv/{folio_label}.jpg",
85
  "thumbnail": f"/data/thumb/{folio_label}.jpg",
86
  "width": 1500,
@@ -194,7 +195,9 @@ def test_generate_mets_namespace(beatus_pages, beatus_meta):
194
 
195
  def test_generate_mets_objid(beatus_pages, beatus_meta):
196
  root = _parse(generate_mets(beatus_pages, beatus_meta))
197
- assert root.get("OBJID") == "BnF-Latin-8878"
 
 
198
 
199
 
200
  def test_generate_mets_label(beatus_pages, beatus_meta):
 
66
  processing = None
67
  if with_processing:
68
  processing = ProcessingInfo(
69
+ provider="google_ai_studio",
70
  model_id="gemini-2.0-flash",
71
  model_display_name="Gemini 2.0 Flash",
72
  prompt_version="prompts/medieval-illuminated/primary_v1.txt",
73
+ raw_response_path=f"/data/corpora/test/pages/{folio_label}/ai_raw.json",
74
  processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
75
  )
76
  ocr = OCRResult(diplomatic_text=ocr_text, language="la", confidence=0.90) if ocr_text else None
 
81
  folio_label=folio_label,
82
  sequence=sequence,
83
  image={
84
+ "master": original_url or f"https://example.com/{folio_label}.jpg",
85
  "derivative_web": derivative_web or f"/data/deriv/{folio_label}.jpg",
86
  "thumbnail": f"/data/thumb/{folio_label}.jpg",
87
  "width": 1500,
 
195
 
196
  def test_generate_mets_objid(beatus_pages, beatus_meta):
197
  root = _parse(generate_mets(beatus_pages, beatus_meta))
198
+ objid = root.get("OBJID")
199
+ assert objid is not None, "OBJID attribute absent du root mets"
200
+ assert objid == "BnF-Latin-8878"
201
 
202
 
203
  def test_generate_mets_label(beatus_pages, beatus_meta):
backend/tests/test_image_pipeline.py CHANGED
@@ -278,7 +278,6 @@ def test_fetch_iiif_image_success():
278
  "+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
279
  ),
280
  "Accept": "image/jpeg,image/png,image/*,*/*",
281
- "Referer": "https://gallica.bnf.fr/",
282
  },
283
  follow_redirects=True,
284
  timeout=60.0,
 
278
  "+https://huggingface.co/spaces/Ma-Ri-Ba-Ku/scriptorium-ai)"
279
  ),
280
  "Accept": "image/jpeg,image/png,image/*,*/*",
 
281
  },
282
  follow_redirects=True,
283
  timeout=60.0,
backend/tests/test_job_runner.py CHANGED
@@ -142,16 +142,24 @@ def _page_master(page_id: str, ms_id: str) -> PageMaster:
142
 
143
 
144
  def _apply_success_mocks(monkeypatch, page_id: str, ms_id: str) -> None:
145
- """Applique les mocks IO pour un pipeline rΓ©ussi."""
 
 
 
 
146
  monkeypatch.setattr(
147
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
148
  )
149
  monkeypatch.setattr(
150
- job_runner_module, "run_primary_analysis",
151
  lambda **kw: _page_master(page_id, ms_id),
152
  )
153
- monkeypatch.setattr(job_runner_module, "generate_alto", lambda pm: "<alto/>")
154
- monkeypatch.setattr(job_runner_module, "write_alto", lambda xml, path: None)
 
 
 
 
155
 
156
 
157
  # ---------------------------------------------------------------------------
@@ -274,7 +282,7 @@ async def test_no_image_path_job_failed(db, setup_with_model, monkeypatch):
274
  s["page"].image_master_path = None
275
  await db.commit()
276
  monkeypatch.setattr(
277
- job_runner_module, "run_primary_analysis",
278
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
279
  )
280
 
@@ -291,7 +299,7 @@ async def test_no_image_path_page_error(db, setup_with_model, monkeypatch):
291
  s["page"].image_master_path = None
292
  await db.commit()
293
  monkeypatch.setattr(
294
- job_runner_module, "run_primary_analysis",
295
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
296
  )
297
 
@@ -343,7 +351,7 @@ async def test_primary_analysis_fails_job_failed(db, setup_with_model, monkeypat
343
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
344
  )
345
  monkeypatch.setattr(
346
- job_runner_module, "run_primary_analysis",
347
  lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
348
  )
349
 
@@ -361,7 +369,7 @@ async def test_primary_analysis_fails_page_error(db, setup_with_model, monkeypat
361
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
362
  )
363
  monkeypatch.setattr(
364
- job_runner_module, "run_primary_analysis",
365
  lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
366
  )
367
 
@@ -379,7 +387,7 @@ async def test_primary_analysis_error_message_stored(db, setup_with_model, monke
379
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
380
  )
381
  monkeypatch.setattr(
382
- job_runner_module, "run_primary_analysis",
383
  lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
384
  )
385
 
@@ -401,12 +409,14 @@ async def test_write_alto_fails_job_failed(db, setup_with_model, monkeypatch):
401
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
402
  )
403
  monkeypatch.setattr(
404
- job_runner_module, "run_primary_analysis",
405
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
406
  )
407
- monkeypatch.setattr(job_runner_module, "generate_alto", lambda pm: "<alto/>")
408
  monkeypatch.setattr(
409
- job_runner_module, "write_alto",
 
 
 
410
  lambda xml, path: (_ for _ in ()).throw(OSError("disk full")),
411
  )
412
 
@@ -424,12 +434,14 @@ async def test_write_alto_fails_page_error(db, setup_with_model, monkeypatch):
424
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
425
  )
426
  monkeypatch.setattr(
427
- job_runner_module, "run_primary_analysis",
428
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
429
  )
430
- monkeypatch.setattr(job_runner_module, "generate_alto", lambda pm: "<alto/>")
431
  monkeypatch.setattr(
432
- job_runner_module, "write_alto",
 
 
 
433
  lambda xml, path: (_ for _ in ()).throw(OSError("disk full")),
434
  )
435
 
@@ -534,7 +546,7 @@ async def test_corpus_runner_calls_execute_per_job(monkeypatch):
534
  return _FakeSession()
535
 
536
  monkeypatch.setattr(corpus_runner_module, "async_session_factory", _mock_factory)
537
- monkeypatch.setattr(corpus_runner_module, "execute_page_job", _mock_execute)
538
 
539
  await execute_corpus_job("corpus-xyz")
540
 
 
142
 
143
 
144
  def _apply_success_mocks(monkeypatch, page_id: str, ms_id: str) -> None:
145
+ """Applique les mocks IO pour un pipeline rΓ©ussi.
146
+
147
+ Les imports sont diffΓ©rΓ©s dans job_runner (lazy imports). On patche donc
148
+ les modules sources pour que le import dans la fonction cible récupère le mock.
149
+ """
150
  monkeypatch.setattr(
151
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
152
  )
153
  monkeypatch.setattr(
154
+ "app.services.ai.analyzer.run_primary_analysis",
155
  lambda **kw: _page_master(page_id, ms_id),
156
  )
157
+ monkeypatch.setattr(
158
+ "app.services.export.alto.generate_alto", lambda pm: "<alto/>"
159
+ )
160
+ monkeypatch.setattr(
161
+ "app.services.export.alto.write_alto", lambda xml, path: None
162
+ )
163
 
164
 
165
  # ---------------------------------------------------------------------------
 
282
  s["page"].image_master_path = None
283
  await db.commit()
284
  monkeypatch.setattr(
285
+ "app.services.ai.analyzer.run_primary_analysis",
286
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
287
  )
288
 
 
299
  s["page"].image_master_path = None
300
  await db.commit()
301
  monkeypatch.setattr(
302
+ "app.services.ai.analyzer.run_primary_analysis",
303
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
304
  )
305
 
 
351
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
352
  )
353
  monkeypatch.setattr(
354
+ "app.services.ai.analyzer.run_primary_analysis",
355
  lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
356
  )
357
 
 
369
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
370
  )
371
  monkeypatch.setattr(
372
+ "app.services.ai.analyzer.run_primary_analysis",
373
  lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
374
  )
375
 
 
387
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
388
  )
389
  monkeypatch.setattr(
390
+ "app.services.ai.analyzer.run_primary_analysis",
391
  lambda **kw: (_ for _ in ()).throw(ValueError("ParseError: invalid JSON")),
392
  )
393
 
 
409
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
410
  )
411
  monkeypatch.setattr(
412
+ "app.services.ai.analyzer.run_primary_analysis",
413
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
414
  )
 
415
  monkeypatch.setattr(
416
+ "app.services.export.alto.generate_alto", lambda pm: "<alto/>"
417
+ )
418
+ monkeypatch.setattr(
419
+ "app.services.export.alto.write_alto",
420
  lambda xml, path: (_ for _ in ()).throw(OSError("disk full")),
421
  )
422
 
 
434
  job_runner_module, "fetch_and_normalize", lambda *a: _image_info()
435
  )
436
  monkeypatch.setattr(
437
+ "app.services.ai.analyzer.run_primary_analysis",
438
  lambda **kw: _page_master(s["page"].id, s["ms"].id),
439
  )
 
440
  monkeypatch.setattr(
441
+ "app.services.export.alto.generate_alto", lambda pm: "<alto/>"
442
+ )
443
+ monkeypatch.setattr(
444
+ "app.services.export.alto.write_alto",
445
  lambda xml, path: (_ for _ in ()).throw(OSError("disk full")),
446
  )
447
 
 
546
  return _FakeSession()
547
 
548
  monkeypatch.setattr(corpus_runner_module, "async_session_factory", _mock_factory)
549
+ monkeypatch.setattr("app.services.job_runner.execute_page_job", _mock_execute)
550
 
551
  await execute_corpus_job("corpus-xyz")
552
 
backend/tests/test_security.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests de sΓ©curitΓ© β€” Sprint F1.
3
+
4
+ VΓ©rifie que toutes les vulnΓ©rabilitΓ©s identifiΓ©es sont corrigΓ©es :
5
+ - Path traversal sur profiles, slug, folio_label, frontend serving
6
+ - SSRF sur manifest_url
7
+ - Validation des entrΓ©es (taille, format)
8
+ """
9
+ # 1. stdlib
10
+ import pytest
11
+
12
+ # 2. third-party β€” fixtures API
13
+ from tests.conftest_api import async_client, db_session # noqa: F401
14
+
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Path traversal β€” profiles
18
+ # ---------------------------------------------------------------------------
19
+
20
+ @pytest.mark.asyncio
21
+ async def test_profile_path_traversal_dotdot(async_client):
22
+ """Un profile_id contenant '..' doit Γͺtre rejetΓ© (400)."""
23
+ resp = await async_client.get("/api/v1/profiles/..passwd")
24
+ assert resp.status_code == 400
25
+
26
+
27
+ @pytest.mark.asyncio
28
+ async def test_profile_path_traversal_slash(async_client):
29
+ """Un profile_id avec un slash (mΓͺme encodΓ©) doit Γͺtre rejetΓ© (400 ou 404)."""
30
+ # FastAPI normalise les chemins, donc un slash dans l'ID ne sera pas transmis.
31
+ # On teste avec un ID contenant des caractères spéciaux interdits.
32
+ resp = await async_client.get("/api/v1/profiles/UPPER_CASE")
33
+ assert resp.status_code == 400
34
+
35
+
36
+ @pytest.mark.asyncio
37
+ async def test_profile_path_traversal_special_chars(async_client):
38
+ """Un profile_id avec des caractΓ¨res spΓ©ciaux doit Γͺtre rejetΓ©."""
39
+ resp = await async_client.get("/api/v1/profiles/test@profile")
40
+ assert resp.status_code == 400
41
+
42
+
43
+ @pytest.mark.asyncio
44
+ async def test_profile_valid_id_not_found(async_client):
45
+ """Un profile_id valide mais inexistant retourne 404 (pas 400)."""
46
+ resp = await async_client.get("/api/v1/profiles/does-not-exist")
47
+ assert resp.status_code == 404
48
+
49
+
50
+ # ---------------------------------------------------------------------------
51
+ # Path traversal β€” corpus slug
52
+ # ---------------------------------------------------------------------------
53
+
54
+ @pytest.mark.asyncio
55
+ async def test_corpus_slug_path_traversal(async_client):
56
+ """Un slug avec ../ doit Γͺtre rejetΓ© par la validation Pydantic."""
57
+ resp = await async_client.post("/api/v1/corpora", json={
58
+ "slug": "../../malicious",
59
+ "title": "Test",
60
+ "profile_id": "medieval-illuminated",
61
+ })
62
+ assert resp.status_code == 422
63
+
64
+
65
+ @pytest.mark.asyncio
66
+ async def test_corpus_slug_with_spaces(async_client):
67
+ """Un slug avec des espaces doit Γͺtre rejetΓ©."""
68
+ resp = await async_client.post("/api/v1/corpora", json={
69
+ "slug": "my corpus",
70
+ "title": "Test",
71
+ "profile_id": "medieval-illuminated",
72
+ })
73
+ assert resp.status_code == 422
74
+
75
+
76
+ @pytest.mark.asyncio
77
+ async def test_corpus_slug_uppercase(async_client):
78
+ """Un slug avec des majuscules doit Γͺtre rejetΓ© (lowercase only)."""
79
+ resp = await async_client.post("/api/v1/corpora", json={
80
+ "slug": "MyCorpus",
81
+ "title": "Test",
82
+ "profile_id": "medieval-illuminated",
83
+ })
84
+ assert resp.status_code == 422
85
+
86
+
87
+ @pytest.mark.asyncio
88
+ async def test_corpus_slug_valid(async_client):
89
+ """Un slug valide doit Γͺtre acceptΓ©."""
90
+ resp = await async_client.post("/api/v1/corpora", json={
91
+ "slug": "my-corpus-01",
92
+ "title": "Test",
93
+ "profile_id": "medieval-illuminated",
94
+ })
95
+ assert resp.status_code == 201
96
+
97
+
98
+ @pytest.mark.asyncio
99
+ async def test_corpus_slug_empty(async_client):
100
+ """Un slug vide doit Γͺtre rejetΓ©."""
101
+ resp = await async_client.post("/api/v1/corpora", json={
102
+ "slug": "",
103
+ "title": "Test",
104
+ "profile_id": "medieval-illuminated",
105
+ })
106
+ assert resp.status_code == 422
107
+
108
+
109
+ @pytest.mark.asyncio
110
+ async def test_corpus_title_too_long(async_client):
111
+ """Un titre trop long (>256 chars) doit Γͺtre rejetΓ©."""
112
+ resp = await async_client.post("/api/v1/corpora", json={
113
+ "slug": "test-long",
114
+ "title": "x" * 300,
115
+ "profile_id": "medieval-illuminated",
116
+ })
117
+ assert resp.status_code == 422
118
+
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # SSRF β€” manifest_url
122
+ # ---------------------------------------------------------------------------
123
+
124
+ @pytest.mark.asyncio
125
+ async def test_ssrf_localhost(async_client):
126
+ """Un manifest_url pointant vers localhost doit Γͺtre rejetΓ©."""
127
+ # CrΓ©er un corpus d'abord
128
+ create = await async_client.post("/api/v1/corpora", json={
129
+ "slug": "ssrf-test", "title": "SSRF", "profile_id": "test",
130
+ })
131
+ cid = create.json()["id"]
132
+
133
+ resp = await async_client.post(f"/api/v1/corpora/{cid}/ingest/iiif-manifest", json={
134
+ "manifest_url": "http://localhost:8000/secret",
135
+ })
136
+ assert resp.status_code == 400
137
+ assert "interdit" in resp.json()["detail"].lower() or "localhost" in resp.json()["detail"].lower()
138
+
139
+
140
+ @pytest.mark.asyncio
141
+ async def test_ssrf_metadata_ip(async_client):
142
+ """Un manifest_url vers 169.254.x.x (cloud metadata) doit Γͺtre rejetΓ©."""
143
+ create = await async_client.post("/api/v1/corpora", json={
144
+ "slug": "ssrf-meta", "title": "SSRF", "profile_id": "test",
145
+ })
146
+ cid = create.json()["id"]
147
+
148
+ resp = await async_client.post(f"/api/v1/corpora/{cid}/ingest/iiif-manifest", json={
149
+ "manifest_url": "http://169.254.169.254/latest/meta-data/",
150
+ })
151
+ assert resp.status_code == 400
152
+
153
+
154
+ @pytest.mark.asyncio
155
+ async def test_ssrf_file_scheme(async_client):
156
+ """Un manifest_url avec file:// doit Γͺtre rejetΓ©."""
157
+ create = await async_client.post("/api/v1/corpora", json={
158
+ "slug": "ssrf-file", "title": "SSRF", "profile_id": "test",
159
+ })
160
+ cid = create.json()["id"]
161
+
162
+ resp = await async_client.post(f"/api/v1/corpora/{cid}/ingest/iiif-manifest", json={
163
+ "manifest_url": "file:///etc/passwd",
164
+ })
165
+ assert resp.status_code == 400
166
+
167
+
168
+ # ---------------------------------------------------------------------------
169
+ # Input validation β€” search
170
+ # ---------------------------------------------------------------------------
171
+
172
+ @pytest.mark.asyncio
173
+ async def test_search_query_too_long(async_client):
174
+ """Une requΓͺte de recherche >500 chars doit Γͺtre rejetΓ©e."""
175
+ resp = await async_client.get("/api/v1/search", params={"q": "x" * 501})
176
+ assert resp.status_code == 422
177
+
178
+
179
+ @pytest.mark.asyncio
180
+ async def test_search_query_max_length_ok(async_client):
181
+ """Une requΓͺte de recherche de 500 chars doit Γͺtre acceptΓ©e (0 rΓ©sultat)."""
182
+ resp = await async_client.get("/api/v1/search", params={"q": "x" * 500})
183
+ assert resp.status_code == 200
184
+
185
+
186
+ # ---------------------------------------------------------------------------
187
+ # Input validation β€” model selection
188
+ # ---------------------------------------------------------------------------
189
+
190
+ @pytest.mark.asyncio
191
+ async def test_model_id_too_long(async_client):
192
+ """Un model_id >256 chars doit Γͺtre rejetΓ©."""
193
+ create = await async_client.post("/api/v1/corpora", json={
194
+ "slug": "model-test", "title": "T", "profile_id": "test",
195
+ })
196
+ cid = create.json()["id"]
197
+
198
+ resp = await async_client.put(f"/api/v1/corpora/{cid}/model", json={
199
+ "model_id": "x" * 300,
200
+ "provider_type": "google_ai_studio",
201
+ })
202
+ assert resp.status_code == 422
203
+
204
+
205
+ # ---------------------------------------------------------------------------
206
+ # Input validation β€” corrections
207
+ # ---------------------------------------------------------------------------
208
+
209
+ @pytest.mark.asyncio
210
+ async def test_corrections_restore_negative_version(async_client):
211
+ """restore_to_version < 1 doit Γͺtre rejetΓ©."""
212
+ resp = await async_client.post("/api/v1/pages/fake-page/corrections", json={
213
+ "restore_to_version": 0,
214
+ })
215
+ assert resp.status_code == 422
frontend/src/App.tsx CHANGED
@@ -42,6 +42,7 @@ export default function App() {
42
  onOpenManuscript={(manuscriptId, profileId) =>
43
  setView({ name: 'reader', manuscriptId, profileId })
44
  }
 
45
  onAdmin={() => setView({ name: 'admin' })}
46
  />
47
  )
 
42
  onOpenManuscript={(manuscriptId, profileId) =>
43
  setView({ name: 'reader', manuscriptId, profileId })
44
  }
45
+ onOpenPage={(pageId) => setView({ name: 'editor', pageId })}
46
  onAdmin={() => setView({ name: 'admin' })}
47
  />
48
  )
frontend/src/lib/api.ts CHANGED
@@ -1,5 +1,13 @@
1
  const BASE_URL: string = import.meta.env.VITE_API_URL ?? ''
2
 
 
 
 
 
 
 
 
 
3
  // ── Types ─────────────────────────────────────────────────────────────────────
4
 
5
  export interface ProviderInfo {
 
1
  const BASE_URL: string = import.meta.env.VITE_API_URL ?? ''
2
 
3
+ if (!BASE_URL && import.meta.env.PROD) {
4
+ console.warn(
5
+ '[Scriptorium] VITE_API_URL non dΓ©fini en production. ' +
6
+ 'Les appels API utiliseront des chemins relatifs, ce qui peut Γ©chouer ' +
7
+ 'si le frontend n\'est pas servi par le mΓͺme domaine que le backend.'
8
+ )
9
+ }
10
+
11
  // ── Types ─────────────────────────────────────────────────────────────────────
12
 
13
  export interface ProviderInfo {
frontend/src/pages/Editor.tsx CHANGED
@@ -119,7 +119,7 @@ export default function Editor({ pageId, onBack }: Props) {
119
  return <div className="p-8 text-red-600">Erreur : {error}</div>
120
  }
121
 
122
- const imageUrl = master ? '' : '' // image path not directly stored on PageMaster
123
  const regions = master?.layout?.regions ?? []
124
 
125
  return (
 
119
  return <div className="p-8 text-red-600">Erreur : {error}</div>
120
  }
121
 
122
+ const imageUrl = master?.image?.derivative_web ?? master?.image?.master ?? ''
123
  const regions = master?.layout?.regions ?? []
124
 
125
  return (
frontend/src/pages/Home.tsx CHANGED
@@ -10,10 +10,11 @@ import {
10
 
11
  interface Props {
12
  onOpenManuscript: (manuscriptId: string, profileId: string) => void
 
13
  onAdmin: () => void
14
  }
15
 
16
- export default function Home({ onOpenManuscript, onAdmin }: Props) {
17
  const [corpora, setCorpora] = useState<Corpus[]>([])
18
  const [loading, setLoading] = useState(true)
19
  const [error, setError] = useState<string | null>(null)
@@ -73,7 +74,7 @@ export default function Home({ onOpenManuscript, onAdmin }: Props) {
73
  </p>
74
  </div>
75
  <div className="flex items-center gap-4">
76
- <SearchBar />
77
  <AdminNav onClick={onAdmin} />
78
  </div>
79
  </header>
 
10
 
11
  interface Props {
12
  onOpenManuscript: (manuscriptId: string, profileId: string) => void
13
+ onOpenPage?: (pageId: string) => void
14
  onAdmin: () => void
15
  }
16
 
17
+ export default function Home({ onOpenManuscript, onOpenPage, onAdmin }: Props) {
18
  const [corpora, setCorpora] = useState<Corpus[]>([])
19
  const [loading, setLoading] = useState(true)
20
  const [error, setError] = useState<string | null>(null)
 
74
  </p>
75
  </div>
76
  <div className="flex items-center gap-4">
77
+ <SearchBar onSelectResult={onOpenPage ? (r) => onOpenPage(r.page_id) : undefined} />
78
  <AdminNav onClick={onAdmin} />
79
  </div>
80
  </header>
infra/Dockerfile DELETED
@@ -1,71 +0,0 @@
1
- # Scriptorium AI β€” image de production (multi-stage)
2
- # Ce fichier est la copie exacte de Dockerfile (racine).
3
- # Build depuis la racine du dΓ©pΓ΄t :
4
- # docker build -f infra/Dockerfile -t scriptorium-ai .
5
- #
6
- # Structure attendue dans l'image :
7
- # /app/backend/app/ ← source Python (importable via PYTHONPATH)
8
- # /app/profiles/ ← profils JSON
9
- # /app/prompts/ ← templates de prompts
10
- # /app/static/ ← frontend React buildΓ©
11
- # /app/data/ ← créé vide ; Γ  monter en volume pour les artefacts
12
-
13
- # ── Stage 1 : build du frontend React ────────────────────────────────────────
14
- FROM node:20-slim AS frontend-builder
15
-
16
- WORKDIR /frontend
17
-
18
- # Installer les dΓ©pendances (cache layer sΓ©parΓ©)
19
- COPY frontend/package.json ./
20
- RUN npm install
21
-
22
- # Copier les sources et builder
23
- COPY frontend/ ./
24
- RUN npm run build
25
-
26
- # ── Stage 2 : image Python finale ────────────────────────────────────────────
27
- FROM python:3.11-slim
28
-
29
- WORKDIR /app
30
-
31
- # ── DΓ©pendances Python ─────────────────────────────────────────────────────
32
- # On copie uniquement pyproject.toml pour exploiter le cache de layers Docker.
33
- # Un stub app/__init__.py satisfait setuptools (discover packages) sans avoir
34
- # besoin de copier tout le code source Γ  ce stade.
35
- COPY backend/pyproject.toml /tmp/build/
36
- RUN mkdir -p /tmp/build/app \
37
- && touch /tmp/build/app/__init__.py \
38
- && pip install --no-cache-dir --upgrade /tmp/build/ \
39
- && rm -rf /tmp/build
40
-
41
- # ── Layer dΓ©diΓ© mistralai β€” invalide le cache HF si v0.x est prΓ©sent ─────
42
- # Layer sΓ©parΓ© de l'install principal pour forcer la mise Γ  jour mΓͺme si
43
- # HuggingFace rΓ©utilise le layer pyproject.toml depuis un build antΓ©rieur.
44
- RUN pip install --no-cache-dir 'mistralai>=1.0,<2.0'
45
-
46
- # ── Code source backend ────────────────────────────────────────────────────
47
- COPY backend/app ./backend/app
48
- COPY profiles/ ./profiles/
49
- COPY prompts/ ./prompts/
50
-
51
- # ── Frontend buildΓ© ────────────────────────────────────────────────────────
52
- COPY --from=frontend-builder /frontend/dist ./static
53
-
54
- # ── RΓ©pertoire des artefacts (vide dans l'image ; montΓ© en volume) ─────────
55
- RUN mkdir -p /app/data
56
-
57
- # ── Secrets Google AI : JAMAIS dans l'image (R06) ─────────────────────────
58
- # Passer au runtime via -e ou docker-compose environment :
59
- # AI_PROVIDER, GOOGLE_AI_STUDIO_API_KEY, GOOGLE_AI_API_KEY,
60
- # GOOGLE_VERTEX_PROJECT, GOOGLE_VERTEX_LOCATION
61
-
62
- # PYTHONPATH permet l'import `app.main:app` depuis /app/backend/app/
63
- ENV PYTHONPATH=/app/backend
64
- ENV PROFILES_DIR=/app/profiles
65
- ENV PROMPTS_DIR=/app/prompts
66
- ENV DATA_DIR=/app/data
67
-
68
- EXPOSE 7860
69
-
70
- # 1 worker au MVP β€” pas de Gunicorn, pas de multiprocessing
71
- CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]