Claude committed on
Commit
9b4e099
·
unverified ·
1 Parent(s): f338ad4

feat(pipeline): IIIF-native Sprint 3 — in-memory image fetch for AI analysis

Browse files

Replace disk-based image pipeline with IIIF-native streaming for pages
that have a IIIF Image Service URL:

- fetch_iiif_derivative(): fetches a 1500px JPEG via IIIF Image API
({service_url}/full/!1500,1500/0/default.jpg) — server-side resize
- fetch_ai_derivative_bytes(): returns (jpeg_bytes, w, h) in memory,
never writes to disk. Falls back to full download + in-memory resize
for non-IIIF sources.
- analyzer.py: now accepts derivative_image_bytes (bytes) OR
derivative_image_path (Path). New _scale_bbox_coordinates() scales
bounding boxes from derivative space to original canvas space when
dimensions differ (e.g., 1500px derivative → 5000px original).
- job_runner.py: 3 code paths:
1. IIIF native (iiif_service_url set): fetch in memory → analyze → discard
2. URL fallback (legacy): download → create_derivatives on disk → analyze
3. Local file (upload): read → create_derivatives on disk → analyze
- ImageInfo in master.json now stores iiif_service_url + original canvas
dimensions (not derivative dimensions) for IIIF-native pages.

2 new tests: IIIF bytes mode, bbox coordinate scaling (4x factor).
585 tests pass (+2 new), 0 regressions.

https://claude.ai/code/session_01UB4he7RdRPHLvNjky4X8Sw

backend/app/services/ai/analyzer.py CHANGED
@@ -11,7 +11,7 @@ from pathlib import Path
11
 
12
  # 3. local
13
  from app.schemas.corpus_profile import CorpusProfile
14
- from app.schemas.image import ImageDerivativeInfo
15
  from app.schemas.model_config import ModelConfig
16
  from app.schemas.page_master import EditorialInfo, EditorialStatus, ImageInfo, PageMaster, ProcessingInfo
17
  from app.services.ai.master_writer import write_ai_raw, write_master_json
@@ -22,8 +22,32 @@ from app.services.ai.response_parser import ParseError, parse_ai_response # noq
22
  logger = logging.getLogger(__name__)
23
 
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def run_primary_analysis(
26
- derivative_image_path: Path,
 
 
27
  corpus_profile: CorpusProfile,
28
  model_config: ModelConfig,
29
  page_id: str,
@@ -31,38 +55,22 @@ def run_primary_analysis(
31
  corpus_slug: str,
32
  folio_label: str,
33
  sequence: int,
34
- image_info: ImageDerivativeInfo,
 
 
35
  base_data_dir: Path = Path("data"),
36
  project_root: Path = Path("."),
37
  ) -> PageMaster:
38
  """Analyse primaire d'un folio : charge le prompt, appelle l'IA, écrit les fichiers.
39
 
40
- Respecte R05 : ai_raw.json est toujours écrit en premier, même en cas
41
- d'erreur de parsing. master.json n'est écrit QUE si le parsing a réussi.
42
-
43
- Le provider est sélectionné dynamiquement depuis model_config.provider ;
44
- Google AI Studio, Vertex et Mistral sont supportés de façon identique.
45
-
46
- Args:
47
- derivative_image_path: chemin vers le JPEG dérivé (1500px max).
48
- corpus_profile: profil du corpus (pilote le prompt et les layers).
49
- model_config: configuration du modèle sélectionné (provider + model_id).
50
- page_id: identifiant unique de la page (ex. "beatus-lat8878-0013r").
51
- manuscript_id: identifiant du manuscrit.
52
- corpus_slug: identifiant du corpus (ex. "beatus-lat8878").
53
- folio_label: label du folio (ex. "0013r").
54
- sequence: numéro de séquence dans le manuscrit.
55
- image_info: métadonnées de l'image normalisée (dimensions, chemins).
56
- base_data_dir: racine du dossier data.
57
- project_root: racine du projet (pour résoudre les chemins des prompts).
58
-
59
- Returns:
60
- PageMaster validé (ai_raw.json et master.json écrits sur disque).
61
-
62
- Raises:
63
- ParseError: si la réponse IA n'est pas un JSON valide.
64
- FileNotFoundError: si le template de prompt est introuvable.
65
- RuntimeError: si le provider n'est pas configuré (variable d'env absente).
66
  """
67
  # ── Chemins de sortie ───────────────────────────────────────────────────
68
  page_dir = base_data_dir / "corpora" / corpus_slug / "pages" / folio_label
@@ -85,13 +93,18 @@ def run_primary_analysis(
85
  extra={"template": prompt_rel_path, "corpus": corpus_slug, "folio": folio_label},
86
  )
87
 
88
- # ── 2. Chargement de l'image dérivée ────────────────────────────────────
89
- if not derivative_image_path.exists():
90
- raise FileNotFoundError(f"Image dérivée introuvable : {derivative_image_path}")
91
- try:
92
- jpeg_bytes = derivative_image_path.read_bytes()
93
- except OSError as exc:
94
- raise RuntimeError(f"Erreur lecture image {derivative_image_path} : {exc}") from exc
 
 
 
 
 
95
 
96
  # ── 3. Appel IA via le provider sélectionné ─────────────────────────────
97
  provider = get_provider(model_config.provider)
@@ -116,21 +129,45 @@ def run_primary_analysis(
116
  # ── 5. Parsing + validation (ParseError si JSON invalide) ───────────────
117
  layout, ocr = parse_ai_response(raw_text)
118
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  # ── 6. Construction du PageMaster ───────────────────────────────────────
120
  processed_at = datetime.now(tz=timezone.utc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  page_master = PageMaster(
122
  page_id=page_id,
123
  corpus_profile=corpus_profile.profile_id,
124
  manuscript_id=manuscript_id,
125
  folio_label=folio_label,
126
  sequence=sequence,
127
- image=ImageInfo(
128
- master=image_info.original_url,
129
- derivative_web=image_info.derivative_path,
130
- thumbnail=image_info.thumbnail_path,
131
- width=image_info.derivative_width,
132
- height=image_info.derivative_height,
133
- ),
134
  layout=layout,
135
  ocr=ocr,
136
  processing=ProcessingInfo(
@@ -154,6 +191,7 @@ def run_primary_analysis(
154
  "corpus": corpus_slug,
155
  "folio": folio_label,
156
  "regions": len(layout.get("regions", [])),
 
157
  },
158
  )
159
  return page_master
 
11
 
12
  # 3. local
13
  from app.schemas.corpus_profile import CorpusProfile
14
+ from app.schemas.image import ImageDerivativeInfo, ImageSourceInfo
15
  from app.schemas.model_config import ModelConfig
16
  from app.schemas.page_master import EditorialInfo, EditorialStatus, ImageInfo, PageMaster, ProcessingInfo
17
  from app.services.ai.master_writer import write_ai_raw, write_master_json
 
22
  logger = logging.getLogger(__name__)
23
 
24
 
25
def _scale_bbox_coordinates(layout: dict, scale_x: float, scale_y: float) -> dict:
    """Scale region bounding boxes from derivative space to original canvas space.

    The AI analyses a down-sized derivative, but the bounding boxes kept in
    the layout must be expressed in absolute pixels of the original canvas.

    Args:
        layout: layout dict; its optional ``"regions"`` entry is a list of
            region dicts, each with an optional 4-element ``"bbox"``.
        scale_x: horizontal factor (original width / derivative width).
        scale_y: vertical factor (original height / derivative height).

    Returns:
        The same ``layout`` object. Region ``"bbox"`` values are rewritten in
        place: elements 0 and 2 are multiplied by ``scale_x``, elements 1 and
        3 by ``scale_y``, and each result is rounded to an integer.
    """
    # Factors within 1% of identity are treated as "no scaling needed".
    if abs(scale_x - 1.0) < 0.01 and abs(scale_y - 1.0) < 0.01:
        return layout

    # ``"regions": None`` must behave like a missing key rather than crash.
    regions = layout.get("regions") or []
    for region in regions:
        if not isinstance(region, dict):
            # Malformed entry: nothing to scale, leave it untouched.
            continue
        bbox = region.get("bbox")
        # Only well-formed 4-element boxes are rescaled; anything else is kept.
        if bbox and len(bbox) == 4:
            region["bbox"] = [
                round(bbox[0] * scale_x),
                round(bbox[1] * scale_y),
                round(bbox[2] * scale_x),
                round(bbox[3] * scale_y),
            ]
    return layout
45
+
46
+
47
  def run_primary_analysis(
48
+ *,
49
+ derivative_image_bytes: bytes | None = None,
50
+ derivative_image_path: Path | None = None,
51
  corpus_profile: CorpusProfile,
52
  model_config: ModelConfig,
53
  page_id: str,
 
55
  corpus_slug: str,
56
  folio_label: str,
57
  sequence: int,
58
+ image_info: ImageDerivativeInfo | ImageSourceInfo,
59
+ derivative_width: int | None = None,
60
+ derivative_height: int | None = None,
61
  base_data_dir: Path = Path("data"),
62
  project_root: Path = Path("."),
63
  ) -> PageMaster:
64
  """Analyse primaire d'un folio : charge le prompt, appelle l'IA, écrit les fichiers.
65
 
66
+ Supporte deux modes :
67
+ - IIIF natif : derivative_image_bytes fourni (bytes en RAM, jamais sur disque)
68
+ - Legacy : derivative_image_path fourni (chemin fichier sur disque)
69
+
70
+ Respecte R05 : ai_raw.json toujours écrit en premier.
71
+
72
+ Si les dimensions originales (canvas) diffèrent du dérivé, les bbox sont
73
+ mises à l'échelle de l'espace dérivé vers l'espace canvas original.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  """
75
  # ── Chemins de sortie ───────────────────────────────────────────────────
76
  page_dir = base_data_dir / "corpora" / corpus_slug / "pages" / folio_label
 
93
  extra={"template": prompt_rel_path, "corpus": corpus_slug, "folio": folio_label},
94
  )
95
 
96
+ # ── 2. Obtention des bytes image ────────────────────────────────────────
97
+ if derivative_image_bytes is not None:
98
+ jpeg_bytes = derivative_image_bytes
99
+ elif derivative_image_path is not None:
100
+ if not derivative_image_path.exists():
101
+ raise FileNotFoundError(f"Image dérivée introuvable : {derivative_image_path}")
102
+ try:
103
+ jpeg_bytes = derivative_image_path.read_bytes()
104
+ except OSError as exc:
105
+ raise RuntimeError(f"Erreur lecture image {derivative_image_path} : {exc}") from exc
106
+ else:
107
+ raise ValueError("Il faut fournir derivative_image_bytes ou derivative_image_path")
108
 
109
  # ── 3. Appel IA via le provider sélectionné ─────────────────────────────
110
  provider = get_provider(model_config.provider)
 
129
  # ── 5. Parsing + validation (ParseError si JSON invalide) ───────────────
130
  layout, ocr = parse_ai_response(raw_text)
131
 
132
+ # ── 5b. Scaling bbox si les dimensions originales diffèrent du dérivé ──
133
+ is_iiif_source = isinstance(image_info, ImageSourceInfo)
134
+ original_w = image_info.original_width
135
+ original_h = image_info.original_height
136
+ deriv_w = derivative_width or (getattr(image_info, "derivative_width", None)) or original_w
137
+ deriv_h = derivative_height or (getattr(image_info, "derivative_height", None)) or original_h
138
+
139
+ if original_w > 0 and deriv_w > 0 and (original_w != deriv_w or original_h != deriv_h):
140
+ scale_x = original_w / deriv_w
141
+ scale_y = original_h / deriv_h
142
+ layout = _scale_bbox_coordinates(layout, scale_x, scale_y)
143
+
144
  # ── 6. Construction du PageMaster ───────────────────────────────────────
145
  processed_at = datetime.now(tz=timezone.utc)
146
+
147
+ if is_iiif_source:
148
+ image_block = ImageInfo(
149
+ master=image_info.original_url,
150
+ iiif_service_url=image_info.iiif_service_url,
151
+ manifest_url=image_info.manifest_url,
152
+ width=original_w,
153
+ height=original_h,
154
+ )
155
+ else:
156
+ image_block = ImageInfo(
157
+ master=image_info.original_url,
158
+ derivative_web=getattr(image_info, "derivative_path", None),
159
+ thumbnail=getattr(image_info, "thumbnail_path", None),
160
+ width=original_w,
161
+ height=original_h,
162
+ )
163
+
164
  page_master = PageMaster(
165
  page_id=page_id,
166
  corpus_profile=corpus_profile.profile_id,
167
  manuscript_id=manuscript_id,
168
  folio_label=folio_label,
169
  sequence=sequence,
170
+ image=image_block,
 
 
 
 
 
 
171
  layout=layout,
172
  ocr=ocr,
173
  processing=ProcessingInfo(
 
191
  "corpus": corpus_slug,
192
  "folio": folio_label,
193
  "regions": len(layout.get("regions", [])),
194
+ "iiif_native": is_iiif_source,
195
  },
196
  )
197
  return page_master
backend/app/services/image/normalizer.py CHANGED
@@ -143,3 +143,55 @@ def fetch_and_normalize(
143
  """
144
  source_bytes = fetch_iiif_image(url)
145
  return create_derivatives(source_bytes, url, corpus_slug, folio_label, base_data_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  """
144
  source_bytes = fetch_iiif_image(url)
145
  return create_derivatives(source_bytes, url, corpus_slug, folio_label, base_data_dir)
146
+
147
+
148
+ # ── Mode IIIF natif : images en mémoire, jamais sur disque ───────────────────
149
+
150
def fetch_ai_derivative_bytes(
    iiif_service_url: str | None,
    fallback_url: str | None,
) -> tuple[bytes, int, int]:
    """Return ``(jpeg_bytes, width, height)`` for the AI, built entirely in memory.

    - When ``iiif_service_url`` is given, the derivative is requested from the
      IIIF Image Service via ``fetch_iiif_derivative`` with
      ``_MAX_DERIVATIVE_PX`` as the size bound, and is not resized locally.
    - Otherwise, when ``fallback_url`` is given, the image is downloaded with
      ``fetch_iiif_image`` and passed through ``_resize_to_max``.

    In both cases the image is converted to RGB if needed and re-encoded as
    JPEG into an in-memory buffer.

    Returns:
        Tuple ``(jpeg_bytes, derivative_width, derivative_height)``.

    Raises:
        ValueError: if neither source is provided.
        httpx.HTTPStatusError: propagated from the fetch helper when the
            server answers with an error status.
    """
    from app.services.ingest.iiif_fetcher import fetch_iiif_derivative, fetch_iiif_image

    server_resized = bool(iiif_service_url)

    if server_resized:
        payload = fetch_iiif_derivative(iiif_service_url, max_px=_MAX_DERIVATIVE_PX)
    elif fallback_url:
        payload = fetch_iiif_image(fallback_url)
    else:
        raise ValueError("Aucune source image fournie (ni iiif_service_url ni fallback_url)")

    # Decode in memory: needed for the dimensions, and for the fallback resize.
    picture = Image.open(io.BytesIO(payload))
    if picture.mode != "RGB":
        picture = picture.convert("RGB")

    if not server_resized:
        # Fallback path: the server did not resize, so do it here.
        picture = _resize_to_max(picture, _MAX_DERIVATIVE_PX)

    width, height = picture.size

    # Re-encode as JPEG into a memory buffer.
    encoded = io.BytesIO()
    picture.save(encoded, format="JPEG", quality=_DERIVATIVE_QUALITY)
    jpeg_bytes = encoded.getvalue()

    log_fields = {
        "iiif": server_resized,
        "size": f"{width}x{height}",
        "bytes": len(jpeg_bytes),
    }
    logger.info("Dérivé IA en mémoire", extra=log_fields)
    return jpeg_bytes, width, height
backend/app/services/ingest/iiif_fetcher.py CHANGED
@@ -48,3 +48,38 @@ def fetch_iiif_image(url: str, timeout: float = _DEFAULT_TIMEOUT) -> bytes:
48
  extra={"url": url, "size_bytes": len(response.content)},
49
  )
50
  return response.content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  extra={"url": url, "size_bytes": len(response.content)},
49
  )
50
  return response.content
51
+
52
+
53
def fetch_iiif_derivative(
    service_url: str,
    max_px: int = 1500,
    timeout: float = _DEFAULT_TIMEOUT,
) -> bytes:
    """Download a derivative through the IIIF Image API and return it as bytes.

    The request URL is ``{service_url}/full/!{max_px},{max_px}/0/default.jpg``;
    any trailing slash on ``service_url`` is stripped first. The ``!w,h`` size
    form asks the IIIF server for a best-fit image, so resizing happens on the
    server side. Nothing is written to disk by this function.

    Args:
        service_url: IIIF Image Service base URL (without the
            ``/full/.../default.jpg`` suffix).
        max_px: bound used for both width and height in the size parameter.
        timeout: overall timeout in seconds (the connect timeout is fixed at 10 s).

    Returns:
        Raw response body (the image bytes returned by the server).

    Raises:
        httpx.HTTPStatusError: if the server answers with an error status.
    """
    base = service_url.rstrip("/")
    derivative_url = f"{base}/full/!{max_px},{max_px}/0/default.jpg"
    logger.info("Fetching IIIF derivative", extra={"url": derivative_url, "max_px": max_px})

    request_timeout = httpx.Timeout(timeout, connect=10.0)
    response = httpx.get(
        derivative_url,
        headers=_HEADERS,
        follow_redirects=True,
        timeout=request_timeout,
    )
    response.raise_for_status()

    body = response.content
    logger.info(
        "IIIF derivative fetched",
        extra={"url": derivative_url, "size_bytes": len(body)},
    )
    return body
backend/app/services/job_runner.py CHANGED
@@ -32,8 +32,13 @@ from app.models.database import async_session_factory
32
  from app.models.job import JobModel
33
  from app.models.model_config_db import ModelConfigDB
34
  from app.schemas.corpus_profile import CorpusProfile
 
35
  from app.schemas.model_config import ModelConfig, ProviderType
36
- from app.services.image.normalizer import create_derivatives, fetch_and_normalize
 
 
 
 
37
 
38
  logger = logging.getLogger(__name__)
39
 
@@ -126,19 +131,65 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
126
  available_models=[],
127
  )
128
 
129
- # ── 5. Normaliser l'image ────────────────────────────────────────────
130
  data_dir = _config_module.settings.data_dir
131
  image_source = page.image_master_path or ""
132
 
133
- if image_source.startswith(("http://", "https://")):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  image_info = fetch_and_normalize(
135
  image_source, corpus.slug, page.folio_label, data_dir
136
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  elif image_source:
138
- # Validation anti path-traversal : le chemin résolu doit être
139
- # sous data_dir. Empêche la lecture de fichiers arbitraires
140
- # si image_master_path contient des séquences ../ ou un
141
- # chemin absolu hors du répertoire de données.
142
  source_path = Path(image_source).resolve()
143
  data_dir_resolved = data_dir.resolve()
144
  if not str(source_path).startswith(str(data_dir_resolved) + "/") and source_path != data_dir_resolved:
@@ -150,29 +201,26 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
150
  image_info = create_derivatives(
151
  source_bytes, image_source, corpus.slug, page.folio_label, data_dir
152
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  else:
154
  raise ValueError(
155
  f"La page {page.id} n'a pas d'image source "
156
- "(image_master_path vide ou None)"
157
  )
158
 
159
- # ── 6. Analyse primaire IA (R05 : double stockage) ───────────────────
160
- from app.services.ai.analyzer import run_primary_analysis
161
-
162
- page_master = run_primary_analysis(
163
- derivative_image_path=Path(image_info.derivative_path),
164
- corpus_profile=corpus_profile,
165
- model_config=model_config,
166
- page_id=page.id,
167
- manuscript_id=manuscript.id,
168
- corpus_slug=corpus.slug,
169
- folio_label=page.folio_label,
170
- sequence=page.sequence,
171
- image_info=image_info,
172
- base_data_dir=data_dir,
173
- project_root=_PROJECT_ROOT,
174
- )
175
-
176
  # ── 7. Générer et écrire l'ALTO XML ──────────────────────────────────
177
  from app.services.export.alto import generate_alto, write_alto
178
 
 
32
  from app.models.job import JobModel
33
  from app.models.model_config_db import ModelConfigDB
34
  from app.schemas.corpus_profile import CorpusProfile
35
+ from app.schemas.image import ImageSourceInfo
36
  from app.schemas.model_config import ModelConfig, ProviderType
37
+ from app.services.image.normalizer import (
38
+ create_derivatives,
39
+ fetch_ai_derivative_bytes,
40
+ fetch_and_normalize,
41
+ )
42
 
43
  logger = logging.getLogger(__name__)
44
 
 
131
  available_models=[],
132
  )
133
 
134
+ # ── 5. Obtenir l'image pour l'IA ─────────────────────────────────────
135
  data_dir = _config_module.settings.data_dir
136
  image_source = page.image_master_path or ""
137
 
138
+ from app.services.ai.analyzer import run_primary_analysis
139
+
140
+ if page.iiif_service_url:
141
+ # ── Mode IIIF natif : fetch en mémoire, zéro stockage ────────────
142
+ deriv_bytes, deriv_w, deriv_h = fetch_ai_derivative_bytes(
143
+ iiif_service_url=page.iiif_service_url,
144
+ fallback_url=None,
145
+ )
146
+ image_source_info = ImageSourceInfo(
147
+ original_url=image_source or page.iiif_service_url,
148
+ iiif_service_url=page.iiif_service_url,
149
+ manifest_url=page.manifest_url,
150
+ is_iiif=True,
151
+ original_width=page.canvas_width or deriv_w,
152
+ original_height=page.canvas_height or deriv_h,
153
+ )
154
+
155
+ # ── 6. Analyse primaire IA (R05 : double stockage) ───────────────
156
+ page_master = run_primary_analysis(
157
+ derivative_image_bytes=deriv_bytes,
158
+ derivative_width=deriv_w,
159
+ derivative_height=deriv_h,
160
+ corpus_profile=corpus_profile,
161
+ model_config=model_config,
162
+ page_id=page.id,
163
+ manuscript_id=manuscript.id,
164
+ corpus_slug=corpus.slug,
165
+ folio_label=page.folio_label,
166
+ sequence=page.sequence,
167
+ image_info=image_source_info,
168
+ base_data_dir=data_dir,
169
+ project_root=_PROJECT_ROOT,
170
+ )
171
+
172
+ elif image_source.startswith(("http://", "https://")):
173
+ # ── Mode fallback URL : télécharge + stocke sur disque (legacy) ──
174
  image_info = fetch_and_normalize(
175
  image_source, corpus.slug, page.folio_label, data_dir
176
  )
177
+ page_master = run_primary_analysis(
178
+ derivative_image_path=Path(image_info.derivative_path),
179
+ corpus_profile=corpus_profile,
180
+ model_config=model_config,
181
+ page_id=page.id,
182
+ manuscript_id=manuscript.id,
183
+ corpus_slug=corpus.slug,
184
+ folio_label=page.folio_label,
185
+ sequence=page.sequence,
186
+ image_info=image_info,
187
+ base_data_dir=data_dir,
188
+ project_root=_PROJECT_ROOT,
189
+ )
190
+
191
  elif image_source:
192
+ # ── Mode fichier local (upload) ──────────────────────────────────
 
 
 
193
  source_path = Path(image_source).resolve()
194
  data_dir_resolved = data_dir.resolve()
195
  if not str(source_path).startswith(str(data_dir_resolved) + "/") and source_path != data_dir_resolved:
 
201
  image_info = create_derivatives(
202
  source_bytes, image_source, corpus.slug, page.folio_label, data_dir
203
  )
204
+ page_master = run_primary_analysis(
205
+ derivative_image_path=Path(image_info.derivative_path),
206
+ corpus_profile=corpus_profile,
207
+ model_config=model_config,
208
+ page_id=page.id,
209
+ manuscript_id=manuscript.id,
210
+ corpus_slug=corpus.slug,
211
+ folio_label=page.folio_label,
212
+ sequence=page.sequence,
213
+ image_info=image_info,
214
+ base_data_dir=data_dir,
215
+ project_root=_PROJECT_ROOT,
216
+ )
217
+
218
  else:
219
  raise ValueError(
220
  f"La page {page.id} n'a pas d'image source "
221
+ "(ni iiif_service_url, ni image_master_path)"
222
  )
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  # ── 7. Générer et écrire l'ALTO XML ──────────────────────────────────
225
  from app.services.export.alto import generate_alto, write_alto
226
 
backend/tests/test_ai_analyzer.py CHANGED
@@ -663,8 +663,9 @@ def test_run_primary_analysis_image_dict(tmp_path):
663
  )
664
 
665
  assert result.image.master == image_info.original_url
666
- assert result.image.width == image_info.derivative_width
667
- assert result.image.height == image_info.derivative_height
 
668
 
669
 
670
  def test_run_primary_analysis_regions_in_layout(tmp_path):
@@ -866,3 +867,98 @@ def test_run_primary_analysis_invalid_region_skipped(tmp_path):
866
 
867
  assert len(result.layout["regions"]) == 1
868
  assert result.layout["regions"][0]["id"] == "r_good"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
663
  )
664
 
665
  assert result.image.master == image_info.original_url
666
+ # L'analyzer stocke désormais les dimensions originales (pas celles du dérivé)
667
+ assert result.image.width == image_info.original_width
668
+ assert result.image.height == image_info.original_height
669
 
670
 
671
  def test_run_primary_analysis_regions_in_layout(tmp_path):
 
867
 
868
  assert len(result.layout["regions"]) == 1
869
  assert result.layout["regions"][0]["id"] == "r_good"
870
+
871
+
872
+ # ---------------------------------------------------------------------------
873
+ # Mode IIIF natif — bytes en mémoire
874
+ # ---------------------------------------------------------------------------
875
+
876
+ from app.schemas.image import ImageSourceInfo
877
+
878
+
879
def _make_image_source_info() -> ImageSourceInfo:
    """Build the IIIF ``ImageSourceInfo`` fixture (3543x4724 canvas) used by the IIIF-native tests."""
    fields = {
        "original_url": "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432314s/f29/full/max/0/default.jpg",
        "iiif_service_url": "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432314s/f29",
        "manifest_url": "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432314s/manifest.json",
        "is_iiif": True,
        "original_width": 3543,
        "original_height": 4724,
    }
    return ImageSourceInfo(**fields)
888
+
889
+
890
def test_run_primary_analysis_iiif_bytes_mode(tmp_path):
    """IIIF-native mode: the image is passed as in-memory bytes, not as a file path."""
    prompt_rel = "prompts/medieval-illuminated/primary_v1.txt"
    _setup_prompt_file(tmp_path, prompt_rel)

    payload = _make_jpeg_bytes(200, 300)
    provider = _make_mock_provider(_valid_ai_json())

    with patch("app.services.ai.analyzer.get_provider", return_value=provider):
        result = run_primary_analysis(
            derivative_image_bytes=payload,
            derivative_width=200,
            derivative_height=300,
            corpus_profile=_make_corpus_profile(prompt_rel_path=prompt_rel),
            model_config=_make_model_config(),
            page_id="test-iiif-0001r",
            manuscript_id="ms-test",
            corpus_slug="test-corpus",
            folio_label="0001r",
            sequence=1,
            image_info=_make_image_source_info(),
            base_data_dir=tmp_path / "data",
            project_root=tmp_path,
        )

    image = result.image
    assert image.iiif_service_url == "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432314s/f29"
    assert image.manifest_url is not None
    # No on-disk derivative is recorded in IIIF-native mode.
    assert image.derivative_web is None
    # Stored dimensions are those of the original canvas, not of the derivative.
    assert image.width == 3543
    assert image.height == 4724
920
+
921
+
922
def test_run_primary_analysis_iiif_bbox_scaling(tmp_path):
    """Bounding boxes are scaled from derivative space to the original canvas."""
    prompt_rel = "prompts/medieval-illuminated/primary_v1.txt"
    _setup_prompt_file(tmp_path, prompt_rel)

    # Original canvas is 4000x6000; the derivative is declared as 1000x1500.
    source_info = ImageSourceInfo(
        original_url="https://example.com/img",
        iiif_service_url="https://example.com/img",
        is_iiif=True,
        original_width=4000,
        original_height=6000,
    )

    # AI answer with a bbox expressed in derivative space (1000x1500).
    region = {"id": "r1", "type": "text_block", "bbox": [100, 200, 500, 300], "confidence": 0.9}
    ai_payload = {
        "layout": {"regions": [region]},
        "ocr": {"diplomatic_text": "test", "language": "la", "confidence": 0.8},
    }
    provider = _make_mock_provider(json.dumps(ai_payload))

    with patch("app.services.ai.analyzer.get_provider", return_value=provider):
        result = run_primary_analysis(
            derivative_image_bytes=_make_jpeg_bytes(100, 150),
            derivative_width=1000,
            derivative_height=1500,
            corpus_profile=_make_corpus_profile(prompt_rel_path=prompt_rel),
            model_config=_make_model_config(),
            page_id="test-scale-0001r",
            manuscript_id="ms-test",
            corpus_slug="test-corpus",
            folio_label="0001r",
            sequence=1,
            image_info=source_info,
            base_data_dir=tmp_path / "data",
            project_root=tmp_path,
        )

    # Scale factors: 4000/1000 = 4.0 and 6000/1500 = 4.0.
    scaled = result.layout["regions"][0]["bbox"]
    assert scaled == [400, 800, 2000, 1200]  # 100*4, 200*4, 500*4, 300*4