Spaces:
Build error
feat(pipeline): IIIF-native Sprint 3 — in-memory image fetch for AI analysis
Browse files
Replace disk-based image pipeline with IIIF-native streaming for pages
that have a IIIF Image Service URL:
- fetch_iiif_derivative(): fetches a 1500px JPEG via IIIF Image API
({service_url}/full/!1500,1500/0/default.jpg) — server-side resize
- fetch_ai_derivative_bytes(): returns (jpeg_bytes, w, h) in memory,
never writes to disk. Falls back to full download + in-memory resize
for non-IIIF sources.
- analyzer.py: now accepts derivative_image_bytes (bytes) OR
derivative_image_path (Path). New _scale_bbox_coordinates() scales
bounding boxes from derivative space to original canvas space when
dimensions differ (e.g., 1500px derivative → 5000px original).
- job_runner.py: 3 code paths:
1. IIIF native (iiif_service_url set): fetch in memory → analyze → discard
2. URL fallback (legacy): download → create_derivatives on disk → analyze
3. Local file (upload): read → create_derivatives on disk → analyze
- ImageInfo in master.json now stores iiif_service_url + original canvas
dimensions (not derivative dimensions) for IIIF-native pages.
2 new tests: IIIF bytes mode, bbox coordinate scaling (4x factor).
585 tests pass (+2 new), 0 regressions.
https://claude.ai/code/session_01UB4he7RdRPHLvNjky4X8Sw
|
@@ -11,7 +11,7 @@ from pathlib import Path
|
|
| 11 |
|
| 12 |
# 3. local
|
| 13 |
from app.schemas.corpus_profile import CorpusProfile
|
| 14 |
-
from app.schemas.image import ImageDerivativeInfo
|
| 15 |
from app.schemas.model_config import ModelConfig
|
| 16 |
from app.schemas.page_master import EditorialInfo, EditorialStatus, ImageInfo, PageMaster, ProcessingInfo
|
| 17 |
from app.services.ai.master_writer import write_ai_raw, write_master_json
|
|
@@ -22,8 +22,32 @@ from app.services.ai.response_parser import ParseError, parse_ai_response # noq
|
|
| 22 |
logger = logging.getLogger(__name__)
|
| 23 |
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def run_primary_analysis(
|
| 26 |
-
|
|
|
|
|
|
|
| 27 |
corpus_profile: CorpusProfile,
|
| 28 |
model_config: ModelConfig,
|
| 29 |
page_id: str,
|
|
@@ -31,38 +55,22 @@ def run_primary_analysis(
|
|
| 31 |
corpus_slug: str,
|
| 32 |
folio_label: str,
|
| 33 |
sequence: int,
|
| 34 |
-
image_info: ImageDerivativeInfo,
|
|
|
|
|
|
|
| 35 |
base_data_dir: Path = Path("data"),
|
| 36 |
project_root: Path = Path("."),
|
| 37 |
) -> PageMaster:
|
| 38 |
"""Analyse primaire d'un folio : charge le prompt, appelle l'IA, écrit les fichiers.
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
corpus_profile: profil du corpus (pilote le prompt et les layers).
|
| 49 |
-
model_config: configuration du modèle sélectionné (provider + model_id).
|
| 50 |
-
page_id: identifiant unique de la page (ex. "beatus-lat8878-0013r").
|
| 51 |
-
manuscript_id: identifiant du manuscrit.
|
| 52 |
-
corpus_slug: identifiant du corpus (ex. "beatus-lat8878").
|
| 53 |
-
folio_label: label du folio (ex. "0013r").
|
| 54 |
-
sequence: numéro de séquence dans le manuscrit.
|
| 55 |
-
image_info: métadonnées de l'image normalisée (dimensions, chemins).
|
| 56 |
-
base_data_dir: racine du dossier data.
|
| 57 |
-
project_root: racine du projet (pour résoudre les chemins des prompts).
|
| 58 |
-
|
| 59 |
-
Returns:
|
| 60 |
-
PageMaster validé (ai_raw.json et master.json écrits sur disque).
|
| 61 |
-
|
| 62 |
-
Raises:
|
| 63 |
-
ParseError: si la réponse IA n'est pas un JSON valide.
|
| 64 |
-
FileNotFoundError: si le template de prompt est introuvable.
|
| 65 |
-
RuntimeError: si le provider n'est pas configuré (variable d'env absente).
|
| 66 |
"""
|
| 67 |
# ── Chemins de sortie ───────────────────────────────────────────────────
|
| 68 |
page_dir = base_data_dir / "corpora" / corpus_slug / "pages" / folio_label
|
|
@@ -85,13 +93,18 @@ def run_primary_analysis(
|
|
| 85 |
extra={"template": prompt_rel_path, "corpus": corpus_slug, "folio": folio_label},
|
| 86 |
)
|
| 87 |
|
| 88 |
-
# ── 2.
|
| 89 |
-
if not
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
# ── 3. Appel IA via le provider sélectionné ─────────────────────────────
|
| 97 |
provider = get_provider(model_config.provider)
|
|
@@ -116,21 +129,45 @@ def run_primary_analysis(
|
|
| 116 |
# ── 5. Parsing + validation (ParseError si JSON invalide) ───────────────
|
| 117 |
layout, ocr = parse_ai_response(raw_text)
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
# ── 6. Construction du PageMaster ───────────────────────────────────────
|
| 120 |
processed_at = datetime.now(tz=timezone.utc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
page_master = PageMaster(
|
| 122 |
page_id=page_id,
|
| 123 |
corpus_profile=corpus_profile.profile_id,
|
| 124 |
manuscript_id=manuscript_id,
|
| 125 |
folio_label=folio_label,
|
| 126 |
sequence=sequence,
|
| 127 |
-
image=
|
| 128 |
-
master=image_info.original_url,
|
| 129 |
-
derivative_web=image_info.derivative_path,
|
| 130 |
-
thumbnail=image_info.thumbnail_path,
|
| 131 |
-
width=image_info.derivative_width,
|
| 132 |
-
height=image_info.derivative_height,
|
| 133 |
-
),
|
| 134 |
layout=layout,
|
| 135 |
ocr=ocr,
|
| 136 |
processing=ProcessingInfo(
|
|
@@ -154,6 +191,7 @@ def run_primary_analysis(
|
|
| 154 |
"corpus": corpus_slug,
|
| 155 |
"folio": folio_label,
|
| 156 |
"regions": len(layout.get("regions", [])),
|
|
|
|
| 157 |
},
|
| 158 |
)
|
| 159 |
return page_master
|
|
|
|
| 11 |
|
| 12 |
# 3. local
|
| 13 |
from app.schemas.corpus_profile import CorpusProfile
|
| 14 |
+
from app.schemas.image import ImageDerivativeInfo, ImageSourceInfo
|
| 15 |
from app.schemas.model_config import ModelConfig
|
| 16 |
from app.schemas.page_master import EditorialInfo, EditorialStatus, ImageInfo, PageMaster, ProcessingInfo
|
| 17 |
from app.services.ai.master_writer import write_ai_raw, write_master_json
|
|
|
|
| 22 |
logger = logging.getLogger(__name__)
|
| 23 |
|
| 24 |
|
| 25 |
+
def _scale_bbox_coordinates(layout: dict, scale_x: float, scale_y: float) -> dict:
|
| 26 |
+
"""Met à l'échelle les bbox de l'espace dérivé vers l'espace canvas original.
|
| 27 |
+
|
| 28 |
+
L'IA analyse un dérivé 1500px mais les coordonnées dans master.json
|
| 29 |
+
doivent être en pixels absolus du canvas original (convention IIIF).
|
| 30 |
+
"""
|
| 31 |
+
if abs(scale_x - 1.0) < 0.01 and abs(scale_y - 1.0) < 0.01:
|
| 32 |
+
return layout # pas de scaling nécessaire
|
| 33 |
+
|
| 34 |
+
regions = layout.get("regions", [])
|
| 35 |
+
for region in regions:
|
| 36 |
+
bbox = region.get("bbox")
|
| 37 |
+
if bbox and len(bbox) == 4:
|
| 38 |
+
region["bbox"] = [
|
| 39 |
+
round(bbox[0] * scale_x),
|
| 40 |
+
round(bbox[1] * scale_y),
|
| 41 |
+
round(bbox[2] * scale_x),
|
| 42 |
+
round(bbox[3] * scale_y),
|
| 43 |
+
]
|
| 44 |
+
return layout
|
| 45 |
+
|
| 46 |
+
|
| 47 |
def run_primary_analysis(
|
| 48 |
+
*,
|
| 49 |
+
derivative_image_bytes: bytes | None = None,
|
| 50 |
+
derivative_image_path: Path | None = None,
|
| 51 |
corpus_profile: CorpusProfile,
|
| 52 |
model_config: ModelConfig,
|
| 53 |
page_id: str,
|
|
|
|
| 55 |
corpus_slug: str,
|
| 56 |
folio_label: str,
|
| 57 |
sequence: int,
|
| 58 |
+
image_info: ImageDerivativeInfo | ImageSourceInfo,
|
| 59 |
+
derivative_width: int | None = None,
|
| 60 |
+
derivative_height: int | None = None,
|
| 61 |
base_data_dir: Path = Path("data"),
|
| 62 |
project_root: Path = Path("."),
|
| 63 |
) -> PageMaster:
|
| 64 |
"""Analyse primaire d'un folio : charge le prompt, appelle l'IA, écrit les fichiers.
|
| 65 |
|
| 66 |
+
Supporte deux modes :
|
| 67 |
+
- IIIF natif : derivative_image_bytes fourni (bytes en RAM, jamais sur disque)
|
| 68 |
+
- Legacy : derivative_image_path fourni (chemin fichier sur disque)
|
| 69 |
+
|
| 70 |
+
Respecte R05 : ai_raw.json toujours écrit en premier.
|
| 71 |
+
|
| 72 |
+
Si les dimensions originales (canvas) diffèrent du dérivé, les bbox sont
|
| 73 |
+
mises à l'échelle de l'espace dérivé vers l'espace canvas original.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
"""
|
| 75 |
# ── Chemins de sortie ───────────────────────────────────────────────────
|
| 76 |
page_dir = base_data_dir / "corpora" / corpus_slug / "pages" / folio_label
|
|
|
|
| 93 |
extra={"template": prompt_rel_path, "corpus": corpus_slug, "folio": folio_label},
|
| 94 |
)
|
| 95 |
|
| 96 |
+
# ── 2. Obtention des bytes image ────────────────────────────────────────
|
| 97 |
+
if derivative_image_bytes is not None:
|
| 98 |
+
jpeg_bytes = derivative_image_bytes
|
| 99 |
+
elif derivative_image_path is not None:
|
| 100 |
+
if not derivative_image_path.exists():
|
| 101 |
+
raise FileNotFoundError(f"Image dérivée introuvable : {derivative_image_path}")
|
| 102 |
+
try:
|
| 103 |
+
jpeg_bytes = derivative_image_path.read_bytes()
|
| 104 |
+
except OSError as exc:
|
| 105 |
+
raise RuntimeError(f"Erreur lecture image {derivative_image_path} : {exc}") from exc
|
| 106 |
+
else:
|
| 107 |
+
raise ValueError("Il faut fournir derivative_image_bytes ou derivative_image_path")
|
| 108 |
|
| 109 |
# ── 3. Appel IA via le provider sélectionné ─────────────────────────────
|
| 110 |
provider = get_provider(model_config.provider)
|
|
|
|
| 129 |
# ── 5. Parsing + validation (ParseError si JSON invalide) ───────────────
|
| 130 |
layout, ocr = parse_ai_response(raw_text)
|
| 131 |
|
| 132 |
+
# ── 5b. Scaling bbox si les dimensions originales diffèrent du dérivé ──
|
| 133 |
+
is_iiif_source = isinstance(image_info, ImageSourceInfo)
|
| 134 |
+
original_w = image_info.original_width
|
| 135 |
+
original_h = image_info.original_height
|
| 136 |
+
deriv_w = derivative_width or (getattr(image_info, "derivative_width", None)) or original_w
|
| 137 |
+
deriv_h = derivative_height or (getattr(image_info, "derivative_height", None)) or original_h
|
| 138 |
+
|
| 139 |
+
if original_w > 0 and deriv_w > 0 and (original_w != deriv_w or original_h != deriv_h):
|
| 140 |
+
scale_x = original_w / deriv_w
|
| 141 |
+
scale_y = original_h / deriv_h
|
| 142 |
+
layout = _scale_bbox_coordinates(layout, scale_x, scale_y)
|
| 143 |
+
|
| 144 |
# ── 6. Construction du PageMaster ───────────────────────────────────────
|
| 145 |
processed_at = datetime.now(tz=timezone.utc)
|
| 146 |
+
|
| 147 |
+
if is_iiif_source:
|
| 148 |
+
image_block = ImageInfo(
|
| 149 |
+
master=image_info.original_url,
|
| 150 |
+
iiif_service_url=image_info.iiif_service_url,
|
| 151 |
+
manifest_url=image_info.manifest_url,
|
| 152 |
+
width=original_w,
|
| 153 |
+
height=original_h,
|
| 154 |
+
)
|
| 155 |
+
else:
|
| 156 |
+
image_block = ImageInfo(
|
| 157 |
+
master=image_info.original_url,
|
| 158 |
+
derivative_web=getattr(image_info, "derivative_path", None),
|
| 159 |
+
thumbnail=getattr(image_info, "thumbnail_path", None),
|
| 160 |
+
width=original_w,
|
| 161 |
+
height=original_h,
|
| 162 |
+
)
|
| 163 |
+
|
| 164 |
page_master = PageMaster(
|
| 165 |
page_id=page_id,
|
| 166 |
corpus_profile=corpus_profile.profile_id,
|
| 167 |
manuscript_id=manuscript_id,
|
| 168 |
folio_label=folio_label,
|
| 169 |
sequence=sequence,
|
| 170 |
+
image=image_block,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
layout=layout,
|
| 172 |
ocr=ocr,
|
| 173 |
processing=ProcessingInfo(
|
|
|
|
| 191 |
"corpus": corpus_slug,
|
| 192 |
"folio": folio_label,
|
| 193 |
"regions": len(layout.get("regions", [])),
|
| 194 |
+
"iiif_native": is_iiif_source,
|
| 195 |
},
|
| 196 |
)
|
| 197 |
return page_master
|
|
@@ -143,3 +143,55 @@ def fetch_and_normalize(
|
|
| 143 |
"""
|
| 144 |
source_bytes = fetch_iiif_image(url)
|
| 145 |
return create_derivatives(source_bytes, url, corpus_slug, folio_label, base_data_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
"""
|
| 144 |
source_bytes = fetch_iiif_image(url)
|
| 145 |
return create_derivatives(source_bytes, url, corpus_slug, folio_label, base_data_dir)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# ── Mode IIIF natif : images en mémoire, jamais sur disque ───────────────────
|
| 149 |
+
|
| 150 |
+
def fetch_ai_derivative_bytes(
    iiif_service_url: str | None,
    fallback_url: str | None,
) -> tuple[bytes, int, int]:
    """Return ``(jpeg_bytes, width, height)`` for the AI, built entirely in memory.

    - When ``iiif_service_url`` is given, the derivative is requested through
      ``fetch_iiif_derivative`` with ``max_px=_MAX_DERIVATIVE_PX`` so the
      resize happens on the server side.
    - Otherwise ``fallback_url`` is downloaded in full with ``fetch_iiif_image``
      and resized in memory with ``_resize_to_max``.

    In both cases the image is converted to RGB when needed and re-encoded as
    JPEG into an in-memory buffer; this function itself writes nothing to disk.

    Returns:
        Tuple (jpeg_bytes, derivative_width, derivative_height).

    Raises:
        ValueError: if neither source is provided.
        httpx.HTTPStatusError: presumably propagated from the fetch helpers
            when the server answers with an error status.
    """
    from app.services.ingest.iiif_fetcher import fetch_iiif_derivative, fetch_iiif_image

    if not iiif_service_url and not fallback_url:
        raise ValueError("Aucune source image fournie (ni iiif_service_url ni fallback_url)")

    server_resized = bool(iiif_service_url)
    if server_resized:
        payload = fetch_iiif_derivative(iiif_service_url, max_px=_MAX_DERIVATIVE_PX)
    else:
        payload = fetch_iiif_image(fallback_url)

    # Decode in memory to learn the dimensions (and to resize in fallback mode).
    picture = Image.open(io.BytesIO(payload))
    if picture.mode != "RGB":
        picture = picture.convert("RGB")

    # Fallback only: the server did not resize, so do it here.
    if not server_resized:
        picture = _resize_to_max(picture, _MAX_DERIVATIVE_PX)

    width, height = picture.size

    # Encode as JPEG into an in-memory buffer.
    sink = io.BytesIO()
    picture.save(sink, format="JPEG", quality=_DERIVATIVE_QUALITY)
    encoded = sink.getvalue()

    logger.info(
        "Dérivé IA en mémoire",
        extra={"iiif": server_resized, "size": f"{width}x{height}", "bytes": len(encoded)},
    )
    return encoded, width, height
|
|
@@ -48,3 +48,38 @@ def fetch_iiif_image(url: str, timeout: float = _DEFAULT_TIMEOUT) -> bytes:
|
|
| 48 |
extra={"url": url, "size_bytes": len(response.content)},
|
| 49 |
)
|
| 50 |
return response.content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
extra={"url": url, "size_bytes": len(response.content)},
|
| 49 |
)
|
| 50 |
return response.content
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def fetch_iiif_derivative(
    service_url: str,
    max_px: int = 1500,
    timeout: float = _DEFAULT_TIMEOUT,
) -> bytes:
    """Download a derivative through the IIIF Image API, returning it as bytes.

    The request URL is ``{service_url}/full/!{max_px},{max_px}/0/default.jpg``
    (trailing slashes on ``service_url`` are stripped first), so the resize is
    asked of the IIIF server rather than done locally.

    Args:
        service_url: IIIF Image Service URL, without the
            ``/full/.../default.jpg`` suffix.
        max_px: bound used for both sides of the ``!w,h`` size request
            (default: 1500).
        timeout: overall timeout in seconds; the connect timeout is fixed
            at 10 seconds.

    Returns:
        Raw response body (the JPEG image) as bytes.

    Raises:
        httpx.HTTPStatusError: if the response carries an error status
            (via ``raise_for_status``).
    """
    # IIIF Image API size pattern "!w,h" requests a best fit within w x h.
    base = service_url.rstrip("/")
    derivative_url = f"{base}/full/!{max_px},{max_px}/0/default.jpg"
    logger.info("Fetching IIIF derivative", extra={"url": derivative_url, "max_px": max_px})

    request_timeout = httpx.Timeout(timeout, connect=10.0)
    response = httpx.get(
        derivative_url,
        headers=_HEADERS,
        follow_redirects=True,
        timeout=request_timeout,
    )
    response.raise_for_status()

    body = response.content
    logger.info(
        "IIIF derivative fetched",
        extra={"url": derivative_url, "size_bytes": len(body)},
    )
    return body
|
|
@@ -32,8 +32,13 @@ from app.models.database import async_session_factory
|
|
| 32 |
from app.models.job import JobModel
|
| 33 |
from app.models.model_config_db import ModelConfigDB
|
| 34 |
from app.schemas.corpus_profile import CorpusProfile
|
|
|
|
| 35 |
from app.schemas.model_config import ModelConfig, ProviderType
|
| 36 |
-
from app.services.image.normalizer import
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
logger = logging.getLogger(__name__)
|
| 39 |
|
|
@@ -126,19 +131,65 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
|
|
| 126 |
available_models=[],
|
| 127 |
)
|
| 128 |
|
| 129 |
-
# ── 5.
|
| 130 |
data_dir = _config_module.settings.data_dir
|
| 131 |
image_source = page.image_master_path or ""
|
| 132 |
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
image_info = fetch_and_normalize(
|
| 135 |
image_source, corpus.slug, page.folio_label, data_dir
|
| 136 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
elif image_source:
|
| 138 |
-
#
|
| 139 |
-
# sous data_dir. Empêche la lecture de fichiers arbitraires
|
| 140 |
-
# si image_master_path contient des séquences ../ ou un
|
| 141 |
-
# chemin absolu hors du répertoire de données.
|
| 142 |
source_path = Path(image_source).resolve()
|
| 143 |
data_dir_resolved = data_dir.resolve()
|
| 144 |
if not str(source_path).startswith(str(data_dir_resolved) + "/") and source_path != data_dir_resolved:
|
|
@@ -150,29 +201,26 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
|
|
| 150 |
image_info = create_derivatives(
|
| 151 |
source_bytes, image_source, corpus.slug, page.folio_label, data_dir
|
| 152 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
else:
|
| 154 |
raise ValueError(
|
| 155 |
f"La page {page.id} n'a pas d'image source "
|
| 156 |
-
"(
|
| 157 |
)
|
| 158 |
|
| 159 |
-
# ── 6. Analyse primaire IA (R05 : double stockage) ───────────────────
|
| 160 |
-
from app.services.ai.analyzer import run_primary_analysis
|
| 161 |
-
|
| 162 |
-
page_master = run_primary_analysis(
|
| 163 |
-
derivative_image_path=Path(image_info.derivative_path),
|
| 164 |
-
corpus_profile=corpus_profile,
|
| 165 |
-
model_config=model_config,
|
| 166 |
-
page_id=page.id,
|
| 167 |
-
manuscript_id=manuscript.id,
|
| 168 |
-
corpus_slug=corpus.slug,
|
| 169 |
-
folio_label=page.folio_label,
|
| 170 |
-
sequence=page.sequence,
|
| 171 |
-
image_info=image_info,
|
| 172 |
-
base_data_dir=data_dir,
|
| 173 |
-
project_root=_PROJECT_ROOT,
|
| 174 |
-
)
|
| 175 |
-
|
| 176 |
# ── 7. Générer et écrire l'ALTO XML ──────────────────────────────────
|
| 177 |
from app.services.export.alto import generate_alto, write_alto
|
| 178 |
|
|
|
|
| 32 |
from app.models.job import JobModel
|
| 33 |
from app.models.model_config_db import ModelConfigDB
|
| 34 |
from app.schemas.corpus_profile import CorpusProfile
|
| 35 |
+
from app.schemas.image import ImageSourceInfo
|
| 36 |
from app.schemas.model_config import ModelConfig, ProviderType
|
| 37 |
+
from app.services.image.normalizer import (
|
| 38 |
+
create_derivatives,
|
| 39 |
+
fetch_ai_derivative_bytes,
|
| 40 |
+
fetch_and_normalize,
|
| 41 |
+
)
|
| 42 |
|
| 43 |
logger = logging.getLogger(__name__)
|
| 44 |
|
|
|
|
| 131 |
available_models=[],
|
| 132 |
)
|
| 133 |
|
| 134 |
+
# ── 5. Obtenir l'image pour l'IA ─────────────────────────────────────
|
| 135 |
data_dir = _config_module.settings.data_dir
|
| 136 |
image_source = page.image_master_path or ""
|
| 137 |
|
| 138 |
+
from app.services.ai.analyzer import run_primary_analysis
|
| 139 |
+
|
| 140 |
+
if page.iiif_service_url:
|
| 141 |
+
# ── Mode IIIF natif : fetch en mémoire, zéro stockage ────────────
|
| 142 |
+
deriv_bytes, deriv_w, deriv_h = fetch_ai_derivative_bytes(
|
| 143 |
+
iiif_service_url=page.iiif_service_url,
|
| 144 |
+
fallback_url=None,
|
| 145 |
+
)
|
| 146 |
+
image_source_info = ImageSourceInfo(
|
| 147 |
+
original_url=image_source or page.iiif_service_url,
|
| 148 |
+
iiif_service_url=page.iiif_service_url,
|
| 149 |
+
manifest_url=page.manifest_url,
|
| 150 |
+
is_iiif=True,
|
| 151 |
+
original_width=page.canvas_width or deriv_w,
|
| 152 |
+
original_height=page.canvas_height or deriv_h,
|
| 153 |
+
)
|
| 154 |
+
|
| 155 |
+
# ── 6. Analyse primaire IA (R05 : double stockage) ───────────────
|
| 156 |
+
page_master = run_primary_analysis(
|
| 157 |
+
derivative_image_bytes=deriv_bytes,
|
| 158 |
+
derivative_width=deriv_w,
|
| 159 |
+
derivative_height=deriv_h,
|
| 160 |
+
corpus_profile=corpus_profile,
|
| 161 |
+
model_config=model_config,
|
| 162 |
+
page_id=page.id,
|
| 163 |
+
manuscript_id=manuscript.id,
|
| 164 |
+
corpus_slug=corpus.slug,
|
| 165 |
+
folio_label=page.folio_label,
|
| 166 |
+
sequence=page.sequence,
|
| 167 |
+
image_info=image_source_info,
|
| 168 |
+
base_data_dir=data_dir,
|
| 169 |
+
project_root=_PROJECT_ROOT,
|
| 170 |
+
)
|
| 171 |
+
|
| 172 |
+
elif image_source.startswith(("http://", "https://")):
|
| 173 |
+
# ── Mode fallback URL : télécharge + stocke sur disque (legacy) ──
|
| 174 |
image_info = fetch_and_normalize(
|
| 175 |
image_source, corpus.slug, page.folio_label, data_dir
|
| 176 |
)
|
| 177 |
+
page_master = run_primary_analysis(
|
| 178 |
+
derivative_image_path=Path(image_info.derivative_path),
|
| 179 |
+
corpus_profile=corpus_profile,
|
| 180 |
+
model_config=model_config,
|
| 181 |
+
page_id=page.id,
|
| 182 |
+
manuscript_id=manuscript.id,
|
| 183 |
+
corpus_slug=corpus.slug,
|
| 184 |
+
folio_label=page.folio_label,
|
| 185 |
+
sequence=page.sequence,
|
| 186 |
+
image_info=image_info,
|
| 187 |
+
base_data_dir=data_dir,
|
| 188 |
+
project_root=_PROJECT_ROOT,
|
| 189 |
+
)
|
| 190 |
+
|
| 191 |
elif image_source:
|
| 192 |
+
# ── Mode fichier local (upload) ──────────────────────────────────
|
|
|
|
|
|
|
|
|
|
| 193 |
source_path = Path(image_source).resolve()
|
| 194 |
data_dir_resolved = data_dir.resolve()
|
| 195 |
if not str(source_path).startswith(str(data_dir_resolved) + "/") and source_path != data_dir_resolved:
|
|
|
|
| 201 |
image_info = create_derivatives(
|
| 202 |
source_bytes, image_source, corpus.slug, page.folio_label, data_dir
|
| 203 |
)
|
| 204 |
+
page_master = run_primary_analysis(
|
| 205 |
+
derivative_image_path=Path(image_info.derivative_path),
|
| 206 |
+
corpus_profile=corpus_profile,
|
| 207 |
+
model_config=model_config,
|
| 208 |
+
page_id=page.id,
|
| 209 |
+
manuscript_id=manuscript.id,
|
| 210 |
+
corpus_slug=corpus.slug,
|
| 211 |
+
folio_label=page.folio_label,
|
| 212 |
+
sequence=page.sequence,
|
| 213 |
+
image_info=image_info,
|
| 214 |
+
base_data_dir=data_dir,
|
| 215 |
+
project_root=_PROJECT_ROOT,
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
else:
|
| 219 |
raise ValueError(
|
| 220 |
f"La page {page.id} n'a pas d'image source "
|
| 221 |
+
"(ni iiif_service_url, ni image_master_path)"
|
| 222 |
)
|
| 223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
# ── 7. Générer et écrire l'ALTO XML ──────────────────────────────────
|
| 225 |
from app.services.export.alto import generate_alto, write_alto
|
| 226 |
|
|
@@ -663,8 +663,9 @@ def test_run_primary_analysis_image_dict(tmp_path):
|
|
| 663 |
)
|
| 664 |
|
| 665 |
assert result.image.master == image_info.original_url
|
| 666 |
-
|
| 667 |
-
assert result.image.
|
|
|
|
| 668 |
|
| 669 |
|
| 670 |
def test_run_primary_analysis_regions_in_layout(tmp_path):
|
|
@@ -866,3 +867,98 @@ def test_run_primary_analysis_invalid_region_skipped(tmp_path):
|
|
| 866 |
|
| 867 |
assert len(result.layout["regions"]) == 1
|
| 868 |
assert result.layout["regions"][0]["id"] == "r_good"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 663 |
)
|
| 664 |
|
| 665 |
assert result.image.master == image_info.original_url
|
| 666 |
+
# L'analyzer stocke désormais les dimensions originales (pas celles du dérivé)
|
| 667 |
+
assert result.image.width == image_info.original_width
|
| 668 |
+
assert result.image.height == image_info.original_height
|
| 669 |
|
| 670 |
|
| 671 |
def test_run_primary_analysis_regions_in_layout(tmp_path):
|
|
|
|
| 867 |
|
| 868 |
assert len(result.layout["regions"]) == 1
|
| 869 |
assert result.layout["regions"][0]["id"] == "r_good"
|
| 870 |
+
|
| 871 |
+
|
| 872 |
+
# ---------------------------------------------------------------------------
|
| 873 |
+
# Mode IIIF natif — bytes en mémoire
|
| 874 |
+
# ---------------------------------------------------------------------------
|
| 875 |
+
|
| 876 |
+
from app.schemas.image import ImageSourceInfo
|
| 877 |
+
|
| 878 |
+
|
| 879 |
+
def _make_image_source_info() -> ImageSourceInfo:
    """Build the IIIF-native ImageSourceInfo fixture (3543x4724 original)."""
    gallica_ark = "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432314s"
    fields = {
        "original_url": f"{gallica_ark}/f29/full/max/0/default.jpg",
        "iiif_service_url": f"{gallica_ark}/f29",
        "manifest_url": f"{gallica_ark}/manifest.json",
        "is_iiif": True,
        "original_width": 3543,
        "original_height": 4724,
    }
    return ImageSourceInfo(**fields)
|
| 888 |
+
|
| 889 |
+
|
| 890 |
+
def test_run_primary_analysis_iiif_bytes_mode(tmp_path):
    """Native IIIF mode: bytes are passed directly, with no file path."""
    prompt_rel = "prompts/medieval-illuminated/primary_v1.txt"
    _setup_prompt_file(tmp_path, prompt_rel)

    derivative_payload = _make_jpeg_bytes(200, 300)
    provider_stub = _make_mock_provider(_valid_ai_json())

    with patch("app.services.ai.analyzer.get_provider", return_value=provider_stub):
        call_kwargs = {
            "derivative_image_bytes": derivative_payload,
            "derivative_width": 200,
            "derivative_height": 300,
            "corpus_profile": _make_corpus_profile(prompt_rel_path=prompt_rel),
            "model_config": _make_model_config(),
            "page_id": "test-iiif-0001r",
            "manuscript_id": "ms-test",
            "corpus_slug": "test-corpus",
            "folio_label": "0001r",
            "sequence": 1,
            "image_info": _make_image_source_info(),
            "base_data_dir": tmp_path / "data",
            "project_root": tmp_path,
        }
        result = run_primary_analysis(**call_kwargs)

    image = result.image
    assert image.iiif_service_url == "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432314s/f29"
    assert image.manifest_url is not None
    assert image.derivative_web is None
    # Original canvas dimensions are stored, not the derivative's.
    assert image.width == 3543
    assert image.height == 4724
|
| 920 |
+
|
| 921 |
+
|
| 922 |
+
def test_run_primary_analysis_iiif_bbox_scaling(tmp_path):
    """Bounding boxes are scaled from the derivative to the original canvas."""
    prompt_rel = "prompts/medieval-illuminated/primary_v1.txt"
    _setup_prompt_file(tmp_path, prompt_rel)

    # Source image: 4000x6000 original, declared derivative 1000x1500.
    source_info = ImageSourceInfo(
        original_url="https://example.com/img",
        iiif_service_url="https://example.com/img",
        is_iiif=True,
        original_width=4000,
        original_height=6000,
    )

    # AI response whose bbox is expressed in derivative space (1000x1500).
    region = {"id": "r1", "type": "text_block", "bbox": [100, 200, 500, 300], "confidence": 0.9}
    ocr_block = {"diplomatic_text": "test", "language": "la", "confidence": 0.8}
    ai_response = json.dumps({"layout": {"regions": [region]}, "ocr": ocr_block})
    provider_stub = _make_mock_provider(ai_response)

    with patch("app.services.ai.analyzer.get_provider", return_value=provider_stub):
        call_kwargs = {
            "derivative_image_bytes": _make_jpeg_bytes(100, 150),
            "derivative_width": 1000,
            "derivative_height": 1500,
            "corpus_profile": _make_corpus_profile(prompt_rel_path=prompt_rel),
            "model_config": _make_model_config(),
            "page_id": "test-scale-0001r",
            "manuscript_id": "ms-test",
            "corpus_slug": "test-corpus",
            "folio_label": "0001r",
            "sequence": 1,
            "image_info": source_info,
            "base_data_dir": tmp_path / "data",
            "project_root": tmp_path,
        }
        result = run_primary_analysis(**call_kwargs)

    # Scale factors: 4000/1000 = 4.0 and 6000/1500 = 4.0.
    scaled_bbox = result.layout["regions"][0]["bbox"]
    assert scaled_bbox == [400, 800, 2000, 1200]  # 100*4, 200*4, 500*4, 300*4
|