Claude committed on
Commit
f338ad4
·
unverified ·
1 Parent(s): 1865b8a

feat(ingest): IIIF-native Sprint 2 — detect IIIF Image Service at ingestion

Browse files

Extract IIIF Image Service URL and canvas dimensions during manifest ingestion:

- New _extract_iiif_service(): detects IIIF Image Service from canvas data
with 4-level cascade: IIIF 3.0 service[] → IIIF 2.x service → URL pattern
detection (Gallica, etc.) → fallback None (static image).
- New _detect_iiif_service_from_url(): extracts service URL from IIIF Image
API URL pattern ({base}/full/{size}/{rot}/{qual}.{fmt}).
- _create_page() now accepts iiif_service_url, canvas_width, canvas_height,
manifest_url and stores them in PageModel.
- ingest_iiif_manifest: calls _extract_iiif_service for each canvas, stores
service URL + dimensions + manifest_url on each page.
- ingest_iiif_images: attempts URL pattern detection on direct image URLs.

14 new tests covering: IIIF 3.0/2.x service detection, dict vs list service,
Gallica URL pattern, trailing slash stripping, empty canvas, static fallback.

583 tests pass (+14 new), 0 regressions.

https://claude.ai/code/session_01UB4he7RdRPHLvNjky4X8Sw

backend/app/api/v1/ingest.py CHANGED
@@ -144,6 +144,10 @@ async def _create_page(
144
  folio_label: str,
145
  sequence: int,
146
  image_master_path: str | None = None,
 
 
 
 
147
  ) -> PageModel | None:
148
  """Crée une page si elle n'existe pas déjà. Retourne None si l'ID est déjà pris."""
149
  existing = await db.get(PageModel, page_id)
@@ -157,6 +161,10 @@ async def _create_page(
157
  folio_label=folio_label,
158
  sequence=sequence,
159
  image_master_path=image_master_path,
 
 
 
 
160
  processing_status="INGESTED",
161
  )
162
  db.add(page)
@@ -214,6 +222,78 @@ def _extract_canvas_label(canvas: dict, index: int) -> str:
214
  return f"f{index + 1:03d}r"
215
 
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  def _extract_canvas_image_url(canvas: dict) -> str | None:
218
  """Extrait l'URL de l'image principale d'un canvas IIIF (3.0 ou 2.x)."""
219
  # IIIF 3.0
@@ -385,9 +465,14 @@ async def ingest_iiif_manifest(
385
  folio_label = labels[i]
386
  page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
387
  image_url = _extract_canvas_image_url(canvas)
 
388
  page = await _create_page(
389
  db, ms.id, page_id, folio_label, seq + i,
390
  image_master_path=image_url,
 
 
 
 
391
  )
392
  if page is None:
393
  skipped += 1
@@ -406,7 +491,13 @@ async def ingest_iiif_manifest(
406
 
407
  logger.info(
408
  "Manifest IIIF ingéré",
409
- extra={"corpus_id": corpus_id, "url": body.manifest_url, "created": len(created), "skipped": skipped},
 
 
 
 
 
 
410
  )
411
  return IngestResponse(
412
  corpus_id=corpus_id,
@@ -446,9 +537,12 @@ async def ingest_iiif_images(
446
  skipped = 0
447
  for i, (url, folio_label) in enumerate(zip(body.urls, sanitized_labels)):
448
  page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
 
 
449
  page = await _create_page(
450
  db, ms.id, page_id, folio_label, seq + i,
451
  image_master_path=url,
 
452
  )
453
  if page is None:
454
  skipped += 1
 
144
  folio_label: str,
145
  sequence: int,
146
  image_master_path: str | None = None,
147
+ iiif_service_url: str | None = None,
148
+ canvas_width: int | None = None,
149
+ canvas_height: int | None = None,
150
+ manifest_url: str | None = None,
151
  ) -> PageModel | None:
152
  """Crée une page si elle n'existe pas déjà. Retourne None si l'ID est déjà pris."""
153
  existing = await db.get(PageModel, page_id)
 
161
  folio_label=folio_label,
162
  sequence=sequence,
163
  image_master_path=image_master_path,
164
+ iiif_service_url=iiif_service_url,
165
+ canvas_width=canvas_width,
166
+ canvas_height=canvas_height,
167
+ manifest_url=manifest_url,
168
  processing_status="INGESTED",
169
  )
170
  db.add(page)
 
222
  return f"f{index + 1:03d}r"
223
 
224
 
225
# IIIF Image API URL pattern: {service}/full/{size}/{rotation}/{quality}.{format}
#
# Group 1 captures the service base URL. Only full-region requests are
# recognised. Rotation accepts the optional mirroring "!" prefix and decimal
# degrees; quality accepts "default" (Image API 2.x/3.0), "native" (Image API
# 1.x) and the explicit "color" / "gray" / "bitonal" qualities.
_IIIF_IMAGE_API_RE = re.compile(
    r"^(https?://.+)/full/[^/]+/!?\d+(?:\.\d+)?"
    r"/(?:default|native|color|gray|bitonal)\.\w+$"
)
229
+
230
+
231
+ def _extract_iiif_service(canvas: dict) -> tuple[str | None, int | None, int | None]:
232
+ """Détecte le IIIF Image Service d'un canvas et ses dimensions.
233
+
234
+ Retourne (service_url, canvas_width, canvas_height).
235
+ service_url est None si aucun service IIIF trouvé (image statique).
236
+ """
237
+ canvas_w = canvas.get("width")
238
+ canvas_h = canvas.get("height")
239
+
240
+ # ── IIIF 3.0 : body → service[] ─────────────────────────────────────
241
+ items = canvas.get("items") or []
242
+ if items:
243
+ ann_items = (items[0].get("items") or []) if items else []
244
+ if ann_items:
245
+ body = ann_items[0].get("body") or {}
246
+ if isinstance(body, dict):
247
+ # Chercher un service IIIF sur le body
248
+ services = body.get("service") or []
249
+ if isinstance(services, dict):
250
+ services = [services]
251
+ for svc in services:
252
+ svc_type = svc.get("type") or svc.get("@type") or ""
253
+ if "ImageService" in svc_type:
254
+ svc_url = (svc.get("id") or svc.get("@id") or "").rstrip("/")
255
+ if svc_url:
256
+ return svc_url, canvas_w, canvas_h
257
+
258
+ # Fallback : détecter le pattern Image API dans body.id
259
+ body_id = body.get("id") or body.get("@id") or ""
260
+ m = _IIIF_IMAGE_API_RE.match(body_id)
261
+ if m:
262
+ return m.group(1), canvas_w, canvas_h
263
+
264
+ # ── IIIF 2.x : resource → service ───────────────────────────────────
265
+ images = canvas.get("images") or []
266
+ if images:
267
+ resource = images[0].get("resource") or {}
268
+ services = resource.get("service") or []
269
+ if isinstance(services, dict):
270
+ services = [services]
271
+ for svc in services:
272
+ svc_type = svc.get("@type") or svc.get("type") or ""
273
+ if "ImageService" in svc_type:
274
+ svc_url = (svc.get("@id") or svc.get("id") or "").rstrip("/")
275
+ if svc_url:
276
+ return svc_url, canvas_w, canvas_h
277
+
278
+ # Fallback : pattern Image API dans resource @id
279
+ res_id = resource.get("@id") or resource.get("id") or ""
280
+ m = _IIIF_IMAGE_API_RE.match(res_id)
281
+ if m:
282
+ return m.group(1), canvas_w, canvas_h
283
+
284
+ return None, canvas_w, canvas_h
285
+
286
+
287
def _detect_iiif_service_from_url(url: str) -> str | None:
    """Derive an IIIF service base URL from a direct image URL.

    When ``url`` matches the IIIF Image API pattern
    ({base}/full/{size}/{rot}/{qual}.{fmt}) the ``{base}`` part is returned;
    any other URL yields None.
    """
    match = _IIIF_IMAGE_API_RE.match(url)
    if match is None:
        return None
    return match.group(1)
295
+
296
+
297
  def _extract_canvas_image_url(canvas: dict) -> str | None:
298
  """Extrait l'URL de l'image principale d'un canvas IIIF (3.0 ou 2.x)."""
299
  # IIIF 3.0
 
465
  folio_label = labels[i]
466
  page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
467
  image_url = _extract_canvas_image_url(canvas)
468
+ service_url, c_width, c_height = _extract_iiif_service(canvas)
469
  page = await _create_page(
470
  db, ms.id, page_id, folio_label, seq + i,
471
  image_master_path=image_url,
472
+ iiif_service_url=service_url,
473
+ canvas_width=c_width,
474
+ canvas_height=c_height,
475
+ manifest_url=body.manifest_url,
476
  )
477
  if page is None:
478
  skipped += 1
 
491
 
492
  logger.info(
493
  "Manifest IIIF ingéré",
494
+ extra={
495
+ "corpus_id": corpus_id,
496
+ "url": body.manifest_url,
497
+ "created": len(created),
498
+ "skipped": skipped,
499
+ "iiif_service_detected": sum(1 for p in created if p.iiif_service_url),
500
+ },
501
  )
502
  return IngestResponse(
503
  corpus_id=corpus_id,
 
537
  skipped = 0
538
  for i, (url, folio_label) in enumerate(zip(body.urls, sanitized_labels)):
539
  page_id = _make_page_id(corpus.slug, folio_label, seq + i, dupes)
540
+ # Tenter de détecter un service IIIF à partir du pattern URL
541
+ service_url = _detect_iiif_service_from_url(url)
542
  page = await _create_page(
543
  db, ms.id, page_id, folio_label, seq + i,
544
  image_master_path=url,
545
+ iiif_service_url=service_url,
546
  )
547
  if page is None:
548
  skipped += 1
backend/tests/test_iiif_service_detection.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests de détection du IIIF Image Service à l'ingestion.
3
+
4
+ Vérifie :
5
+ - Extraction depuis un canvas IIIF 3.0 avec ImageService3
6
+ - Extraction depuis un canvas IIIF 2.x avec service @id
7
+ - Détection par pattern URL (Gallica, etc.)
8
+ - Fallback quand aucun service n'est trouvé
9
+ - Détection depuis URL directe (ingest/iiif-images)
10
+ """
11
+ import pytest
12
+
13
+ from app.api.v1.ingest import (
14
+ _detect_iiif_service_from_url,
15
+ _extract_iiif_service,
16
+ )
17
+
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # _extract_iiif_service — IIIF 3.0
21
+ # ---------------------------------------------------------------------------
22
+
23
def test_extract_iiif3_with_image_service3():
    """IIIF 3.0 canvas whose body declares an explicit ImageService3."""
    image_service = {
        "id": "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432314s/f29",
        "type": "ImageService3",
        "profile": "level2",
    }
    painted_body = {
        "id": "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432314s/f29/full/max/0/default.jpg",
        "type": "Image",
        "service": [image_service],
    }
    canvas = {
        "width": 3543,
        "height": 4724,
        "items": [{"items": [{"body": painted_body}]}],
    }

    result = _extract_iiif_service(canvas)

    assert result[0] == "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432314s/f29"
    assert result[1] == 3543
    assert result[2] == 4724
46
+
47
+
48
def test_extract_iiif3_with_image_service2():
    """IIIF 3.0 canvas whose body declares a service of type ImageService2."""
    canvas = {
        "width": 2000,
        "height": 3000,
        "items": [{
            "items": [{
                "body": {
                    "id": "https://example.com/image/1/full/max/0/default.jpg",
                    "type": "Image",
                    "service": [{
                        "id": "https://example.com/image/1",
                        "type": "ImageService2",
                        "profile": "level1",
                    }],
                },
            }],
        }],
    }
    svc_url, w, h = _extract_iiif_service(canvas)
    assert svc_url == "https://example.com/image/1"
    assert w == 2000
    # The height was unpacked but never checked; pin it as well.
    assert h == 3000
70
+
71
+
72
def test_extract_iiif3_service_as_dict():
    """The ``service`` field may be a single dict instead of a list."""
    painted_body = {
        "id": "https://example.com/img/full/max/0/default.jpg",
        "service": {
            "id": "https://example.com/img",
            "type": "ImageService3",
        },
    }
    canvas = {
        "width": 1000,
        "height": 1500,
        "items": [{"items": [{"body": painted_body}]}],
    }

    detected = _extract_iiif_service(canvas)[0]

    assert detected == "https://example.com/img"
91
+
92
+
93
def test_extract_iiif3_fallback_url_pattern():
    """Without an explicit service, the Image API pattern in body.id is detected."""
    canvas = {
        "width": 3000,
        "height": 4000,
        "items": [{
            "items": [{
                "body": {
                    "id": "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432314s/f29/full/max/0/default.jpg",
                    "type": "Image",
                    # Deliberately no "service" key.
                },
            }],
        }],
    }
    svc_url, w, h = _extract_iiif_service(canvas)
    assert svc_url == "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432314s/f29"
    assert w == 3000
    # The height was unpacked but never checked; pin it as well.
    assert h == 4000
111
+
112
+
113
def test_extract_iiif3_no_service_no_pattern():
    """Canvas with neither a service nor an Image API URL pattern yields None."""
    static_body = {
        "id": "https://example.com/static/page1.jpg",
        "type": "Image",
    }
    canvas = {
        "width": 800,
        "height": 600,
        "items": [{"items": [{"body": static_body}]}],
    }

    result = _extract_iiif_service(canvas)

    assert result[0] is None
    assert result[1] == 800
    assert result[2] == 600
131
+
132
+
133
+ # ---------------------------------------------------------------------------
134
+ # _extract_iiif_service — IIIF 2.x
135
+ # ---------------------------------------------------------------------------
136
+
137
def test_extract_iiif2_with_service():
    """IIIF 2.x canvas with a service declared on the image resource."""
    canvas = {
        "width": 4000,
        "height": 5000,
        "images": [{
            "resource": {
                "@id": "https://example.com/image/2/full/full/0/default.jpg",
                "service": {
                    "@id": "https://example.com/image/2",
                    "@type": "ImageService2",
                },
            },
        }],
    }
    svc_url, w, h = _extract_iiif_service(canvas)
    assert svc_url == "https://example.com/image/2"
    assert w == 4000
    # The height was unpacked but never checked; pin it as well.
    assert h == 5000
155
+
156
+
157
def test_extract_iiif2_fallback_url_pattern():
    """IIIF 2.x: the service is derived from the pattern in the resource @id."""
    image_resource = {
        "@id": "https://iiif.bodleian.ox.ac.uk/image/abc123/full/full/0/default.jpg",
    }
    canvas = {
        "width": 2500,
        "height": 3500,
        "images": [{"resource": image_resource}],
    }

    detected = _extract_iiif_service(canvas)[0]

    assert detected == "https://iiif.bodleian.ox.ac.uk/image/abc123"
170
+
171
+
172
def test_extract_iiif2_no_service():
    """IIIF 2.x canvas with no service and a static image URL yields None."""
    canvas = {
        "width": 1200,
        "height": 1600,
        "images": [{
            "resource": {
                "@id": "https://example.com/images/scan.png",
            },
        }],
    }
    svc_url, w, h = _extract_iiif_service(canvas)
    assert svc_url is None
    assert w == 1200
    # The height was unpacked but never checked; pin it as well.
    assert h == 1600
186
+
187
+
188
+ # ---------------------------------------------------------------------------
189
+ # _extract_iiif_service — cas limites
190
+ # ---------------------------------------------------------------------------
191
+
192
def test_extract_empty_canvas():
    """An empty canvas yields None for every field without crashing."""
    result = _extract_iiif_service({})

    service_url, width, height = result
    assert service_url is None
    assert width is None
    assert height is None
198
+
199
+
200
def test_extract_service_url_trailing_slash_stripped():
    """The returned service URL must not end with a slash."""
    painted_body = {
        "id": "https://example.com/img/full/max/0/default.jpg",
        "service": [{
            "id": "https://example.com/img/",
            "type": "ImageService3",
        }],
    }
    canvas = {
        "width": 1000,
        "height": 1000,
        "items": [{"items": [{"body": painted_body}]}],
    }

    detected = _extract_iiif_service(canvas)[0]

    assert detected == "https://example.com/img"
    assert not detected.endswith("/")
220
+
221
+
222
+ # ---------------------------------------------------------------------------
223
+ # _detect_iiif_service_from_url — détection depuis URL directe
224
+ # ---------------------------------------------------------------------------
225
+
226
def test_detect_from_gallica_url():
    """A complete Gallica image URL yields the deduced service URL."""
    detected = _detect_iiif_service_from_url(
        "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432314s/f29/full/max/0/default.jpg"
    )
    assert detected == "https://gallica.bnf.fr/iiif/ark:/12148/btv1b8432314s/f29"
230
+
231
+
232
def test_detect_from_iiif_url_with_size():
    """A URL with an explicit size segment still yields the service URL."""
    detected = _detect_iiif_service_from_url(
        "https://example.com/iiif/img1/full/!1500,1500/0/default.jpg"
    )
    assert detected == "https://example.com/iiif/img1"
236
+
237
+
238
def test_detect_from_static_url_returns_none():
    """A static URL (no IIIF pattern) yields None."""
    detected = _detect_iiif_service_from_url("https://example.com/images/page1.jpg")
    assert detected is None
242
+
243
+
244
def test_detect_from_iiif_url_different_format():
    """A URL requesting PNG instead of JPEG is detected as well."""
    detected = _detect_iiif_service_from_url(
        "https://example.com/iiif/img2/full/max/0/default.png"
    )
    assert detected == "https://example.com/iiif/img2"