# IIIF-Studio / backend / tests / test_export_mets.py
# Claude
# rename: Scriptorium AI → IIIF Studio across entire codebase
# e0fd571 unverified
"""
Tests du générateur METS v1.12 (Sprint 3 — Session B).
Vérifie :
- XML produit est valide (parseable par lxml)
- 6 sections obligatoires présentes : metsHdr, dmdSec, amdSec, fileSec,
structMap PHYSICAL, structMap LOGICAL
- fileSec : 3 fileGrp (master, derivative_web, alto), 3 fichiers par page
- structMap PHYSICAL : ordre respecte PageMaster.sequence (pas l'ordre de la liste)
- structMap LOGICAL : 1 seul div TYPE="manuscript"
- Chemins ALTO construits depuis corpus_slug + folio_label
- Métadonnées Dublin Core présentes dans dmdSec
- amdSec : techMD global avec model_id du premier master.processing
- manuscript_id, label, corpus_slug obligatoires → ValueError sinon
- Liste vide → ValueError explicite
- Scénarios réalistes : Beatus HR + BR (1 manuscrit), Grandes Chroniques (autre)
"""
# 1. stdlib
import json
from datetime import datetime, timezone
from pathlib import Path
# 2. third-party
import pytest
from lxml import etree
# 3. local
from app.schemas.page_master import EditorialInfo, EditorialStatus, OCRResult, PageMaster, ProcessingInfo
from app.services.export.mets import generate_mets, write_mets
# ── Namespaces ────────────────────────────────────────────────────────────────
_METS_NS = "http://www.loc.gov/METS/"
_DC_NS = "http://purl.org/dc/elements/1.1/"
_XLINK_NS = "http://www.w3.org/1999/xlink"
_NS = {"m": _METS_NS, "dc": _DC_NS, "xlink": _XLINK_NS}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _parse(xml_str: str) -> etree._Element:
    """Parse an XML document held in a ``str`` and return its root element."""
    # The text is encoded to UTF-8 bytes before being handed to lxml.
    payload = xml_str.encode("utf-8")
    return etree.fromstring(payload)
def _xp(root: etree._Element, path: str) -> list:
    """Evaluate *path* on *root* with the module-level prefix map ``_NS``."""
    matches = root.xpath(path, namespaces=_NS)
    return matches
def _one(root: etree._Element, path: str) -> etree._Element:
    """Return the only node matched by *path*.

    Fails the calling test with an AssertionError when the XPath matches
    zero nodes or more than one.
    """
    found = _xp(root, path)
    assert len(found) == 1, f"Expected 1 for {path!r}, got {len(found)}: {found}"
    (only,) = found
    return only
def _make_page(
    page_id: str,
    folio_label: str,
    sequence: int,
    original_url: str = "",
    derivative_web: str = "",
    with_processing: bool = False,
    ocr_text: str = "",
) -> PageMaster:
    """Build a minimal ``PageMaster`` for the METS tests.

    Args:
        page_id: Identifier given to the page.
        folio_label: Folio label; also used to derive placeholder paths.
        sequence: Sequence number of the page.
        original_url: Master image location; an empty value falls back to
            an ``https://example.com/`` placeholder built from the label.
        derivative_web: Web derivative location; an empty value falls back
            to a ``/data/deriv/`` placeholder built from the label.
        with_processing: Attach a fixed ``ProcessingInfo`` when true.
        ocr_text: Attach an ``OCRResult`` with this text when non-empty.
    """
    master_location = original_url if original_url else f"https://example.com/{folio_label}.jpg"
    web_location = derivative_web if derivative_web else f"/data/deriv/{folio_label}.jpg"
    image_info = {
        "master": master_location,
        "derivative_web": web_location,
        "thumbnail": f"/data/thumb/{folio_label}.jpg",
        "width": 1500,
        "height": 2000,
    }

    if with_processing:
        processing_info = ProcessingInfo(
            provider="google_ai_studio",
            model_id="gemini-2.0-flash",
            model_display_name="Gemini 2.0 Flash",
            prompt_version="prompts/medieval-illuminated/primary_v1.txt",
            raw_response_path=f"/data/corpora/test/pages/{folio_label}/ai_raw.json",
            processed_at=datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc),
        )
    else:
        processing_info = None

    if ocr_text:
        ocr_result = OCRResult(diplomatic_text=ocr_text, language="la", confidence=0.90)
    else:
        ocr_result = None

    return PageMaster(
        page_id=page_id,
        corpus_profile="medieval-illuminated",
        manuscript_id="ms-test",
        folio_label=folio_label,
        sequence=sequence,
        image=image_info,
        layout={"regions": []},
        ocr=ocr_result,
        processing=processing_info,
        editorial=EditorialInfo(status=EditorialStatus.MACHINE_DRAFT),
    )
def _base_meta(corpus_slug: str = "test-ms", label: str = "Test Manuscript") -> dict:
return {
"manuscript_id": "ms-test-001",
"label": label,
"corpus_slug": corpus_slug,
}
# ── Fixtures réalistes (3 PageMaster du Sprint 2) ───────────────────────────
@pytest.fixture
def beatus_pages():
    """Two Beatus pages (HR + BR): same manuscript, two folios."""
    high_res = _make_page(
        page_id="beatus-lat8878-hr-f233",
        folio_label="f233-hr",
        sequence=233,
        original_url="https://gallica.bnf.fr/iiif/ark:/12148/btv1b52505441p/f233/full/full/0/native.jpg",
        derivative_web="/data/corpora/beatus-lat8878/derivatives/f233-hr.jpg",
        with_processing=True,
        ocr_text="Incipit explanatio",
    )
    low_res = _make_page(
        page_id="beatus-lat8878-br-f233",
        folio_label="f233-br",
        sequence=234,
        original_url="https://gallica.bnf.fr/iiif/ark:/12148/btv1b52505441p/f233/full/600,/0/native.jpg",
        derivative_web="/data/corpora/beatus-lat8878/derivatives/f233-br.jpg",
        with_processing=False,
    )
    return [high_res, low_res]
@pytest.fixture
def beatus_meta():
    """Full metadata for the Beatus scenario: mandatory plus optional keys."""
    mandatory = {
        "manuscript_id": "BnF-Latin-8878",
        "label": "Beatus de Saint-Sever",
        "corpus_slug": "beatus-lat8878",
    }
    optional = {
        "language": "la",
        "repository": "Bibliothèque nationale de France",
        "shelfmark": "Latin 8878",
        "date_label": "XIe siècle",
        "institution": "BnF",
    }
    return {**mandatory, **optional}
@pytest.fixture
def chroniques_pages():
    """Single page (f16) of the Grandes Chroniques, with processing and OCR."""
    folio_16 = _make_page(
        page_id="chroniques-btv1b84472995-f16",
        folio_label="f16",
        sequence=16,
        original_url="https://gallica.bnf.fr/iiif/ark:/12148/btv1b84472995/f16/full/full/0/native.jpg",
        derivative_web="/data/corpora/grandes-chroniques/derivatives/f16.jpg",
        with_processing=True,
        ocr_text="Cy commence le prologue",
    )
    return [folio_16]
@pytest.fixture
def chroniques_meta():
    """Metadata for the Grandes Chroniques scenario (language ``fr``)."""
    meta = {}
    meta["manuscript_id"] = "BnF-btv1b84472995"
    meta["label"] = "Grandes Chroniques de France"
    meta["corpus_slug"] = "grandes-chroniques"
    meta["language"] = "fr"
    meta["repository"] = "Bibliothèque nationale de France"
    return meta
# ---------------------------------------------------------------------------
# Tests — validité XML
# ---------------------------------------------------------------------------
def test_generate_mets_returns_string(beatus_pages, beatus_meta):
    """generate_mets hands the document back as a ``str``."""
    document = generate_mets(beatus_pages, beatus_meta)
    assert isinstance(document, str)
def test_generate_mets_valid_xml(beatus_pages, beatus_meta):
    """The generated document can be parsed by lxml."""
    parsed_root = _parse(generate_mets(beatus_pages, beatus_meta))
    assert parsed_root is not None
def test_generate_mets_xml_declaration(beatus_pages, beatus_meta):
    """The output begins with an XML declaration."""
    declaration_prefix = "<?xml"
    document = generate_mets(beatus_pages, beatus_meta)
    assert document.startswith(declaration_prefix)
def test_generate_mets_namespace(beatus_pages, beatus_meta):
    """The root element is ``mets`` in the METS namespace."""
    expected_tag = "{%s}mets" % _METS_NS
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    assert mets_root.tag == expected_tag
def test_generate_mets_objid(beatus_pages, beatus_meta):
    """The root OBJID attribute carries the manuscript_id."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    identifier = mets_root.get("OBJID")
    assert identifier is not None, "OBJID attribute absent du root mets"
    assert identifier == "BnF-Latin-8878"
def test_generate_mets_label(beatus_pages, beatus_meta):
    """The root LABEL attribute carries the manuscript label."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    label = mets_root.get("LABEL")
    assert label == "Beatus de Saint-Sever"
def test_generate_mets_type_manuscript(beatus_pages, beatus_meta):
    """The root TYPE attribute is "Manuscript"."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    declared_type = mets_root.get("TYPE")
    assert declared_type == "Manuscript"
# ---------------------------------------------------------------------------
# Tests — 6 sections obligatoires présentes
# ---------------------------------------------------------------------------
def test_mets_has_metsHdr(beatus_pages, beatus_meta):
    """Exactly one metsHdr sits directly under the root."""
    document = generate_mets(beatus_pages, beatus_meta)
    mets_root = _parse(document)
    # _one asserts that the path matches exactly one node.
    _one(mets_root, "m:metsHdr")
def test_mets_has_dmdSec(beatus_pages, beatus_meta):
    """Exactly one dmdSec sits directly under the root."""
    document = generate_mets(beatus_pages, beatus_meta)
    mets_root = _parse(document)
    # _one asserts that the path matches exactly one node.
    _one(mets_root, "m:dmdSec")
def test_mets_has_amdSec(beatus_pages, beatus_meta):
    """Exactly one amdSec sits directly under the root."""
    document = generate_mets(beatus_pages, beatus_meta)
    mets_root = _parse(document)
    # _one asserts that the path matches exactly one node.
    _one(mets_root, "m:amdSec")
def test_mets_has_fileSec(beatus_pages, beatus_meta):
    """Exactly one fileSec sits directly under the root."""
    document = generate_mets(beatus_pages, beatus_meta)
    mets_root = _parse(document)
    # _one asserts that the path matches exactly one node.
    _one(mets_root, "m:fileSec")
def test_mets_has_structMap_physical(beatus_pages, beatus_meta):
    """Exactly one structMap with TYPE="PHYSICAL" is present."""
    document = generate_mets(beatus_pages, beatus_meta)
    mets_root = _parse(document)
    # _one asserts that the path matches exactly one node.
    _one(mets_root, "m:structMap[@TYPE='PHYSICAL']")
def test_mets_has_structMap_logical(beatus_pages, beatus_meta):
    """Exactly one structMap with TYPE="LOGICAL" is present."""
    document = generate_mets(beatus_pages, beatus_meta)
    mets_root = _parse(document)
    # _one asserts that the path matches exactly one node.
    _one(mets_root, "m:structMap[@TYPE='LOGICAL']")
# ---------------------------------------------------------------------------
# Tests — metsHdr
# ---------------------------------------------------------------------------
def test_metsHdr_agent_creator(beatus_pages, beatus_meta):
    """The CREATOR agent of metsHdr is named "IIIF Studio"."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    creator = _one(mets_root, "m:metsHdr/m:agent[@ROLE='CREATOR']")
    creator_name = _one(creator, "m:name").text
    assert creator_name == "IIIF Studio"
def test_metsHdr_has_createdate(beatus_pages, beatus_meta):
    """metsHdr carries a CREATEDATE attribute."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    create_date = _one(mets_root, "m:metsHdr").get("CREATEDATE")
    assert create_date is not None
# ---------------------------------------------------------------------------
# Tests — dmdSec (Dublin Core)
# ---------------------------------------------------------------------------
def test_dmdSec_has_DC_title(beatus_pages, beatus_meta):
    """dmdSec holds one dc:title equal to the manuscript label."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    title_text = _one(mets_root, "m:dmdSec//dc:title").text
    assert title_text == "Beatus de Saint-Sever"
def test_dmdSec_has_DC_identifier(beatus_pages, beatus_meta):
    """dmdSec holds one dc:identifier equal to the manuscript_id."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    identifier_text = _one(mets_root, "m:dmdSec//dc:identifier").text
    assert identifier_text == "BnF-Latin-8878"
def test_dmdSec_has_DC_language(beatus_pages, beatus_meta):
    """dmdSec holds one dc:language equal to the supplied language."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    language_text = _one(mets_root, "m:dmdSec//dc:language").text
    assert language_text == "la"
def test_dmdSec_optional_fields(beatus_pages, beatus_meta):
    """Optional metadata surfaces as dc:source and dc:date in dmdSec."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))

    # The repository value is expected in dc:source.
    dc_source = _one(mets_root, "m:dmdSec//dc:source")
    assert "nationale de France" in dc_source.text

    # The date_label value is expected in dc:date.
    dc_date = _one(mets_root, "m:dmdSec//dc:date")
    assert "XIe" in dc_date.text
def test_dmdSec_no_optional_fields_when_absent():
    """No dc:language is emitted when the metadata has no language."""
    minimal_meta = {"manuscript_id": "ms-001", "label": "Test", "corpus_slug": "test"}
    single_page = [_make_page("ms-0001r", "0001r", 1)]
    mets_root = _parse(generate_mets(single_page, minimal_meta))
    language_nodes = _xp(mets_root, "m:dmdSec//dc:language")
    assert len(language_nodes) == 0
def test_dmdSec_mdtype_is_dc(beatus_pages, beatus_meta):
    """The mdWrap inside dmdSec declares MDTYPE="DC"."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    md_type = _one(mets_root, "m:dmdSec/m:mdWrap").get("MDTYPE")
    assert md_type == "DC"
# ---------------------------------------------------------------------------
# Tests — amdSec (techMD global)
# ---------------------------------------------------------------------------
def test_amdSec_has_techMD(beatus_pages, beatus_meta):
    """amdSec contains exactly one techMD child."""
    document = generate_mets(beatus_pages, beatus_meta)
    mets_root = _parse(document)
    # _one asserts that the path matches exactly one node.
    _one(mets_root, "m:amdSec/m:techMD")
def test_amdSec_techMD_contains_model_id(beatus_pages, beatus_meta):
    """The model_id of the processed page shows up inside amdSec."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    amd_sec = _one(mets_root, "m:amdSec")
    # Search the serialized section, so the exact techMD markup is not pinned.
    serialized = etree.tostring(amd_sec, encoding="unicode")
    assert "gemini-2.0-flash" in serialized
def test_amdSec_techMD_page_count(beatus_pages, beatus_meta):
    """The serialized amdSec mentions the page count (2 pages)."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    amd_sec = _one(mets_root, "m:amdSec")
    serialized = etree.tostring(amd_sec, encoding="unicode")
    # NOTE(review): this substring check is weak. The model id
    # "gemini-2.0-flash", which another test expects inside amdSec, already
    # contains "2". Tighten it once the page-count markup is known.
    assert "2" in serialized  # 2 pages
def test_amdSec_techMD_no_processing_still_present():
    """amdSec/techMD is emitted even when no page has processing info."""
    minimal_meta = {"manuscript_id": "ms-001", "label": "Test", "corpus_slug": "test"}
    unprocessed = _make_page("ms-0001r", "0001r", 1, with_processing=False)
    mets_root = _parse(generate_mets([unprocessed], minimal_meta))
    _one(mets_root, "m:amdSec/m:techMD")
# ---------------------------------------------------------------------------
# Tests — fileSec (3 fileGrp, 3 fichiers par page)
# ---------------------------------------------------------------------------
def test_fileSec_has_three_fileGrp(beatus_pages, beatus_meta):
    """fileSec holds exactly three fileGrp children."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    group_count = len(_xp(mets_root, "m:fileSec/m:fileGrp"))
    assert group_count == 3
def test_fileSec_fileGrp_USE_values(beatus_pages, beatus_meta):
    """The fileGrp USE attributes are master, derivative_web and alto."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    declared_uses = set()
    for group in _xp(mets_root, "m:fileSec/m:fileGrp"):
        declared_uses.add(group.get("USE"))
    assert declared_uses == {"master", "derivative_web", "alto"}
def test_fileSec_master_fileGrp_count(beatus_pages, beatus_meta):
    """Two pages yield two files in the master fileGrp."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    master_files = _xp(mets_root, "m:fileSec/m:fileGrp[@USE='master']/m:file")
    file_count = len(master_files)
    assert file_count == 2
def test_fileSec_derivative_fileGrp_count(beatus_pages, beatus_meta):
    """Two pages yield two files in the derivative_web fileGrp."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    derivative_files = _xp(mets_root, "m:fileSec/m:fileGrp[@USE='derivative_web']/m:file")
    file_count = len(derivative_files)
    assert file_count == 2
def test_fileSec_alto_fileGrp_count(beatus_pages, beatus_meta):
    """Two pages yield two files in the alto fileGrp."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    alto_files = _xp(mets_root, "m:fileSec/m:fileGrp[@USE='alto']/m:file")
    file_count = len(alto_files)
    assert file_count == 2
def test_fileSec_master_url_is_original_url(beatus_pages, beatus_meta):
    """Master FLocat hrefs are the original image URLs of the pages."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    href_attr = f"{{{_XLINK_NS}}}href"
    master_hrefs = set()
    for locator in _xp(mets_root, "m:fileSec/m:fileGrp[@USE='master']/m:file/m:FLocat"):
        master_hrefs.add(locator.get(href_attr))
    expected_urls = (
        "https://gallica.bnf.fr/iiif/ark:/12148/btv1b52505441p/f233/full/full/0/native.jpg",
        "https://gallica.bnf.fr/iiif/ark:/12148/btv1b52505441p/f233/full/600,/0/native.jpg",
    )
    for url in expected_urls:
        assert url in master_hrefs
def test_fileSec_master_loctype_url(beatus_pages, beatus_meta):
    """Every master FLocat declares LOCTYPE="URL"."""
    root = _parse(generate_mets(beatus_pages, beatus_meta))
    flocats = _xp(root, "m:fileSec/m:fileGrp[@USE='master']/m:file/m:FLocat")
    # Without this guard the loop below would pass vacuously when the XPath
    # matches nothing.
    assert flocats, "No FLocat found in the master fileGrp"
    for fl in flocats:
        assert fl.get("LOCTYPE") == "URL"
def test_fileSec_alto_path_contains_corpus_slug(beatus_pages, beatus_meta):
    """Every ALTO href references the corpus_slug of the manuscript."""
    root = _parse(generate_mets(beatus_pages, beatus_meta))
    flocats = _xp(root, "m:fileSec/m:fileGrp[@USE='alto']/m:file/m:FLocat")
    # Without this guard the loop below would pass vacuously when the XPath
    # matches nothing.
    assert flocats, "No FLocat found in the alto fileGrp"
    for fl in flocats:
        href = fl.get(f"{{{_XLINK_NS}}}href", "")
        assert "beatus-lat8878" in href
def test_fileSec_alto_path_contains_folio_label(beatus_pages, beatus_meta):
    """Each folio_label appears in at least one ALTO href."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    href_attr = f"{{{_XLINK_NS}}}href"
    alto_hrefs = []
    for locator in _xp(mets_root, "m:fileSec/m:fileGrp[@USE='alto']/m:file/m:FLocat"):
        alto_hrefs.append(locator.get(href_attr, ""))
    for folio_label in ("f233-hr", "f233-br"):
        assert any(folio_label in href for href in alto_hrefs)
def test_fileSec_alto_path_ends_with_alto_xml(beatus_pages, beatus_meta):
    """Every ALTO href ends with ``alto.xml``."""
    root = _parse(generate_mets(beatus_pages, beatus_meta))
    flocats = _xp(root, "m:fileSec/m:fileGrp[@USE='alto']/m:file/m:FLocat")
    # Without this guard the loop below would pass vacuously when the XPath
    # matches nothing.
    assert flocats, "No FLocat found in the alto fileGrp"
    for fl in flocats:
        href = fl.get(f"{{{_XLINK_NS}}}href", "")
        assert href.endswith("alto.xml"), f"ALTO href ne finit pas par alto.xml : {href}"
def test_fileSec_alto_mimetype(beatus_pages, beatus_meta):
    """Every file of the alto fileGrp declares MIMETYPE="text/xml"."""
    root = _parse(generate_mets(beatus_pages, beatus_meta))
    files = _xp(root, "m:fileSec/m:fileGrp[@USE='alto']/m:file")
    # Without this guard the loop below would pass vacuously when the XPath
    # matches nothing.
    assert files, "No file found in the alto fileGrp"
    for f in files:
        assert f.get("MIMETYPE") == "text/xml"
def test_fileSec_file_ids_are_unique(beatus_pages, beatus_meta):
    """Every file in fileSec carries an ID, and no ID is used twice."""
    root = _parse(generate_mets(beatus_pages, beatus_meta))
    all_files = _xp(root, "m:fileSec//m:file")
    ids = [f.get("ID") for f in all_files]
    # An empty list is trivially duplicate-free, so require at least one file.
    assert ids, "No file found in fileSec"
    # A lone missing ID (None) would otherwise slip through the uniqueness check.
    assert None not in ids, f"File without ID attribute : {ids}"
    assert len(ids) == len(set(ids)), f"IDs de fichiers en double : {ids}"
# ---------------------------------------------------------------------------
# Tests — structMap PHYSICAL (séquence)
# ---------------------------------------------------------------------------
def test_structMap_physical_div_count(beatus_pages, beatus_meta):
    """Two pages yield two TYPE="page" divs in the physical structMap."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    page_divs = _xp(mets_root, "m:structMap[@TYPE='PHYSICAL']//m:div[@TYPE='page']")
    div_count = len(page_divs)
    assert div_count == 2
def test_structMap_physical_order_respects_sequence():
    """Page divs follow the sequence numbers even if the input list does not."""
    # Deliberately out of order: 3, 1, 2.
    unordered = (("f003r", 3), ("f001r", 1), ("f002r", 2))
    pages = [_make_page(f"ms-{folio}", folio, sequence=seq) for folio, seq in unordered]
    minimal_meta = {"manuscript_id": "ms-001", "label": "Test", "corpus_slug": "test"}
    mets_root = _parse(generate_mets(pages, minimal_meta))
    emitted_orders = []
    for page_div in _xp(mets_root, "m:structMap[@TYPE='PHYSICAL']//m:div[@TYPE='page']"):
        emitted_orders.append(int(page_div.get("ORDER")))
    assert emitted_orders == [1, 2, 3]
def test_structMap_physical_order_values(beatus_pages, beatus_meta):
    """ORDER attributes carry the page sequence numbers (233 and 234)."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    page_divs = _xp(mets_root, "m:structMap[@TYPE='PHYSICAL']//m:div[@TYPE='page']")
    order_values = [int(page_div.get("ORDER")) for page_div in page_divs]
    order_values.sort()
    assert order_values == [233, 234]
def test_structMap_physical_orderlabel(beatus_pages, beatus_meta):
    """ORDERLABEL attributes carry the folio labels of the pages."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    order_labels = set()
    for page_div in _xp(mets_root, "m:structMap[@TYPE='PHYSICAL']//m:div[@TYPE='page']"):
        order_labels.add(page_div.get("ORDERLABEL"))
    for folio_label in ("f233-hr", "f233-br"):
        assert folio_label in order_labels
def test_structMap_physical_each_page_has_3_fptr(beatus_pages, beatus_meta):
    """Each page div has exactly 3 fptr children (master + deriv + alto)."""
    root = _parse(generate_mets(beatus_pages, beatus_meta))
    divs = _xp(root, "m:structMap[@TYPE='PHYSICAL']//m:div[@TYPE='page']")
    # Without this guard the loop below would pass vacuously when the XPath
    # matches nothing.
    assert divs, "No page div found in the PHYSICAL structMap"
    for div in divs:
        fptrs = div.findall(f"{{{_METS_NS}}}fptr")
        assert len(fptrs) == 3, (
            f"Div {div.get('ORDERLABEL')} a {len(fptrs)} fptr, attendu 3"
        )
def test_structMap_physical_fptr_fileids_exist_in_fileSec(beatus_pages, beatus_meta):
    """Every fptr FILEID in the physical structMap names a file of fileSec."""
    root = _parse(generate_mets(beatus_pages, beatus_meta))
    file_ids = {f.get("ID") for f in _xp(root, "m:fileSec//m:file")}
    fptr_ids = {f.get("FILEID") for f in _xp(root, "m:structMap[@TYPE='PHYSICAL']//m:fptr")}
    # The empty set is a subset of anything: require at least one fptr so the
    # subset check below cannot succeed vacuously.
    assert fptr_ids, "No fptr found in the PHYSICAL structMap"
    assert fptr_ids.issubset(file_ids), f"FILEID manquants : {fptr_ids - file_ids}"
def test_structMap_physical_fptr_references_correct_files():
    """The fptr FILEIDs of a page are built from that page's own page_id."""
    meta = {"manuscript_id": "ms-x", "label": "X", "corpus_slug": "corpus-x"}
    only_page = _make_page("my-ms-0042v", "0042v", sequence=42)
    mets_root = _parse(generate_mets([only_page], meta))
    page_div = _one(mets_root, "m:structMap[@TYPE='PHYSICAL']//m:div[@TYPE='page']")
    referenced_ids = set()
    for pointer in page_div.findall(f"{{{_METS_NS}}}fptr"):
        referenced_ids.add(pointer.get("FILEID"))
    # Expected IDs embed the page_id with "-" replaced by "_".
    safe_page_id = "my_ms_0042v"
    for prefix in ("IMG_MASTER_", "IMG_DERIV_", "ALTO_"):
        assert f"{prefix}{safe_page_id}" in referenced_ids
# ---------------------------------------------------------------------------
# Tests — structMap LOGICAL
# ---------------------------------------------------------------------------
def test_structMap_logical_has_one_manuscript_div(beatus_pages, beatus_meta):
    """The logical structMap has a single top-level TYPE="manuscript" div."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    manuscript_divs = _xp(mets_root, "m:structMap[@TYPE='LOGICAL']/m:div[@TYPE='manuscript']")
    div_count = len(manuscript_divs)
    assert div_count == 1
def test_structMap_logical_div_label(beatus_pages, beatus_meta):
    """The manuscript div of the logical structMap carries the label."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    manuscript_div = _one(mets_root, "m:structMap[@TYPE='LOGICAL']/m:div[@TYPE='manuscript']")
    div_label = manuscript_div.get("LABEL")
    assert div_label == "Beatus de Saint-Sever"
def test_structMap_logical_div_dmdid(beatus_pages, beatus_meta):
    """The manuscript div of the logical structMap has DMDID="DMD_1"."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    manuscript_div = _one(mets_root, "m:structMap[@TYPE='LOGICAL']/m:div[@TYPE='manuscript']")
    dmd_reference = manuscript_div.get("DMDID")
    assert dmd_reference == "DMD_1"
def test_structMap_logical_no_page_divs(beatus_pages, beatus_meta):
    """MVP: the logical structMap contains no TYPE="page" div at any depth."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))
    nested_pages = _xp(mets_root, "m:structMap[@TYPE='LOGICAL']//m:div[@TYPE='page']")
    page_count = len(nested_pages)
    assert page_count == 0
# ---------------------------------------------------------------------------
# Tests — erreurs explicites
# ---------------------------------------------------------------------------
def test_empty_masters_raises_value_error():
    """An empty page list raises a ValueError whose message matches "vide"."""
    no_pages = []
    valid_meta = {"manuscript_id": "ms-x", "label": "X", "corpus_slug": "x"}
    with pytest.raises(ValueError, match="vide"):
        generate_mets(no_pages, valid_meta)
def test_missing_manuscript_id_raises():
    """Metadata without manuscript_id raises a ValueError naming that key."""
    incomplete_meta = {"label": "X", "corpus_slug": "x"}
    single_page = [_make_page("ms-0001r", "0001r", 1)]
    with pytest.raises(ValueError, match="manuscript_id"):
        generate_mets(single_page, incomplete_meta)
def test_missing_label_raises():
    """Metadata without label raises a ValueError naming that key."""
    incomplete_meta = {"manuscript_id": "ms-x", "corpus_slug": "x"}
    single_page = [_make_page("ms-0001r", "0001r", 1)]
    with pytest.raises(ValueError, match="label"):
        generate_mets(single_page, incomplete_meta)
def test_missing_corpus_slug_raises():
    """Metadata without corpus_slug raises a ValueError naming that key."""
    incomplete_meta = {"manuscript_id": "ms-x", "label": "X"}
    single_page = [_make_page("ms-0001r", "0001r", 1)]
    with pytest.raises(ValueError, match="corpus_slug"):
        generate_mets(single_page, incomplete_meta)
def test_empty_string_corpus_slug_raises():
    """An empty-string corpus_slug is rejected like a missing one."""
    blank_slug_meta = {"manuscript_id": "ms-x", "label": "X", "corpus_slug": ""}
    single_page = [_make_page("ms-0001r", "0001r", 1)]
    with pytest.raises(ValueError, match="corpus_slug"):
        generate_mets(single_page, blank_slug_meta)
# ---------------------------------------------------------------------------
# Tests — write_mets
# ---------------------------------------------------------------------------
def test_write_mets_creates_file(tmp_path, beatus_pages, beatus_meta):
    """After write_mets, corpora/<slug>/mets.xml exists under the base dir."""
    document = generate_mets(beatus_pages, beatus_meta)
    write_mets(document, "beatus-lat8878", base_data_dir=tmp_path)
    target = tmp_path.joinpath("corpora", "beatus-lat8878", "mets.xml")
    assert target.exists()
def test_write_mets_creates_parent_dirs(tmp_path, beatus_pages, beatus_meta):
    """After write_mets, the corpora/<slug> directory exists."""
    document = generate_mets(beatus_pages, beatus_meta)
    write_mets(document, "beatus-lat8878", base_data_dir=tmp_path)
    corpus_dir = tmp_path.joinpath("corpora", "beatus-lat8878")
    assert corpus_dir.is_dir()
def test_write_mets_content_is_valid_xml(tmp_path, beatus_pages, beatus_meta):
    """The file written by write_mets parses back to a METS root element."""
    document = generate_mets(beatus_pages, beatus_meta)
    write_mets(document, "beatus-lat8878", base_data_dir=tmp_path)
    written = tmp_path.joinpath("corpora", "beatus-lat8878", "mets.xml")
    tree = etree.parse(str(written))
    assert tree.getroot().tag == f"{{{_METS_NS}}}mets"
# ---------------------------------------------------------------------------
# Tests — scénarios réalistes Sprint 2
# ---------------------------------------------------------------------------
def test_beatus_full_mets(beatus_pages, beatus_meta):
    """Full Beatus scenario (HR + BR): 2 pages with complete metadata."""
    mets_root = _parse(generate_mets(beatus_pages, beatus_meta))

    # Identity
    assert mets_root.get("OBJID") == "BnF-Latin-8878"

    # Two pages in fileSec
    master_files = _xp(mets_root, "m:fileSec/m:fileGrp[@USE='master']/m:file")
    assert len(master_files) == 2

    # Physical sequence
    page_divs = _xp(mets_root, "m:structMap[@TYPE='PHYSICAL']//m:div[@TYPE='page']")
    order_values = [int(page_div.get("ORDER")) for page_div in page_divs]
    order_values.sort()
    assert order_values == [233, 234]

    # Gallica URLs present
    href_attr = f"{{{_XLINK_NS}}}href"
    master_hrefs = []
    for locator in _xp(mets_root, "m:fileSec/m:fileGrp[@USE='master']/m:file/m:FLocat"):
        master_hrefs.append(locator.get(href_attr, ""))
    assert any("btv1b52505441p" in href for href in master_hrefs)

    # Shelfmark in Dublin Core
    relations = _xp(mets_root, "m:dmdSec//dc:relation")
    assert len(relations) == 1
def test_grandes_chroniques_mets(chroniques_pages, chroniques_meta):
    """Grandes Chroniques scenario: 1 page, language ``fr``."""
    mets_root = _parse(generate_mets(chroniques_pages, chroniques_meta))
    assert mets_root.get("OBJID") == "BnF-btv1b84472995"

    language_text = _one(mets_root, "m:dmdSec//dc:language").text
    assert language_text == "fr"

    # One file per fileGrp
    for use in ("master", "derivative_web", "alto"):
        group_files = _xp(mets_root, f"m:fileSec/m:fileGrp[@USE='{use}']/m:file")
        assert len(group_files) == 1, f"fileGrp[@USE='{use}'] : attendu 1, obtenu {len(group_files)}"

    # The ALTO path mentions both the corpus slug and the folio label.
    alto_locators = _xp(mets_root, "m:fileSec/m:fileGrp[@USE='alto']/m:file/m:FLocat")
    alto_href = alto_locators[0].get(f"{{{_XLINK_NS}}}href", "")
    assert "grandes-chroniques" in alto_href
    assert "f16" in alto_href
def test_single_page_manuscript():
    """A manuscript with a single page is accepted and yields one page div."""
    solo_meta = {"manuscript_id": "ms-solo", "label": "Solo MS", "corpus_slug": "solo"}
    only_page = _make_page("ms-0001r", "0001r", 1)
    mets_root = _parse(generate_mets([only_page], solo_meta))
    page_divs = _xp(mets_root, "m:structMap[@TYPE='PHYSICAL']//m:div[@TYPE='page']")
    div_count = len(page_divs)
    assert div_count == 1
def test_many_pages_sequence_order():
    """Ten pages supplied out of order come out in ascending ORDER."""
    # A fixed permutation replaces the former unseeded random.shuffle: the
    # input is identical on every run, so a failure is reproducible, and it
    # is guaranteed not to be already sorted.
    scrambled = [7, 3, 10, 1, 5, 9, 2, 8, 4, 6]
    pages = [_make_page(f"ms-f{i:03d}r", f"f{i:03d}r", i) for i in scrambled]
    meta = {"manuscript_id": "ms-big", "label": "Big MS", "corpus_slug": "big-ms"}
    root = _parse(generate_mets(pages, meta))
    divs = _xp(root, "m:structMap[@TYPE='PHYSICAL']//m:div[@TYPE='page']")
    orders = [int(d.get("ORDER")) for d in divs]
    assert orders == list(range(1, 11))
def test_alto_path_uses_base_data_dir(tmp_path):
    """The ALTO href in fileSec contains the base_data_dir passed in."""
    meta = {"manuscript_id": "ms-x", "label": "X", "corpus_slug": "corp-x"}
    only_page = _make_page("ms-0001r", "0001r", 1)
    document = generate_mets([only_page], meta, base_data_dir=tmp_path)
    alto_locator = _one(_parse(document), "m:fileSec/m:fileGrp[@USE='alto']/m:file/m:FLocat")
    alto_href = alto_locator.get(f"{{{_XLINK_NS}}}href", "")
    base_dir_text = str(tmp_path)
    assert base_dir_text in alto_href