Claude committed on
Commit
9097545
·
unverified ·
1 Parent(s): 4c0517f

fix: comprehensive repo audit — 15 issues fixed

Browse files

Critical:
- Fix path mismatch between master.json write (folio_label) and read (page.id)
in pages.py, export.py — API returned 404 after pipeline analysis
- Fix path traversal using is_relative_to() instead of string comparison
in job_runner.py and main.py

Search & performance:
- Search now uses SQL LIKE on pre-normalized column instead of loading all
rows into Python (O(n) → SQL-filtered)
- Add normalized_text column + migration in main.py

Frontend:
- get() helper now extracts FastAPI error detail like post/put/del
- getCorpusModel() uses get() wrapper instead of raw fetch
- Extract duplicate STATUS_LABELS/VARIANTS into shared editorial.ts
- Add 'claimed' to JobStatus type

Backend cleanup:
- Remove dead iiif_base field from ImageInfo schema
- Remove dead ParseError import from analyzer.py
- Narrow catch-all Exception to specific types in response_parser.py
- Fix misleading return type hints in models_api.py endpoints
- Make AI concurrency configurable via AI_MAX_CONCURRENT env var
- Add limit/offset to history endpoint

CI/CD:
- Add frontend vitest to GitHub Actions workflow
- Replace hardcoded HuggingFace username with configurable vars

https://claude.ai/code/session_01NuG9pRMcEHDi4SsKtEHoCj

.github/workflows/deploy-hf.yml CHANGED
@@ -32,8 +32,10 @@ jobs:
32
  - name: Push to HuggingFace Space
33
  env:
34
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
 
 
35
  run: |
36
  git config user.email "github-actions[bot]@users.noreply.github.com"
37
  git config user.name "github-actions[bot]"
38
- git remote add hf https://Ma-Ri-Ba-Ku:${HF_TOKEN}@huggingface.co/spaces/Ma-Ri-Ba-Ku/IIIF-Studio
39
  git push hf main --force
 
32
  - name: Push to HuggingFace Space
33
  env:
34
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
35
+ HF_USERNAME: ${{ vars.HF_USERNAME || 'Ma-Ri-Ba-Ku' }}
36
+ HF_SPACE: ${{ vars.HF_SPACE || 'IIIF-Studio' }}
37
  run: |
38
  git config user.email "github-actions[bot]@users.noreply.github.com"
39
  git config user.name "github-actions[bot]"
40
+ git remote add hf "https://${HF_USERNAME}:${HF_TOKEN}@huggingface.co/spaces/${HF_USERNAME}/${HF_SPACE}"
41
  git push hf main --force
.github/workflows/tests.yml CHANGED
@@ -31,5 +31,18 @@ jobs:
31
  - name: Install backend with dev dependencies
32
  run: pip install -e "backend/[dev]"
33
 
34
- - name: Run tests
35
  run: pytest backend/tests/ --tb=short -q
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  - name: Install backend with dev dependencies
32
  run: pip install -e "backend/[dev]"
33
 
34
+ - name: Run backend tests
35
  run: pytest backend/tests/ --tb=short -q
36
+
37
+ - name: Setup Node.js
38
+ uses: actions/setup-node@v4
39
+ with:
40
+ node-version: "20"
41
+ cache: "npm"
42
+ cache-dependency-path: "frontend/package-lock.json"
43
+
44
+ - name: Install frontend dependencies
45
+ run: cd frontend && npm ci
46
+
47
+ - name: Run frontend tests
48
+ run: cd frontend && npm test
backend/app/api/v1/export.py CHANGED
@@ -67,7 +67,7 @@ async def _load_manuscript_with_masters(
67
 
68
  masters: list[PageMaster] = []
69
  for page in pages:
70
- master = await _read_master_json(corpus.slug, page.id)
71
  if master is not None:
72
  masters.append(master)
73
 
@@ -80,14 +80,14 @@ async def _load_manuscript_with_masters(
80
  return manuscript, corpus, masters
81
 
82
 
83
- def _read_master_json_sync(corpus_slug: str, page_id: str) -> PageMaster | None:
84
  """Lit le master.json d'une page depuis data/. Retourne None si absent (bloquant)."""
85
  path = (
86
  _config_module.settings.data_dir
87
  / "corpora"
88
  / corpus_slug
89
  / "pages"
90
- / page_id
91
  / "master.json"
92
  )
93
  if not path.exists():
@@ -96,9 +96,9 @@ def _read_master_json_sync(corpus_slug: str, page_id: str) -> PageMaster | None:
96
  return PageMaster.model_validate(raw)
97
 
98
 
99
- async def _read_master_json(corpus_slug: str, page_id: str) -> PageMaster | None:
100
  """Version async — délègue la lecture au threadpool."""
101
- return await asyncio.to_thread(_read_master_json_sync, corpus_slug, page_id)
102
 
103
 
104
  def _build_manuscript_meta(
@@ -165,7 +165,7 @@ async def get_alto(page_id: str, db: AsyncSession = Depends(get_db)) -> Response
165
  if corpus is None:
166
  raise HTTPException(status_code=404, detail="Corpus introuvable")
167
 
168
- master = await _read_master_json(corpus.slug, page_id)
169
  if master is None:
170
  raise HTTPException(
171
  status_code=404,
 
67
 
68
  masters: list[PageMaster] = []
69
  for page in pages:
70
+ master = await _read_master_json(corpus.slug, page.folio_label)
71
  if master is not None:
72
  masters.append(master)
73
 
 
80
  return manuscript, corpus, masters
81
 
82
 
83
+ def _read_master_json_sync(corpus_slug: str, folio_label: str) -> PageMaster | None:
84
  """Lit le master.json d'une page depuis data/. Retourne None si absent (bloquant)."""
85
  path = (
86
  _config_module.settings.data_dir
87
  / "corpora"
88
  / corpus_slug
89
  / "pages"
90
+ / folio_label
91
  / "master.json"
92
  )
93
  if not path.exists():
 
96
  return PageMaster.model_validate(raw)
97
 
98
 
99
+ async def _read_master_json(corpus_slug: str, folio_label: str) -> PageMaster | None:
100
  """Version async — délègue la lecture au threadpool."""
101
+ return await asyncio.to_thread(_read_master_json_sync, corpus_slug, folio_label)
102
 
103
 
104
  def _build_manuscript_meta(
 
165
  if corpus is None:
166
  raise HTTPException(status_code=404, detail="Corpus introuvable")
167
 
168
+ master = await _read_master_json(corpus.slug, page.folio_label)
169
  if master is None:
170
  raise HTTPException(
171
  status_code=404,
backend/app/api/v1/models_api.py CHANGED
@@ -120,7 +120,7 @@ async def set_corpus_model(
120
  corpus_id: str,
121
  body: ModelSelectRequest,
122
  db: AsyncSession = Depends(get_db),
123
- ) -> ModelConfigDB:
124
  """Associe un modèle IA à un corpus. Crée ou met à jour la configuration."""
125
  corpus = await db.get(CorpusModel, corpus_id)
126
  if corpus is None:
@@ -154,7 +154,7 @@ async def set_corpus_model(
154
  @router.get("/corpora/{corpus_id}/model", response_model=ModelConfigResponse)
155
  async def get_corpus_model(
156
  corpus_id: str, db: AsyncSession = Depends(get_db)
157
- ) -> ModelConfigDB:
158
  """Retourne la configuration du modèle IA actif pour un corpus."""
159
  corpus = await db.get(CorpusModel, corpus_id)
160
  if corpus is None:
 
120
  corpus_id: str,
121
  body: ModelSelectRequest,
122
  db: AsyncSession = Depends(get_db),
123
+ ) -> ModelConfigResponse:
124
  """Associe un modèle IA à un corpus. Crée ou met à jour la configuration."""
125
  corpus = await db.get(CorpusModel, corpus_id)
126
  if corpus is None:
 
154
  @router.get("/corpora/{corpus_id}/model", response_model=ModelConfigResponse)
155
  async def get_corpus_model(
156
  corpus_id: str, db: AsyncSession = Depends(get_db)
157
+ ) -> ModelConfigResponse:
158
  """Retourne la configuration du modèle IA actif pour un corpus."""
159
  corpus = await db.get(CorpusModel, corpus_id)
160
  if corpus is None:
backend/app/api/v1/pages.py CHANGED
@@ -100,7 +100,7 @@ async def _load_master(
100
  / "corpora"
101
  / corpus.slug
102
  / "pages"
103
- / page.id
104
  / "master.json"
105
  )
106
  if not master_path.exists():
@@ -125,7 +125,7 @@ async def _get_page_dir(page: PageModel, db: AsyncSession) -> Path | None:
125
  / "corpora"
126
  / corpus.slug
127
  / "pages"
128
- / page.id
129
  )
130
 
131
 
@@ -388,6 +388,8 @@ async def apply_corrections(
388
  async def get_page_history(
389
  page_id: str,
390
  db: AsyncSession = Depends(get_db),
 
 
391
  ) -> list[VersionInfo]:
392
  """Liste les versions archivées du master.json (master_v*.json).
393
 
@@ -423,4 +425,5 @@ async def get_page_history(
423
  )
424
  continue
425
 
426
- return sorted(versions, key=lambda v: v.version)
 
 
100
  / "corpora"
101
  / corpus.slug
102
  / "pages"
103
+ / page.folio_label
104
  / "master.json"
105
  )
106
  if not master_path.exists():
 
125
  / "corpora"
126
  / corpus.slug
127
  / "pages"
128
+ / page.folio_label
129
  )
130
 
131
 
 
388
  async def get_page_history(
389
  page_id: str,
390
  db: AsyncSession = Depends(get_db),
391
+ limit: int = 100,
392
+ offset: int = 0,
393
  ) -> list[VersionInfo]:
394
  """Liste les versions archivées du master.json (master_v*.json).
395
 
 
425
  )
426
  continue
427
 
428
+ versions.sort(key=lambda v: v.version)
429
+ return versions[offset:offset + limit]
backend/app/config.py CHANGED
@@ -44,6 +44,9 @@ class Settings(BaseSettings):
44
  # ── Base de données ───────────────────────────────────────────────────────
45
  database_url: str = "sqlite+aiosqlite:///./iiif_studio.db"
46
 
 
 
 
47
  # ── Fournisseurs IA (R06 — clés depuis l'environnement uniquement) ────────
48
  # Chaque clé est optionnelle. Le backend détecte automatiquement quels
49
  # providers sont disponibles selon les clés présentes. Pas de AI_PROVIDER
 
44
  # ── Base de données ───────────────────────────────────────────────────────
45
  database_url: str = "sqlite+aiosqlite:///./iiif_studio.db"
46
 
47
+ # ── Pipeline IA ────────────────────────────────────────────────────────────
48
+ ai_max_concurrent: int = 3 # jobs IA simultanés par corpus run
49
+
50
  # ── Fournisseurs IA (R06 — clés depuis l'environnement uniquement) ────────
51
  # Chaque clé est optionnelle. Le backend détecte automatiquement quels
52
  # providers sont disponibles selon les clés présentes. Pas de AI_PROVIDER
backend/app/main.py CHANGED
@@ -36,6 +36,21 @@ def _migrate_model_configs(connection) -> None:
36
  logger.info("Migration : colonne supports_vision ajoutée à model_configs")
37
 
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  @asynccontextmanager
40
  async def lifespan(application: FastAPI):
41
  """Crée les tables SQLite au démarrage, libère l'engine à l'arrêt."""
@@ -65,9 +80,9 @@ async def lifespan(application: FastAPI):
65
 
66
  async with engine.begin() as conn:
67
  await conn.run_sync(Base.metadata.create_all)
68
- # Migration : ajouter supports_vision aux model_configs existantes
69
- # (create_all ne fait pas d'ALTER TABLE sur les tables existantes)
70
  await conn.run_sync(_migrate_model_configs)
 
71
  logger.info("Tables SQLite initialisées")
72
  yield
73
  await engine.dispose()
@@ -117,7 +132,7 @@ async def serve_frontend(full_path: str) -> FileResponse | RedirectResponse:
117
  if _STATIC_DIR.is_dir():
118
  candidate = (_STATIC_DIR / full_path).resolve()
119
  # Empêcher le path traversal : le fichier résolu doit être sous _STATIC_DIR
120
- if candidate.is_file() and str(candidate).startswith(str(_STATIC_DIR.resolve()) + "/"):
121
  return FileResponse(candidate)
122
  index = _STATIC_DIR / "index.html"
123
  if index.exists():
 
36
  logger.info("Migration : colonne supports_vision ajoutée à model_configs")
37
 
38
 
39
+ def _migrate_page_search(connection) -> None:
40
+ """Ajoute la colonne normalized_text si absente (recherche SQL LIKE)."""
41
+ from sqlalchemy import inspect, text
42
+
43
+ inspector = inspect(connection)
44
+ if "page_search" not in inspector.get_table_names():
45
+ return
46
+ columns = {c["name"] for c in inspector.get_columns("page_search")}
47
+ if "normalized_text" not in columns:
48
+ connection.execute(
49
+ text("ALTER TABLE page_search ADD COLUMN normalized_text TEXT NOT NULL DEFAULT ''")
50
+ )
51
+ logger.info("Migration : colonne normalized_text ajoutée à page_search")
52
+
53
+
54
  @asynccontextmanager
55
  async def lifespan(application: FastAPI):
56
  """Crée les tables SQLite au démarrage, libère l'engine à l'arrêt."""
 
80
 
81
  async with engine.begin() as conn:
82
  await conn.run_sync(Base.metadata.create_all)
83
+ # Migrations : create_all ne fait pas d'ALTER TABLE sur les tables existantes
 
84
  await conn.run_sync(_migrate_model_configs)
85
+ await conn.run_sync(_migrate_page_search)
86
  logger.info("Tables SQLite initialisées")
87
  yield
88
  await engine.dispose()
 
132
  if _STATIC_DIR.is_dir():
133
  candidate = (_STATIC_DIR / full_path).resolve()
134
  # Empêcher le path traversal : le fichier résolu doit être sous _STATIC_DIR
135
+ if candidate.is_file() and candidate.is_relative_to(_STATIC_DIR.resolve()):
136
  return FileResponse(candidate)
137
  index = _STATIC_DIR / "index.html"
138
  if index.exists():
backend/app/models/page_search.py CHANGED
@@ -22,3 +22,5 @@ class PageSearchIndex(Base):
22
  diplomatic_text: Mapped[str] = mapped_column(Text, nullable=False, default="")
23
  translation_fr: Mapped[str] = mapped_column(Text, nullable=False, default="")
24
  tags: Mapped[str] = mapped_column(Text, nullable=False, default="")
 
 
 
22
  diplomatic_text: Mapped[str] = mapped_column(Text, nullable=False, default="")
23
  translation_fr: Mapped[str] = mapped_column(Text, nullable=False, default="")
24
  tags: Mapped[str] = mapped_column(Text, nullable=False, default="")
25
+ # Pre-normalized concatenation of all text fields for SQL LIKE search
26
+ normalized_text: Mapped[str] = mapped_column(Text, nullable=False, default="")
backend/app/schemas/page_master.py CHANGED
@@ -50,7 +50,6 @@ class ImageInfo(BaseModel):
50
  master: str # URL source (service IIIF ou statique) ou chemin local
51
  derivative_web: str | None = None # chemin dérivé 1500px (legacy/upload)
52
  thumbnail: str | None = None # chemin thumbnail 256px (legacy/upload)
53
- iiif_base: str | None = None # compat arrière
54
  iiif_service_url: str | None = None # URL du IIIF Image Service (zoom tuilé)
55
  manifest_url: str | None = None # URL du manifest source (provenance)
56
  width: int # largeur du canvas original
 
50
  master: str # URL source (service IIIF ou statique) ou chemin local
51
  derivative_web: str | None = None # chemin dérivé 1500px (legacy/upload)
52
  thumbnail: str | None = None # chemin thumbnail 256px (legacy/upload)
 
53
  iiif_service_url: str | None = None # URL du IIIF Image Service (zoom tuilé)
54
  manifest_url: str | None = None # URL du manifest source (provenance)
55
  width: int # largeur du canvas original
backend/app/services/ai/analyzer.py CHANGED
@@ -17,7 +17,7 @@ from app.schemas.page_master import EditorialInfo, EditorialStatus, ImageInfo, P
17
  from app.services.ai.master_writer import write_ai_raw, write_master_json
18
  from app.services.ai.model_registry import get_provider
19
  from app.services.ai.prompt_loader import load_and_render_prompt
20
- from app.services.ai.response_parser import ParseError, parse_ai_response # noqa: F401
21
 
22
  logger = logging.getLogger(__name__)
23
 
 
17
  from app.services.ai.master_writer import write_ai_raw, write_master_json
18
  from app.services.ai.model_registry import get_provider
19
  from app.services.ai.prompt_loader import load_and_render_prompt
20
+ from app.services.ai.response_parser import parse_ai_response
21
 
22
  logger = logging.getLogger(__name__)
23
 
backend/app/services/ai/response_parser.py CHANGED
@@ -152,7 +152,7 @@ def parse_ai_response(raw_text: str) -> tuple[dict, OCRResult]:
152
  try:
153
  region = Region.model_validate(raw_region)
154
  valid_regions.append(region.model_dump())
155
- except (ValidationError, Exception) as exc:
156
  logger.warning(
157
  "Région ignorée — bbox ou champ invalide",
158
  extra={"index": i, "region": raw_region, "error": str(exc)},
 
152
  try:
153
  region = Region.model_validate(raw_region)
154
  valid_regions.append(region.model_dump())
155
+ except (ValidationError, ValueError, KeyError, TypeError) as exc:
156
  logger.warning(
157
  "Région ignorée — bbox ou champ invalide",
158
  extra={"index": i, "region": raw_region, "error": str(exc)},
backend/app/services/corpus_runner.py CHANGED
@@ -61,10 +61,9 @@ async def execute_corpus_job(corpus_id: str) -> dict:
61
 
62
  # Exécution concurrente avec semaphore — chaque job gère sa propre session
63
  from app.services.job_runner import execute_page_job
 
64
 
65
- _MAX_CONCURRENT = 3 # limiter la pression sur les APIs IA
66
-
67
- sem = asyncio.Semaphore(_MAX_CONCURRENT)
68
 
69
  async def _run_one(jid: str) -> None:
70
  async with sem:
 
61
 
62
  # Exécution concurrente avec semaphore — chaque job gère sa propre session
63
  from app.services.job_runner import execute_page_job
64
+ from app.config import settings
65
 
66
+ sem = asyncio.Semaphore(settings.ai_max_concurrent)
 
 
67
 
68
  async def _run_one(jid: str) -> None:
69
  async with sem:
backend/app/services/job_runner.py CHANGED
@@ -196,7 +196,7 @@ async def _run_job_impl(job_id: str, db: AsyncSession) -> None:
196
  # ── Mode fichier local (upload) ──────────────────────────────────
197
  source_path = Path(image_source).resolve()
198
  data_dir_resolved = data_dir.resolve()
199
- if not str(source_path).startswith(str(data_dir_resolved) + "/") and source_path != data_dir_resolved:
200
  raise ValueError(
201
  f"Chemin image hors du répertoire de données interdit : "
202
  f"{image_source!r} (résolu : {source_path})"
 
196
  # ── Mode fichier local (upload) ──────────────────────────────────
197
  source_path = Path(image_source).resolve()
198
  data_dir_resolved = data_dir.resolve()
199
+ if not source_path.is_relative_to(data_dir_resolved):
200
  raise ValueError(
201
  f"Chemin image hors du répertoire de données interdit : "
202
  f"{image_source!r} (résolu : {source_path})"
backend/app/services/search/indexer.py CHANGED
@@ -1,5 +1,9 @@
1
  """
2
- Service d'indexation et de recherche FTS5 pour les pages analysées.
 
 
 
 
3
  """
4
  import logging
5
  import unicodedata
@@ -32,6 +36,11 @@ def _extract_tags(master: PageMaster) -> str:
32
  return " ".join(tags)
33
 
34
 
 
 
 
 
 
35
  async def index_page(db: AsyncSession, master: PageMaster) -> None:
36
  """Indexe ou met a jour une page dans la table de recherche."""
37
  existing = await db.get(PageSearchIndex, master.page_id)
@@ -39,6 +48,7 @@ async def index_page(db: AsyncSession, master: PageMaster) -> None:
39
  diplomatic = (master.ocr.diplomatic_text if master.ocr else "") or ""
40
  translation = (master.translation.fr if master.translation else "") or ""
41
  tags = _extract_tags(master)
 
42
 
43
  if existing:
44
  existing.corpus_profile = master.corpus_profile
@@ -47,6 +57,7 @@ async def index_page(db: AsyncSession, master: PageMaster) -> None:
47
  existing.diplomatic_text = diplomatic
48
  existing.translation_fr = translation
49
  existing.tags = tags
 
50
  else:
51
  entry = PageSearchIndex(
52
  page_id=master.page_id,
@@ -56,6 +67,7 @@ async def index_page(db: AsyncSession, master: PageMaster) -> None:
56
  diplomatic_text=diplomatic,
57
  translation_fr=translation,
58
  tags=tags,
 
59
  )
60
  db.add(entry)
61
 
@@ -64,33 +76,35 @@ async def index_page(db: AsyncSession, master: PageMaster) -> None:
64
 
65
 
66
  async def search_pages(db: AsyncSession, query: str, limit: int = 200) -> list[dict]:
67
- """Recherche plein texte dans l'index.
68
 
69
- Utilise LIKE avec normalisation (pas FTS5 natif) car SQLite FTS5
70
- necessite une table virtuelle separee qui complique les migrations.
71
- Cette approche est O(n) sur la table mais bien plus rapide que le
72
- scan filesystem car les donnees sont deja en memoire SQLite.
73
  """
74
  query_norm = _normalize(query.strip())
75
  if not query_norm:
76
  return []
77
 
78
- # Search using normalized LIKE across all text columns
79
- # We concatenate and normalize in Python for accent-insensitive search
 
 
80
  result = await db.execute(
81
  text("""
82
  SELECT page_id, corpus_profile, manuscript_id, folio_label,
83
  diplomatic_text, translation_fr, tags
84
  FROM page_search
85
- """)
 
 
86
  )
87
  rows = result.fetchall()
88
 
 
89
  hits: list[dict] = []
90
  for row in rows:
91
  page_id, corpus_profile, manuscript_id, folio_label, diplo, trans, tags = row
92
 
93
- # Score: count occurrences across all fields
94
  score = 0
95
  excerpt = ""
96
  for field_text in [diplo, trans, tags]:
@@ -111,15 +125,14 @@ async def search_pages(db: AsyncSession, query: str, limit: int = 200) -> list[d
111
  ex = ex + "\u2026"
112
  excerpt = ex
113
 
114
- if score > 0:
115
- hits.append({
116
- "page_id": page_id,
117
- "folio_label": folio_label,
118
- "manuscript_id": manuscript_id,
119
- "excerpt": excerpt,
120
- "score": score,
121
- "corpus_profile": corpus_profile,
122
- })
123
 
124
  hits.sort(key=lambda h: h["score"], reverse=True)
125
  return hits[:limit]
 
1
  """
2
+ Service d'indexation et de recherche pour les pages analysées.
3
+
4
+ Utilise une colonne normalized_text pré-calculée pour permettre des
5
+ recherches SQL LIKE insensibles aux accents, sans charger toutes les
6
+ lignes en Python.
7
  """
8
  import logging
9
  import unicodedata
 
36
  return " ".join(tags)
37
 
38
 
39
+ def _build_normalized_text(diplomatic: str, translation: str, tags: str) -> str:
40
+ """Construit le texte normalise concatene pour l'indexation SQL."""
41
+ return _normalize(f"{diplomatic} {translation} {tags}")
42
+
43
+
44
  async def index_page(db: AsyncSession, master: PageMaster) -> None:
45
  """Indexe ou met a jour une page dans la table de recherche."""
46
  existing = await db.get(PageSearchIndex, master.page_id)
 
48
  diplomatic = (master.ocr.diplomatic_text if master.ocr else "") or ""
49
  translation = (master.translation.fr if master.translation else "") or ""
50
  tags = _extract_tags(master)
51
+ normalized = _build_normalized_text(diplomatic, translation, tags)
52
 
53
  if existing:
54
  existing.corpus_profile = master.corpus_profile
 
57
  existing.diplomatic_text = diplomatic
58
  existing.translation_fr = translation
59
  existing.tags = tags
60
+ existing.normalized_text = normalized
61
  else:
62
  entry = PageSearchIndex(
63
  page_id=master.page_id,
 
67
  diplomatic_text=diplomatic,
68
  translation_fr=translation,
69
  tags=tags,
70
+ normalized_text=normalized,
71
  )
72
  db.add(entry)
73
 
 
76
 
77
 
78
  async def search_pages(db: AsyncSession, query: str, limit: int = 200) -> list[dict]:
79
+ """Recherche plein texte dans l'index via SQL LIKE sur normalized_text.
80
 
81
+ La colonne normalized_text est pre-calculee a l'indexation (minuscules,
82
+ sans accents). Le filtrage est fait cote SQL, pas en Python.
 
 
83
  """
84
  query_norm = _normalize(query.strip())
85
  if not query_norm:
86
  return []
87
 
88
+ # Escape SQL LIKE special characters
89
+ query_escaped = query_norm.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
90
+ like_pattern = f"%{query_escaped}%"
91
+
92
  result = await db.execute(
93
  text("""
94
  SELECT page_id, corpus_profile, manuscript_id, folio_label,
95
  diplomatic_text, translation_fr, tags
96
  FROM page_search
97
+ WHERE normalized_text LIKE :pattern ESCAPE '\\'
98
+ """),
99
+ {"pattern": like_pattern},
100
  )
101
  rows = result.fetchall()
102
 
103
+ # Score matching rows in Python (only the filtered subset, not all rows)
104
  hits: list[dict] = []
105
  for row in rows:
106
  page_id, corpus_profile, manuscript_id, folio_label, diplo, trans, tags = row
107
 
 
108
  score = 0
109
  excerpt = ""
110
  for field_text in [diplo, trans, tags]:
 
125
  ex = ex + "\u2026"
126
  excerpt = ex
127
 
128
+ hits.append({
129
+ "page_id": page_id,
130
+ "folio_label": folio_label,
131
+ "manuscript_id": manuscript_id,
132
+ "excerpt": excerpt,
133
+ "score": score,
134
+ "corpus_profile": corpus_profile,
135
+ })
 
136
 
137
  hits.sort(key=lambda h: h["score"], reverse=True)
138
  return hits[:limit]
backend/tests/test_api_corrections.py CHANGED
@@ -337,7 +337,7 @@ async def test_history_with_archived_files(async_client, db_session, tmp_path, m
337
  page = await _create_page(db_session, ms.id)
338
 
339
  # Crée le répertoire avec des fichiers de version
340
- page_dir = tmp_path / "corpora" / corpus.slug / "pages" / page.id
341
  page_dir.mkdir(parents=True)
342
  (page_dir / "master_v1.json").write_text(_make_master(page.id, version=1, status="machine_draft"))
343
  (page_dir / "master_v2.json").write_text(_make_master(page.id, version=2, status="reviewed"))
 
337
  page = await _create_page(db_session, ms.id)
338
 
339
  # Crée le répertoire avec des fichiers de version
340
+ page_dir = tmp_path / "corpora" / corpus.slug / "pages" / page.folio_label
341
  page_dir.mkdir(parents=True)
342
  (page_dir / "master_v1.json").write_text(_make_master(page.id, version=1, status="machine_draft"))
343
  (page_dir / "master_v2.json").write_text(_make_master(page.id, version=2, status="reviewed"))
backend/tests/test_api_export.py CHANGED
@@ -111,16 +111,16 @@ def _make_master_json(page_id: str, folio_label: str, sequence: int) -> str:
111
  def _mock_master_files(monkeypatch, pages):
112
  """Patche Path.exists / Path.read_text pour simuler les master.json."""
113
  master_data = {
114
- p.id: _make_master_json(p.id, p.folio_label, p.sequence)
115
  for p in pages
116
  }
117
 
118
  def fake_exists(self: Path) -> bool:
119
- return any(p_id in str(self) for p_id in master_data)
120
 
121
  def fake_read_text(self: Path, **kwargs) -> str:
122
- for p_id, data in master_data.items():
123
- if p_id in str(self):
124
  return data
125
  raise FileNotFoundError(str(self))
126
 
 
111
  def _mock_master_files(monkeypatch, pages):
112
  """Patche Path.exists / Path.read_text pour simuler les master.json."""
113
  master_data = {
114
+ p.folio_label: _make_master_json(p.id, p.folio_label, p.sequence)
115
  for p in pages
116
  }
117
 
118
  def fake_exists(self: Path) -> bool:
119
+ return any(label in str(self) for label in master_data)
120
 
121
  def fake_read_text(self: Path, **kwargs) -> str:
122
+ for label, data in master_data.items():
123
+ if label in str(self):
124
  return data
125
  raise FileNotFoundError(str(self))
126
 
backend/tests/test_api_search.py CHANGED
@@ -15,6 +15,7 @@ import pytest
15
 
16
  # 3. local
17
  from app.models.page_search import PageSearchIndex
 
18
  from tests.conftest_api import async_client, db_session # noqa: F401
19
 
20
 
@@ -40,6 +41,7 @@ async def _index_page(
40
  diplomatic_text=diplomatic_text,
41
  translation_fr=translation_fr,
42
  tags=tags,
 
43
  )
44
  db.add(entry)
45
  await db.commit()
 
15
 
16
  # 3. local
17
  from app.models.page_search import PageSearchIndex
18
+ from app.services.search.indexer import _build_normalized_text
19
  from tests.conftest_api import async_client, db_session # noqa: F401
20
 
21
 
 
41
  diplomatic_text=diplomatic_text,
42
  translation_fr=translation_fr,
43
  tags=tags,
44
+ normalized_text=_build_normalized_text(diplomatic_text, translation_fr, tags),
45
  )
46
  db.add(entry)
47
  await db.commit()
backend/tests/test_job_runner.py CHANGED
@@ -132,7 +132,6 @@ def _page_master(page_id: str, ms_id: str) -> PageMaster:
132
  image={
133
  "master": "https://example.com/image.jpg",
134
  "derivative_web": "/tmp/deriv.jpg",
135
- "iiif_base": "",
136
  "width": 2000,
137
  "height": 3000,
138
  },
 
132
  image={
133
  "master": "https://example.com/image.jpg",
134
  "derivative_web": "/tmp/deriv.jpg",
 
135
  "width": 2000,
136
  "height": 3000,
137
  },
backend/tests/test_schemas.py CHANGED
@@ -60,7 +60,6 @@ def minimal_page_master() -> dict:
60
  "image": {
61
  "master": "data/corpora/test/masters/0001r.tif",
62
  "derivative_web": "data/corpora/test/derivatives/0001r.jpg",
63
- "iiif_base": "",
64
  "width": 2000,
65
  "height": 3000,
66
  },
 
60
  "image": {
61
  "master": "data/corpora/test/masters/0001r.tif",
62
  "derivative_web": "data/corpora/test/derivatives/0001r.jpg",
 
63
  "width": 2000,
64
  "height": 3000,
65
  },
frontend/src/components/CommentaryPanel.tsx CHANGED
@@ -1,23 +1,8 @@
1
  import { useState, type FC } from 'react'
2
- import type { Commentary, EditorialInfo, EditorialStatus } from '../lib/api.ts'
 
3
  import { RetroBadge, RetroButton } from './retro'
4
 
5
- const STATUS_LABELS: Record<EditorialStatus, string> = {
6
- machine_draft: 'Brouillon IA',
7
- needs_review: 'A reviser',
8
- reviewed: 'Revise',
9
- validated: 'Valide',
10
- published: 'Publie',
11
- }
12
-
13
- const STATUS_VARIANTS: Record<EditorialStatus, 'default' | 'success' | 'warning' | 'error' | 'info'> = {
14
- machine_draft: 'info',
15
- needs_review: 'warning',
16
- reviewed: 'default',
17
- validated: 'success',
18
- published: 'success',
19
- }
20
-
21
  interface Props {
22
  commentary: Commentary | null
23
  editorial: EditorialInfo
 
1
  import { useState, type FC } from 'react'
2
+ import type { Commentary, EditorialInfo } from '../lib/api.ts'
3
+ import { STATUS_LABELS, STATUS_VARIANTS } from '../lib/editorial.ts'
4
  import { RetroBadge, RetroButton } from './retro'
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  interface Props {
7
  commentary: Commentary | null
8
  editorial: EditorialInfo
frontend/src/components/TranscriptionPanel.tsx CHANGED
@@ -1,23 +1,8 @@
1
  import type { FC } from 'react'
2
- import type { OCRResult, EditorialInfo, EditorialStatus } from '../lib/api.ts'
 
3
  import { RetroBadge } from './retro'
4
 
5
- const STATUS_LABELS: Record<EditorialStatus, string> = {
6
- machine_draft: 'Brouillon IA',
7
- needs_review: 'A reviser',
8
- reviewed: 'Revise',
9
- validated: 'Valide',
10
- published: 'Publie',
11
- }
12
-
13
- const STATUS_VARIANTS: Record<EditorialStatus, 'default' | 'success' | 'warning' | 'error' | 'info'> = {
14
- machine_draft: 'info',
15
- needs_review: 'warning',
16
- reviewed: 'default',
17
- validated: 'success',
18
- published: 'success',
19
- }
20
-
21
  interface Props {
22
  ocr: OCRResult | null
23
  editorial: EditorialInfo
 
1
  import type { FC } from 'react'
2
+ import type { OCRResult, EditorialInfo } from '../lib/api.ts'
3
+ import { STATUS_LABELS, STATUS_VARIANTS } from '../lib/editorial.ts'
4
  import { RetroBadge } from './retro'
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  interface Props {
7
  ocr: OCRResult | null
8
  editorial: EditorialInfo
frontend/src/components/TranslationPanel.tsx CHANGED
@@ -1,23 +1,8 @@
1
  import type { FC } from 'react'
2
- import type { Translation, EditorialInfo, EditorialStatus } from '../lib/api.ts'
 
3
  import { RetroBadge } from './retro'
4
 
5
- const STATUS_LABELS: Record<EditorialStatus, string> = {
6
- machine_draft: 'Brouillon IA',
7
- needs_review: 'A reviser',
8
- reviewed: 'Revise',
9
- validated: 'Valide',
10
- published: 'Publie',
11
- }
12
-
13
- const STATUS_VARIANTS: Record<EditorialStatus, 'default' | 'success' | 'warning' | 'error' | 'info'> = {
14
- machine_draft: 'info',
15
- needs_review: 'warning',
16
- reviewed: 'default',
17
- validated: 'success',
18
- published: 'success',
19
- }
20
-
21
  interface Props {
22
  translation: Translation | null
23
  editorial: EditorialInfo
 
1
  import type { FC } from 'react'
2
+ import type { Translation, EditorialInfo } from '../lib/api.ts'
3
+ import { STATUS_LABELS, STATUS_VARIANTS } from '../lib/editorial.ts'
4
  import { RetroBadge } from './retro'
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  interface Props {
7
  translation: Translation | null
8
  editorial: EditorialInfo
frontend/src/lib/api.ts CHANGED
@@ -40,7 +40,7 @@ export interface CorpusRunResponse {
40
  job_ids: string[]
41
  }
42
 
43
- export type JobStatus = 'pending' | 'running' | 'done' | 'failed'
44
 
45
  export interface Job {
46
  id: string
@@ -153,7 +153,6 @@ export interface ImageInfo {
153
  master: string
154
  derivative_web?: string | null
155
  thumbnail?: string | null
156
- iiif_base?: string | null
157
  iiif_service_url?: string | null
158
  manifest_url?: string | null
159
  width: number
@@ -225,7 +224,10 @@ function extractDetail(payload: unknown, fallback: string): string {
225
 
226
  async function get<T>(path: string): Promise<T> {
227
  const resp = await fetch(`${BASE_URL}${path}`)
228
- if (!resp.ok) throw new ApiError(resp.status, `HTTP ${resp.status} — ${path}`)
 
 
 
229
  return resp.json() as Promise<T>
230
  }
231
 
@@ -330,10 +332,13 @@ export interface CorpusModelConfig {
330
  updated_at: string
331
  }
332
 
333
- export const getCorpusModel = (corpusId: string): Promise<CorpusModelConfig | null> =>
334
- fetch(`${BASE_URL}/api/v1/corpora/${corpusId}/model`)
335
- .then((r) => (r.ok ? (r.json() as Promise<CorpusModelConfig>) : null))
336
- .catch(() => null)
 
 
 
337
 
338
  export const ingestImages = (
339
  corpusId: string,
 
40
  job_ids: string[]
41
  }
42
 
43
+ export type JobStatus = 'pending' | 'claimed' | 'running' | 'done' | 'failed'
44
 
45
  export interface Job {
46
  id: string
 
153
  master: string
154
  derivative_web?: string | null
155
  thumbnail?: string | null
 
156
  iiif_service_url?: string | null
157
  manifest_url?: string | null
158
  width: number
 
224
 
225
  async function get<T>(path: string): Promise<T> {
226
  const resp = await fetch(`${BASE_URL}${path}`)
227
+ if (!resp.ok) {
228
+ const payload = await resp.json().catch(() => null)
229
+ throw new ApiError(resp.status, extractDetail(payload, `HTTP ${resp.status} — ${path}`))
230
+ }
231
  return resp.json() as Promise<T>
232
  }
233
 
 
332
  updated_at: string
333
  }
334
 
335
+ export const getCorpusModel = async (corpusId: string): Promise<CorpusModelConfig | null> => {
336
+ try {
337
+ return await get<CorpusModelConfig>(`/api/v1/corpora/${corpusId}/model`)
338
+ } catch {
339
+ return null
340
+ }
341
+ }
342
 
343
  export const ingestImages = (
344
  corpusId: string,
frontend/src/lib/editorial.ts ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { EditorialStatus } from './api.ts'
2
+
3
+ export const STATUS_LABELS: Record<EditorialStatus, string> = {
4
+ machine_draft: 'Brouillon IA',
5
+ needs_review: 'A reviser',
6
+ reviewed: 'Revise',
7
+ validated: 'Valide',
8
+ published: 'Publie',
9
+ }
10
+
11
+ export const STATUS_VARIANTS: Record<EditorialStatus, 'default' | 'success' | 'warning' | 'error' | 'info'> = {
12
+ machine_draft: 'info',
13
+ needs_review: 'warning',
14
+ reviewed: 'default',
15
+ validated: 'success',
16
+ published: 'success',
17
+ }