andrehoffmann80 commited on
Commit
5bc6ac4
·
verified ·
1 Parent(s): aaac4b1

Delete src/dashboard.py

Browse files
Files changed (1) hide show
  1. src/dashboard.py +0 -957
src/dashboard.py DELETED
@@ -1,957 +0,0 @@
1
- import datetime
2
- from urllib.parse import quote
3
-
4
- import requests
5
- from lxml import etree
6
- import streamlit as st
7
-
8
- # =====================================================================
9
- # Namespaces
10
- # =====================================================================
11
-
12
# XML namespace URIs used when parsing MODS and building the Crossref deposit.
CROSSREF_NS = "http://www.crossref.org/schema/4.4.2"  # default namespace of <doi_batch>
XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"  # carries xsi:schemaLocation
JATS_NS = "http://www.ncbi.nlm.nih.gov/JATS1"  # chapter abstracts
XML_NS = "http://www.w3.org/XML/1998/namespace"  # namespace of the xml: prefix
AI_NS = "http://www.crossref.org/AccessIndicators.xsd"  # license information
MODS_NS = "http://www.loc.gov/mods/v3"  # bound to "mods" when a record does not declare that prefix
XML_LANG = f"{{{XML_NS}}}lang"  # qualified name of the xml:lang attribute
19
-
20
-
21
- # =====================================================================
22
- # Hilfsfunktionen
23
- # =====================================================================
24
-
25
def clean_text(text: str) -> str:
    """Normalize a text snippet taken from a metadata record.

    Soft hyphens (U+00AD) are removed, every newline becomes a space and
    leading/trailing whitespace is stripped; nothing else is changed.
    Falsy input (``None`` or ``""``) yields an empty string.
    """
    if text:
        # Single pass: drop U+00AD, map "\n" to " ".
        return text.translate({0x00AD: None, 0x000A: " "}).strip()
    return ""
34
-
35
-
36
def get_text(node, xpath, ns):
    """Return the cleaned text of the first element matching ``xpath``.

    ``node.find`` is called with ``ns`` as the prefix map. An empty string
    is returned when nothing matches or the match has no text.
    """
    match = node.find(xpath, namespaces=ns)
    if match is None or not match.text:
        return ""
    return clean_text(match.text)
39
-
40
-
41
def build_dora_mods_url(base_url: str, repo_code: str, object_or_url: str) -> str:
    """Return the MODS download URL for a DORA object.

    Args:
        base_url: Root URL of the DORA instance; a trailing slash is ignored.
        repo_code: Repository path segment such as ``"wsl"``.
        object_or_url: Either a DORA object ID such as ``"wsl:41900"`` or a
            complete ``http(s)`` URL. Surrounding whitespace (for example
            from a pasted form value) is ignored.

    Returns:
        The given URL (whitespace-stripped) when it already starts with
        ``http://`` or ``https://``; otherwise
        ``{base_url}/{repo_code}/islandora/object/{id}/datastream/MODS/download``
        with the ID percent-encoded.
    """
    target = object_or_url.strip()
    if target.startswith(("http://", "https://")):
        return target
    # safe="" so that the ":" in IDs like "wsl:41900" is encoded as well.
    encoded = quote(target, safe="")
    return f"{base_url.rstrip('/')}/{repo_code}/islandora/object/{encoded}/datastream/MODS/download"
52
-
53
-
54
def build_persistent_url(repo_code: str, object_id: str) -> str:
    """Return the persistent link ``https://www.dora.lib4ri.ch/{repo}/item/{id}``.

    The public ``www`` host is hard-coded for persistent links.
    """
    return "https://www.dora.lib4ri.ch/{}/item/{}".format(repo_code, object_id)
62
-
63
-
64
def fetch_mods_xml(mods_url: str, timeout: float = 30) -> etree._Element:
    """Download a MODS record and return its parsed root element.

    Args:
        mods_url: URL of the MODS datastream.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The root element of the parsed document.

    Raises:
        requests.RequestException: On network errors, timeouts or an HTTP
            error status.
        ValueError: If the response contains no XML the parser could recover.
    """
    resp = requests.get(mods_url, timeout=timeout)
    resp.raise_for_status()
    # recover=True tolerates malformed XML (e.g. unescaped HTML in notes).
    parser = etree.XMLParser(recover=True, remove_blank_text=True)
    root = etree.fromstring(resp.content, parser=parser)
    if root is None:
        # NOTE(review): a recovering parser is expected to hand back None when
        # nothing usable could be parsed; fail here with a clear message
        # instead of an AttributeError further down.
        raise ValueError(f"No parseable XML returned by {mods_url}")
    return root
71
-
72
-
73
def parse_book_mods(book_root: etree._Element, repo_base_url: str) -> dict:
    """Extract book-level metadata from a book MODS record.

    Args:
        book_root: Root element of the parsed MODS document.
        repo_base_url: Repository base URL; its last path segment is used as
            repository code when the local identifier has no ``repo:`` prefix.

    Returns:
        A dict with the keys ``book_title``, ``series_title``,
        ``series_issn``, ``publisher_name``, ``pub_year`` (int),
        ``pub_month``, ``pub_day``, ``noisbn_reason``, ``book_doi``,
        ``book_resource``, ``report_number``, ``editors`` and ``authors``.
        ``pub_month`` and ``pub_day`` are always taken from today's date;
        ``pub_year`` falls back to the current year when the record has no
        usable issue date. ``report_number`` is always ``""`` here.
    """
    ns = book_root.nsmap.copy()
    # Make sure the "mods" prefix used in the XPath expressions is bound.
    ns.setdefault("mods", MODS_NS)

    book_title = get_text(book_root, ".//mods:titleInfo/mods:title", ns)

    # Series information, if the record has any.
    series_title = get_text(
        book_root,
        ".//mods:relatedItem[@type='series']/mods:titleInfo/mods:title",
        ns,
    )
    series_issn = get_text(
        book_root,
        ".//mods:relatedItem[@type='series']/mods:identifier[@type='issn']",
        ns,
    )

    # Editors (edited books) and book-level authors (monographs).
    editors = []
    authors = []
    for name in book_root.findall(".//mods:name[@type='personal']", ns):
        role = name.find("mods:role/mods:roleTerm", ns)
        # An empty <roleTerm/> has text None; treat it like a missing role
        # instead of failing on None.lower().
        role_text = (role.text or "").strip().lower() if role is not None else ""
        if role_text not in ("editor", "author"):
            continue
        person = {
            "given": get_text(name, "mods:namePart[@type='given']", ns),
            "family": get_text(name, "mods:namePart[@type='family']", ns),
        }
        (editors if role_text == "editor" else authors).append(person)

    publisher_name = get_text(book_root, ".//mods:originInfo/mods:publisher", ns)

    # Prefer the key date in w3cdtf encoding, then any dateIssued.
    pub_year = get_text(
        book_root,
        ".//mods:originInfo/mods:dateIssued[@encoding='w3cdtf'][@keyDate='yes']",
        ns,
    )
    if not pub_year:
        pub_year = get_text(book_root, ".//mods:originInfo/mods:dateIssued", ns)

    book_doi = get_text(book_root, ".//mods:identifier[@type='doi']", ns)

    # Persistent URL, e.g. https://www.dora.lib4ri.ch/psi/item/psi:84778
    book_id = get_text(book_root, ".//mods:identifier[@type='local']", ns)
    if not book_id:
        # Fall back to the DOI suffix when there is no local identifier.
        book_id = book_doi.split("/")[-1] if book_doi else ""

    # Repository code comes from the ID itself ('psi' in 'psi:84778'),
    # otherwise from the last segment of the repository base URL.
    if ":" in book_id:
        current_repo = book_id.split(":")[0]
    else:
        current_repo = repo_base_url.rstrip("/").split("/")[-1]
    book_resource = build_persistent_url(current_repo, book_id) if book_id else ""

    # Without an ISBN a noisbn reason is pre-filled.
    isbn_val = get_text(book_root, ".//mods:identifier[@type='isbn']", ns)
    noisbn_reason = "" if isbn_val else "archive_volume"

    today = datetime.date.today()
    try:
        # Covers both a missing date ("") and a non-numeric one.
        year_value = int(pub_year[:4])
    except ValueError:
        year_value = today.year

    return {
        "book_title": book_title,
        "series_title": series_title or "",
        "series_issn": series_issn or "",
        "publisher_name": publisher_name,
        "pub_year": year_value,
        "pub_month": str(today.month),
        "pub_day": str(today.day),
        "noisbn_reason": noisbn_reason,
        "book_doi": book_doi or "",
        "book_resource": book_resource or "",
        "report_number": "",
        "editors": editors,
        "authors": authors,
    }
160
-
161
-
162
def mods_to_content_item(mods_root: etree._Element, repo_base_url: str) -> tuple[etree._Element, int]:
    """Convert a chapter MODS record into a Crossref ``<content_item>``.

    Args:
        mods_root: Root element of the chapter's MODS document.
        repo_base_url: Repository base URL; its last path segment is used as
            repository code when the DOI suffix has no ``repo:`` prefix.

    Returns:
        ``(content_item, sort_key)``. ``sort_key`` is the first page as an
        int, or ``999999`` when the first page is missing or not numeric.

    Elements that would be empty (contributors without authors, an abstract
    without text, a publication date without year) are left out rather than
    emitted as empty tags.
    """
    ns = mods_root.nsmap.copy()
    # Make sure the "mods" prefix used in the XPath expressions is bound.
    ns.setdefault("mods", MODS_NS)

    title = get_text(mods_root, ".//mods:titleInfo/mods:title", ns)
    doi = get_text(mods_root, ".//mods:identifier[@type='doi']", ns)
    year = get_text(mods_root, ".//mods:originInfo/mods:dateIssued", ns)
    abstract = get_text(mods_root, ".//mods:abstract", ns)
    first_page = get_text(mods_root, ".//mods:extent[@unit='page']/mods:start", ns)
    last_page = get_text(mods_root, ".//mods:extent[@unit='page']/mods:end", ns)

    # Authors: the role is matched case-insensitively, as parse_book_mods does.
    authors = []
    for name in mods_root.findall(".//mods:name[@type='personal']", ns):
        role = name.find("mods:role/mods:roleTerm", ns)
        role_text = (role.text or "").strip().lower() if role is not None else ""
        if role_text == "author":
            given = get_text(name, "mods:namePart[@type='given']", ns)
            family = get_text(name, "mods:namePart[@type='family']", ns)
            authors.append((given, family))

    ci = etree.Element("content_item", component_type="chapter")

    # Contributors (same guard as for the book-level contributors).
    if authors:
        contribs = etree.SubElement(ci, "contributors")
        for idx, (given, family) in enumerate(authors):
            pn = etree.SubElement(
                contribs,
                "person_name",
                sequence="first" if idx == 0 else "additional",
                contributor_role="author",
            )
            etree.SubElement(pn, "given_name").text = given
            etree.SubElement(pn, "surname").text = family

    # Title
    titles = etree.SubElement(ci, "titles")
    etree.SubElement(titles, "title").text = title

    # Abstract (JATS). NOTE(review): the language is hard-coded to "en".
    if abstract:
        jats_abs = etree.SubElement(ci, f"{{{JATS_NS}}}abstract", {XML_LANG: "en"})
        etree.SubElement(jats_abs, f"{{{JATS_NS}}}p").text = abstract

    # Publication date: only emitted together with its <year> child.
    if year:
        pub = etree.SubElement(ci, "publication_date", media_type="online")
        etree.SubElement(pub, "year").text = year[:4]

    # Pages
    if first_page or last_page:
        pages = etree.SubElement(ci, "pages")
        if first_page:
            etree.SubElement(pages, "first_page").text = first_page
        if last_page:
            etree.SubElement(pages, "last_page").text = last_page

    # License information (AccessIndicators) - must come before doi_data.
    ai_program = etree.SubElement(ci, f"{{{AI_NS}}}program", name="AccessIndicators")
    license_ref = etree.SubElement(ai_program, f"{{{AI_NS}}}license_ref")
    license_ref.text = "https://creativecommons.org/licenses/by/4.0/"
    license_ref.set("applies_to", "vor")
    if year:
        # An empty start_date attribute is not a valid date, so omit it
        # when the year is unknown.
        license_ref.set("start_date", f"{year[:4]}-01-01")

    # DOI and persistent resource URL derived from the DOI suffix.
    if doi:
        doi_data = etree.SubElement(ci, "doi_data")
        etree.SubElement(doi_data, "doi").text = doi

        chapter_id = doi.split("/")[-1]
        if ":" in chapter_id:
            repo_code_extracted = chapter_id.split(":")[0]
        else:
            repo_code_extracted = repo_base_url.split("/")[-1]
        etree.SubElement(doi_data, "resource").text = build_persistent_url(
            repo_code_extracted, chapter_id
        )

    # Sort key: first page; unknown/non-numeric pages sort last.
    try:
        page_number = int(first_page)
    except ValueError:
        page_number = 999999

    return ci, page_number
247
-
248
-
249
def build_doi_batch_xml(
    book_meta: dict,
    depositor_meta: dict,
    chapter_items: list[tuple[etree._Element, int]],
    book_type: str = "edited_book",
) -> bytes:
    """Build the Crossref ``<doi_batch>`` deposit document.

    Args:
        book_meta: Book/report metadata. Keys read: ``book_title``,
            ``publisher_name``, ``pub_year`` (required) and, optionally,
            ``series_title``, ``series_issn``, ``authors``, ``editors``,
            ``pub_month``, ``pub_day``, ``noisbn_reason``, ``report_number``,
            ``book_doi``, ``book_resource``.
        depositor_meta: Requires ``doi_batch_id``, ``depositor_name``,
            ``depositor_email`` and ``registrant``.
        chapter_items: ``(content_item, sort_key)`` pairs; they are appended
            in ascending ``sort_key`` order. The list itself is not modified.
        book_type: ``'edited_book'``, ``'monograph'`` or the internal flag
            ``'report-paper'``, which produces a ``<report-paper>`` instead
            of a ``<book>``.

    Returns:
        The pretty-printed, UTF-8 encoded XML including the XML declaration.
    """
    doi_batch = etree.Element(
        "doi_batch",
        nsmap={
            None: CROSSREF_NS,
            "xsi": XSI_NS,
            "jats": JATS_NS,
            "ai": AI_NS,
        },
    )
    doi_batch.set("version", "4.4.2")
    doi_batch.set(
        f"{{{XSI_NS}}}schemaLocation",
        "http://www.crossref.org/schema/4.4.2 "
        "http://www.crossref.org/schema/deposit/crossref4.4.2.xsd",
    )

    # HEAD
    head = etree.SubElement(doi_batch, "head")
    etree.SubElement(head, "doi_batch_id").text = depositor_meta["doi_batch_id"]

    # UTC timestamp; datetime.utcnow() is deprecated since Python 3.12.
    ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
    etree.SubElement(head, "timestamp").text = ts

    depositor = etree.SubElement(head, "depositor")
    etree.SubElement(depositor, "depositor_name").text = depositor_meta["depositor_name"]
    etree.SubElement(depositor, "email_address").text = depositor_meta["depositor_email"]

    etree.SubElement(head, "registrant").text = depositor_meta["registrant"]

    # BODY
    body = etree.SubElement(doi_batch, "body")

    is_report = book_type == "report-paper"
    has_series = bool(book_meta.get("series_title") or book_meta.get("series_issn"))

    # Container element plus the matching metadata element; the *_series_*
    # variant is used as soon as a series title or ISSN is present.
    if is_report:
        container = etree.SubElement(body, "report-paper")
        metadata_tag = "report-paper_series_metadata" if has_series else "report-paper_metadata"
    else:
        container = etree.SubElement(body, "book", book_type=book_type)
        metadata_tag = "book_series_metadata" if has_series else "book_metadata"
    metadata_root = etree.SubElement(container, metadata_tag)
    if has_series:
        _append_series_metadata(metadata_root, book_meta)

    # Contributors: authors for monographs/reports, editors otherwise.
    if book_type in ("monograph", "report-paper"):
        contributors_list = book_meta.get("authors", [])
        role = "author"
    else:
        contributors_list = book_meta.get("editors", [])
        role = "editor"

    if contributors_list:
        contribs = etree.SubElement(metadata_root, "contributors")
        for idx, person in enumerate(contributors_list):
            pn = etree.SubElement(
                contribs,
                "person_name",
                sequence="first" if idx == 0 else "additional",
                contributor_role=role,
            )
            etree.SubElement(pn, "given_name").text = person["given"]
            etree.SubElement(pn, "surname").text = person["family"]

    # Titles
    titles = etree.SubElement(metadata_root, "titles")
    etree.SubElement(titles, "title").text = book_meta["book_title"]

    # Publication date: month and day are optional, the year is always written.
    pub = etree.SubElement(metadata_root, "publication_date", media_type="online")
    for part in ("month", "day"):
        raw = book_meta.get(f"pub_{part}")
        if not (raw and raw.strip()):
            continue
        try:
            value = f"{int(raw):02d}"
        except ValueError:
            # Deliberately best-effort: a non-numeric month/day is left out.
            continue
        etree.SubElement(pub, part).text = value
    etree.SubElement(pub, "year").text = str(book_meta["pub_year"])

    # noisbn (books only)
    if not is_report and book_meta.get("noisbn_reason"):
        etree.SubElement(metadata_root, "noisbn", reason=book_meta["noisbn_reason"])

    # Publisher
    pub_node = etree.SubElement(metadata_root, "publisher")
    etree.SubElement(pub_node, "publisher_name").text = book_meta["publisher_name"]

    # Publisher item with the report number (reports only)
    if is_report and book_meta.get("report_number"):
        publisher_item = etree.SubElement(metadata_root, "publisher_item")
        etree.SubElement(
            publisher_item, "identifier", id_type="report-number"
        ).text = book_meta["report_number"]

    # DOI data
    if book_meta.get("book_doi") or book_meta.get("book_resource"):
        doi_data = etree.SubElement(metadata_root, "doi_data")
        if book_meta.get("book_doi"):
            etree.SubElement(doi_data, "doi").text = book_meta["book_doi"]
        if book_meta.get("book_resource"):
            etree.SubElement(doi_data, "resource").text = book_meta["book_resource"]

    # Chapters follow the metadata element inside <book> / <report-paper>.
    # sorted() keeps the caller's list untouched.
    for ci, _page in sorted(chapter_items, key=lambda item: item[1]):
        container.append(ci)

    return etree.tostring(
        doi_batch,
        pretty_print=True,
        encoding="UTF-8",
        xml_declaration=True,
    )


def _append_series_metadata(metadata_root, book_meta: dict) -> None:
    """Append <series_metadata> with the series title and/or ISSN from ``book_meta``."""
    series_metadata = etree.SubElement(metadata_root, "series_metadata")
    if book_meta.get("series_title"):
        stitles = etree.SubElement(series_metadata, "titles")
        etree.SubElement(stitles, "title").text = book_meta["series_title"]
    if book_meta.get("series_issn"):
        etree.SubElement(series_metadata, "issn").text = book_meta["series_issn"]
408
-
409
-
410
class CrossrefSchemaResolver(etree.Resolver):
    """Resolver that downloads XSD files referenced from the Crossref schema."""

    # Bare MathML schema file names and the W3C URLs they are fetched from.
    _KNOWN_LOCATIONS = {
        'mathml3-content.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-content.xsd',
        'mathml3-presentation.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-presentation.xsd',
        'mathml3-strict-content.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-strict-content.xsd',
        'mathml3-common.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-common.xsd',
    }

    def resolve(self, url, id, context):
        """Fetch ``url`` over HTTP and hand its content to the parser.

        Absolute http(s) URLs are used as given, known MathML file names are
        mapped to their W3C location, and anything else is looked up under
        ``https://www.crossref.org/schemas/``. Returns ``None`` when the
        download fails.
        """
        if url.startswith(("http://", "https://")):
            target = url
        else:
            target = self._KNOWN_LOCATIONS.get(
                url, f"https://www.crossref.org/schemas/{url}"
            )

        try:
            reply = requests.get(target, timeout=15)
            reply.raise_for_status()
            return self.resolve_string(reply.content, context)
        except Exception:
            # Fetching failed: return None to use the default behavior.
            return None
437
-
438
-
439
def validate_crossref_xml(xml_bytes: bytes) -> tuple[bool, list[str]]:
    """Validate Crossref XML against the Crossref 4.4.2 XSD.

    The compiled schema is downloaded once and then kept in
    ``st.session_state`` under ``crossref_schema``.

    Returns:
        tuple: ``(is_valid, error_messages)``. Parse errors, schema download
        problems and unexpected exceptions are reported as messages with
        ``is_valid`` set to ``False``.
    """
    xsd_location = "https://www.crossref.org/schemas/crossref4.4.2.xsd"
    problems = []

    try:
        document = etree.fromstring(xml_bytes)

        if 'crossref_schema' not in st.session_state:
            try:
                # Included schema files are fetched through the custom resolver.
                resolving_parser = etree.XMLParser()
                resolving_parser.resolvers.add(CrossrefSchemaResolver())

                reply = requests.get(xsd_location, timeout=30)
                reply.raise_for_status()

                xsd_tree = etree.fromstring(reply.content, resolving_parser)
                st.session_state.crossref_schema = etree.XMLSchema(xsd_tree)
            except Exception as exc:
                problems.append(f"Fehler beim Laden des XSD Schemas: {exc}")
                return False, problems

        validator = st.session_state.crossref_schema
        if validator.validate(document):
            return True, problems

        # One message per entry of the validator's error log.
        for entry in validator.error_log:
            problems.append(f"Zeile {entry.line}: {entry.message}")
        return False, problems

    except etree.XMLSyntaxError as exc:
        problems.append(f"XML Syntax Fehler: {exc}")
    except Exception as exc:
        problems.append(f"Unerwarteter Fehler: {exc}")
    return False, problems
490
-
491
-
492
- # =====================================================================
493
- # REPOSITORY CONFIGURATION
494
- # =====================================================================
495
-
496
# Per-repository settings, keyed by repository code. Publisher and registrant
# carry the same institution name; "prefix" is the DOI prefix and "role" the
# default Crossref role.
REPO_CONFIG = {
    code: {
        "publisher": institution,
        "registrant": institution,
        "prefix": doi_prefix,
        "role": crossref_role,
    }
    for code, institution, doi_prefix, crossref_role in (
        (
            "wsl",
            "Swiss Federal Institute for Forest, Snow and Landscape Research, WSL",
            "10.55419",
            "wslx",
        ),
        (
            "psi",
            "Paul Scherrer Institute, PSI",
            "10.55402",
            "psit",
        ),
        (
            "empa",
            "Swiss Federal Laboratories for Materials Science and Technology, Empa",
            "10.55368",
            "empa",
        ),
        (
            "eawag",
            "Swiss Federal Institute of Aquatic Science and Technology, Eawag",
            "10.55408",
            "eawa",
        ),
    )
}
522
-
523
- def main():
524
- st.title("Crossref XML Generator/Uploader")
525
-
526
- st.markdown(
527
- "Dieses Dashboard lädt **MODS-Metadaten direkt aus DORA** mittels IDs "
528
- "und erzeugt ein vollständiges Crossref-XML (`doi_batch`) für Reports (WSL Berichte und PSI Berichte) und Edited Books/Conference Proceedings."
529
- )
530
-
531
- st.subheader("Konfiguration & Quelle")
532
-
533
- col_config, col_source = st.columns(2)
534
-
535
- with col_config:
536
- st.markdown("#### Verbindung & Typ")
537
- base_url = st.text_input(
538
- "DORA Basis-URL",
539
- value="https://admin.dora.lib4ri.ch"
540
- )
541
-
542
- repo_list = list(REPO_CONFIG.keys())
543
- repo_code = st.selectbox(
544
- "Repository-Code",
545
- options=repo_list,
546
- index=0,
547
- format_func=lambda x: x.upper()
548
- )
549
-
550
- repo_config = REPO_CONFIG[repo_code]
551
- repo_base_url = f"{base_url.rstrip('/')}/{repo_code}"
552
-
553
- pub_type = st.radio(
554
- "Publikationstyp",
555
- ("Edited Book", "Report (WSL, Monograph Series)", "Report (Eawag, PSI, Paper Series)"),
556
- horizontal=False
557
- )
558
-
559
- # Mapping auf Crossref book_type / report type
560
- cr_book_type = "edited_book"
561
- if "Monograph" in pub_type:
562
- cr_book_type = "monograph"
563
- elif "Paper Series" in pub_type:
564
- cr_book_type = "report-paper"
565
-
566
- with col_source:
567
- st.markdown("#### MODS-Quelle")
568
- # Dynamic default ID based on repo
569
- default_id = "41891"
570
- if repo_code == "psi":
571
- default_id = "84057"
572
-
573
- book_id_or_url = st.text_input(
574
- "DORA-ID oder MODS-URL",
575
- value=f"{repo_code}:{default_id}",
576
- help="Beispiel: wsl:41900 oder komplette URL"
577
- )
578
-
579
- st.write("") # Spacer
580
- if st.button("Metadaten laden", type="primary"):
581
- try:
582
- mods_url = build_dora_mods_url(base_url, repo_code, book_id_or_url)
583
- st.info(f"Lade MODS von: {mods_url}")
584
- book_root = fetch_mods_xml(mods_url)
585
- meta = parse_book_mods(book_root, repo_base_url)
586
-
587
- # --- Attempt to extract report number from MODS ---
588
- ns = book_root.nsmap.copy()
589
- if "mods" not in ns:
590
- ns["mods"] = MODS_NS
591
- report_num = get_text(book_root, ".//mods:identifier[@type='report number']", ns)
592
- if not report_num:
593
- report_num = get_text(book_root, ".//mods:identifier[@type='report-number']", ns)
594
-
595
- if not report_num:
596
- # Check <note type="report number">
597
- report_num = get_text(book_root, ".//mods:note[@type='report number']", ns)
598
-
599
- if report_num:
600
- meta["report_number"] = report_num
601
- st.info(f"Report Number gefunden: {report_num}")
602
- # --------------------------------------------------
603
-
604
- # Update flat fields in session state for widgets
605
- for k, v in meta.items():
606
- if k in ["book_title", "series_title", "series_issn", "publisher_name",
607
- "pub_year", "pub_month", "pub_day", "noisbn_reason",
608
- "book_doi", "book_resource", "report_number"]:
609
- st.session_state[k] = v
610
- st.session_state.book_meta[k] = v
611
-
612
- # Special handling for persons text area
613
- if cr_book_type in ["monograph", "report-paper"]:
614
- current_list = meta.get("authors", [])
615
- else:
616
- current_list = meta.get("editors", [])
617
- st.session_state["persons_input"] = "\n".join(f"{e['given']};{e['family']}" for e in current_list)
618
-
619
- st.session_state.book_meta_loaded = True
620
- st.success("Metadaten erfolgreich geladen.")
621
- st.rerun()
622
- except Exception as e:
623
- st.error(f"Fehler beim Laden der MODS: {e}")
624
- import traceback
625
- st.text(traceback.format_exc())
626
-
627
- # Session State Init Logic (unchanged but placed after UI definition for clarity in reading flow, strictly it runs before inputs generally)
628
- if "book_meta_loaded" not in st.session_state:
629
- st.session_state.book_meta_loaded = False
630
-
631
- # Current date for defaults
632
- today = datetime.date.today()
633
-
634
- # Initialize session state keys for widgets if not present
635
- if "book_title" not in st.session_state:
636
- st.session_state.book_title = ""
637
- if "series_title" not in st.session_state:
638
- st.session_state.series_title = ""
639
- if "series_issn" not in st.session_state:
640
- st.session_state.series_issn = ""
641
- if "publisher_name" not in st.session_state:
642
- st.session_state.publisher_name = repo_config["publisher"]
643
- if "pub_year" not in st.session_state:
644
- st.session_state.pub_year = today.year
645
- if "pub_month" not in st.session_state:
646
- st.session_state.pub_month = str(today.month)
647
- if "pub_day" not in st.session_state:
648
- st.session_state.pub_day = str(today.day)
649
- if "noisbn_reason" not in st.session_state:
650
- st.session_state.noisbn_reason = ""
651
- if "book_doi" not in st.session_state:
652
- st.session_state.book_doi = ""
653
- if "book_resource" not in st.session_state:
654
- st.session_state.book_resource = ""
655
- if "report_number" not in st.session_state:
656
- st.session_state.report_number = ""
657
- if "persons_input" not in st.session_state:
658
- st.session_state.persons_input = ""
659
-
660
- if "book_meta" not in st.session_state:
661
- st.session_state.book_meta = {
662
- "book_title": "",
663
- "series_title": "",
664
- "series_issn": "",
665
- "publisher_name": repo_config["publisher"],
666
- "pub_year": today.year,
667
- "pub_month": str(today.month),
668
- "pub_day": str(today.day),
669
- "noisbn_reason": "",
670
- "book_doi": "",
671
- "book_resource": "",
672
- "report_number": "",
673
- "editors": [],
674
- "authors": [],
675
- }
676
-
677
- # CHECK: has the repo code changed since last run?
678
- if "last_repo_code" not in st.session_state:
679
- st.session_state.last_repo_code = repo_code
680
- st.session_state.registrant = repo_config["registrant"]
681
- st.session_state.cr_role = repo_config.get("role", "")
682
-
683
- if st.session_state.last_repo_code != repo_code:
684
- # Repo changed! Update defaults
685
- st.session_state.publisher_name = repo_config["publisher"]
686
- st.session_state.book_meta["publisher_name"] = repo_config["publisher"]
687
- st.session_state.registrant = repo_config["registrant"]
688
-
689
- # If the user hasn't typed anything yet or if we force update?
690
- # Let's force update the role in session state so the input widget picks it up
691
- st.session_state.cr_role = repo_config.get("role", "")
692
-
693
- st.session_state.last_repo_code = repo_code
694
-
695
- st.markdown("---")
696
- st.subheader("Metadaten & Inhalte")
697
-
698
- # Use expander for metadata editing to keep UI clean
699
- with st.expander("Metadaten bearbeiten", expanded=True):
700
- bm = st.session_state.book_meta
701
-
702
- col_b1, col_b2 = st.columns(2)
703
- with col_b1:
704
- st.text_input("Titel", key="book_title")
705
- st.text_input("Serientitel", key="series_title")
706
- st.text_input("Serien-ISSN", key="series_issn")
707
- st.text_input("Publisher Name", key="publisher_name")
708
-
709
- if cr_book_type == "report-paper":
710
- st.text_input("Report Number", key="report_number")
711
-
712
- with col_b2:
713
- c_y, c_m, c_d = st.columns(3)
714
- with c_y:
715
- st.number_input("Jahr", min_value=1900, max_value=2100, key="pub_year")
716
- with c_m:
717
- st.text_input("Monat", key="pub_month")
718
- with c_d:
719
- st.text_input("Tag", key="pub_day")
720
-
721
- if cr_book_type != "report-paper":
722
- st.text_input("noisbn reason", key="noisbn_reason")
723
-
724
- st.markdown("##### Identifikatoren")
725
- col_id1, col_id2 = st.columns(2)
726
- with col_id1:
727
- st.text_input("DOI", key="book_doi")
728
- with col_id2:
729
- st.text_input("Resource URL", key="book_resource")
730
-
731
- st.caption(f"Basis DOI Prefix: {repo_config['prefix']}")
732
-
733
- st.markdown("##### Mitwirkende")
734
- # Decide label based on type
735
- if cr_book_type in ["monograph", "report-paper"]:
736
- st.info("Bitte **Autoren** eintragen (Vorname;Nachname).")
737
- label = "Autoren"
738
- else:
739
- st.info("Bitte **Editoren** eintragen (Vorname;Nachname).")
740
- label = "Editoren"
741
-
742
- persons_text = st.text_area(label, key="persons_input", height=100)
743
-
744
- # Parse and save back
745
- new_persons = []
746
- for line in persons_text.splitlines():
747
- line = line.strip()
748
- if not line:
749
- continue
750
- parts = [p.strip() for p in line.split(";")]
751
- if len(parts) == 2:
752
- new_persons.append({"given": parts[0], "family": parts[1]})
753
-
754
- if cr_book_type in ["monograph", "report-paper"]:
755
- bm["authors"] = new_persons
756
- else:
757
- bm["editors"] = new_persons
758
-
759
- st.markdown("---")
760
- st.subheader("Depositor & Batch Info")
761
-
762
- with st.expander("Depositor Details", expanded=False):
763
- col_d1, col_d2 = st.columns(2)
764
- with col_d1:
765
- depositor_name = st.text_input(
766
- "Depositor Name",
767
- value="Lib4RI - Library for the Research Institutes within the ETH Domain: Eawag, Empa, PSI & WSL"
768
- )
769
- with col_d2:
770
- depositor_email = st.text_input("Depositor Email", value="dora@lib4ri.ch")
771
-
772
- ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
773
-
774
- batch_prefix = "book"
775
- if cr_book_type == "report-paper":
776
- batch_prefix = "report"
777
- elif cr_book_type == "monograph":
778
- batch_prefix = "monograph"
779
-
780
- doi_batch_id = st.text_input(
781
- "DOI Batch ID",
782
- value=f"{batch_prefix}_{ts}",
783
- help="Wird im XML-Header verwendet. Sollte eindeutig sein."
784
- )
785
-
786
- if "registrant" not in st.session_state:
787
- st.session_state.registrant = repo_config["registrant"]
788
-
789
- registrant = st.text_input("Registrant", value=st.session_state.registrant)
790
- st.session_state.registrant = registrant
791
-
792
- depositor_meta = {
793
- "depositor_name": depositor_name,
794
- "depositor_email": depositor_email,
795
- "registrant": st.session_state.registrant,
796
- "doi_batch_id": doi_batch_id
797
- }
798
-
799
- st.subheader("Kapitel / Inhalte")
800
- st.caption("Ein Eintrag pro Zeile: ID (z.B. wsl:12345) oder URL")
801
-
802
- st.markdown(
803
- "Gib **eine DORA-ID** (z.B. `wsl:41900`) oder eine **komplette MODS-URL** "
804
- "pro Zeile ein."
805
- )
806
-
807
- chapters_text = st.text_area("Kapitel-Liste", height=200, help="Liste der IDs oder URLs")
808
-
809
- st.markdown("---")
810
- st.subheader("XML Generierung")
811
-
812
- if st.button("Crossref XML generieren", type="primary"):
813
- try:
814
- chapter_items = []
815
-
816
- for line in chapters_text.splitlines():
817
- line = line.strip()
818
- if not line:
819
- continue
820
- mods_url = build_dora_mods_url(base_url, repo_code, line)
821
- st.write(f"Lade Kapitel-MODS von: {mods_url}")
822
- mods_root = fetch_mods_xml(mods_url)
823
- ci, page_no = mods_to_content_item(mods_root, repo_base_url)
824
- chapter_items.append((ci, page_no))
825
-
826
- if not chapter_items and cr_book_type == "edited_book":
827
- st.warning("Keine Kapitel angegeben! Ein Edited Book sollte normalerweise Kapitel enthalten.")
828
-
829
- # book_meta aus session state / widgets zusammenbauen
830
- book_meta = {
831
- "book_title": st.session_state.book_title,
832
- "series_title": st.session_state.series_title,
833
- "series_issn": st.session_state.series_issn,
834
- "publisher_name": st.session_state.publisher_name,
835
- "pub_year": int(st.session_state.pub_year) if st.session_state.get("pub_year") else 0,
836
- "pub_month": st.session_state.pub_month,
837
- "pub_day": st.session_state.pub_day,
838
- "noisbn_reason": st.session_state.get("noisbn_reason", ""),
839
- "book_doi": st.session_state.book_doi,
840
- "book_resource": st.session_state.book_resource,
841
- "report_number": st.session_state.get("report_number", ""),
842
- "editors": new_persons if cr_book_type not in ["monograph", "report-paper"] else [],
843
- "authors": new_persons if cr_book_type in ["monograph", "report-paper"] else [],
844
- }
845
-
846
- xml_bytes = build_doi_batch_xml(book_meta, depositor_meta, chapter_items, book_type=cr_book_type)
847
-
848
- # Store in session state
849
- st.session_state.crossref_xml = xml_bytes
850
- st.session_state.crossref_filename = "crossref_edited_book.xml"
851
-
852
- st.success("Crossref XML erfolgreich erzeugt!")
853
-
854
- # Validierung gegen Crossref XSD Schema
855
- st.subheader("XML Validierung")
856
- with st.spinner("Validiere XML gegen Crossref Schema..."):
857
- is_valid, validation_errors = validate_crossref_xml(xml_bytes)
858
-
859
- if is_valid:
860
- st.success("✓ XML ist valide und bereit für Crossref!")
861
- else:
862
- st.error("✗ XML Validierung fehlgeschlagen:")
863
- for error in validation_errors:
864
- st.error(f" • {error}")
865
- st.warning("Das XML kann trotzdem heruntergeladen werden, wird aber möglicherweise von Crossref abgelehnt.")
866
-
867
- except Exception as e:
868
- st.error(f"Fehler bei der Erzeugung des XML: {e}")
869
- import traceback
870
- st.text(traceback.format_exc())
871
-
872
- # Display Download and Upload if XML exists in session state
873
- if "crossref_xml" in st.session_state:
874
- xml_bytes = st.session_state.crossref_xml
875
-
876
- # Download Button
877
- st.download_button(
878
- label="XML herunterladen",
879
- data=xml_bytes,
880
- file_name=st.session_state.crossref_filename,
881
- mime="application/xml"
882
- )
883
-
884
- # ---------------------------------------------------------
885
- # Crossref Upload Section
886
- # ---------------------------------------------------------
887
- st.markdown("---")
888
- st.subheader("Automatischer Upload zu Crossref")
889
-
890
- # Determine default role if not in session state
891
- if "cr_role" not in st.session_state:
892
- st.session_state.cr_role = REPO_CONFIG.get(st.session_state.last_repo_code, {}).get("role", "")
893
-
894
- col_u1, col_u2 = st.columns(2)
895
- with col_u1:
896
- cr_user = st.text_input("Crossref Username", value="dora@lib4ri.ch")
897
- # Use key to bind to session state
898
- cr_role = st.text_input("Crossref Role (wslx, empa, eawa, psit)", key="cr_role")
899
- with col_u2:
900
- cr_pass = st.text_input("Crossref Password", type="password")
901
-
902
- if st.button("Upload to Crossref"):
903
- if not cr_user or not cr_pass:
904
- st.error("Bitte Username und Passwort für Crossref angeben.")
905
- else:
906
- with st.spinner("Lade zu Crossref hoch..."):
907
- res = upload_to_crossref(xml_bytes, cr_user, cr_pass, cr_role)
908
-
909
- if isinstance(res, str) and res.startswith("Exception"):
910
- st.error(f"Upload fehlgeschlagen: {res}")
911
- else:
912
- # Crossref returns 200 even on some logic errors, text contains details
913
- if res.status_code == 200:
914
- if "successfully received" in res.text:
915
- st.success("Upload erfolgreich! Crossref hat die Datei empfangen.")
916
- with st.expander("Server-Antwort ansehen"):
917
- st.text(res.text)
918
- else:
919
- st.warning("Upload technisch erfolgreich (HTTP 200), aber Crossref meldet eventuell Fehler.")
920
- with st.expander("Server-Antwort ansehen (Fehleranalyse)"):
921
- st.text(res.text)
922
- else:
923
- st.error(f"HTTP Fehler: {res.status_code}")
924
- st.text(res.text)
925
-
926
-
927
def upload_to_crossref(xml_content, username, password, role=None):
    """Submit a deposit XML document to the Crossref deposit servlet.

    The document is sent as a multipart form POST carrying the form fields
    ``operation``, ``login_id`` and ``login_passwd`` plus the file part
    ``fname``.

    Args:
        xml_content: The deposit XML to upload (used as the file payload).
        username: Crossref account name.
        password: Crossref account password.
        role: Optional Crossref role. When it is non-blank it is appended
            to the username as ``username/role``.

    Returns:
        The ``requests`` response object once the HTTP request completed,
        or a string starting with ``"Exception: "`` if sending it raised.
    """
    # A missing or whitespace-only role leaves the plain username as login.
    stripped_role = role.strip() if role else ""
    login_id = f"{username}/{stripped_role}" if stripped_role else username

    form_fields = {
        'operation': 'doMDUpload',
        'login_id': login_id,
        'login_passwd': password,
    }
    # File part layout: (filename, file content, content type).
    upload_part = {
        'fname': ('crossref_submission.xml', xml_content, 'application/xml'),
    }

    try:
        return requests.post(
            "https://doi.crossref.org/servlet/deposit",
            files=upload_part,
            data=form_fields,
            timeout=60,
        )
    except Exception as exc:
        # Failures are reported as a string instead of being raised, so the
        # return value must be type-checked before it is used as a response.
        return f"Exception: {exc}"
955
-
956
# Script entry point: run the dashboard only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()