andrehoffmann80 commited on
Commit
a57f2bb
·
verified ·
1 Parent(s): 5bc6ac4

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +863 -145
src/streamlit_app.py CHANGED
@@ -1,82 +1,186 @@
1
- import os
 
 
 
2
  from lxml import etree
 
3
 
4
  # =====================================================================
5
- # CONFIGURATION
6
  # =====================================================================
7
 
8
# Directory holding the MODS chapter records that are read as input.
MODS_DIR = "mods_records"
# File name the assembled Crossref XML is written to.
OUTPUT_XML = "crossref.xml"

# Namespaces
JATS_NS = "http://www.ncbi.nlm.nih.gov/JATS1"
XML_NS = "http://www.w3.org/XML/1998/namespace"
# Clark-notation ("{uri}local") name of the xml:lang attribute.
XML_LANG = f"{{{XML_NS}}}lang"

# Prefix -> URI map passed as nsmap when the root <book> element is created.
NSMAP = {
    "jats": JATS_NS,
    "xlink": "http://www.w3.org/1999/xlink",
}
20
 
21
  # =====================================================================
22
- # TEXT CLEANING: Entfernt nur Silbentrennungsartefakte (Option 1)
23
  # =====================================================================
24
 
25
def clean_text(text):
    """Remove soft hyphens and flatten line breaks; change nothing else.

    Falsy input (``None`` or an empty string) yields ``""``. The result has
    leading and trailing whitespace stripped.
    """
    if not text:
        return ""
    # Splitting on U+00AD and re-joining drops every soft hyphen.
    without_soft_hyphens = "".join(text.split("\u00AD"))
    # Each line feed becomes exactly one space.
    on_one_line = " ".join(without_soft_hyphens.split("\n"))
    return on_one_line.strip()
34
 
35
 
36
- # =====================================================================
37
- # XML HELPERS
38
- # =====================================================================
39
-
40
def get_text(node, xpath, ns):
    """Return the cleaned text of the first element matching *xpath*.

    ``node.find`` is called with *ns* as the namespace map. When nothing
    matches, or the match has no text, an empty string is returned.
    """
    match = node.find(xpath, namespaces=ns)
    if match is None or not match.text:
        return ""
    return clean_text(match.text)
44
 
45
 
46
- # =====================================================================
47
- # PARSE A SINGLE MODS FILE INTO <content_item>
48
- # =====================================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- def mods_to_content_item(mods_path):
51
- tree = etree.parse(mods_path)
52
- root = tree.getroot()
53
- ns = root.nsmap
54
-
55
- # --------------------------------------------------------
56
- # Extract metadata
57
- # --------------------------------------------------------
58
- title = get_text(root, ".//mods:titleInfo/mods:title", ns)
59
- doi = get_text(root, ".//mods:identifier[@type='doi']", ns)
60
- year = get_text(root, ".//mods:originInfo/mods:dateIssued", ns)
61
- abstract = get_text(root, ".//mods:abstract", ns)
62
-
63
- first_page = get_text(root, ".//mods:extent[@unit='page']/mods:start", ns)
64
- last_page = get_text(root, ".//mods:extent[@unit='page']/mods:end", ns)
65
-
66
- # --------------------------------------------------------
67
- # Extract authors
68
- # --------------------------------------------------------
69
  authors = []
70
- for name in root.findall(".//mods:name[@type='personal']", ns):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  role = name.find("mods:role/mods:roleTerm", ns)
72
  if role is not None and role.text == "author":
73
  given = get_text(name, "mods:namePart[@type='given']", ns)
74
  family = get_text(name, "mods:namePart[@type='family']", ns)
75
  authors.append((given, family))
76
 
77
- # --------------------------------------------------------
78
- # Build <content_item>
79
- # --------------------------------------------------------
80
  ci = etree.Element("content_item", component_type="chapter")
81
 
82
  # Contributors
@@ -91,7 +195,7 @@ def mods_to_content_item(mods_path):
91
  etree.SubElement(pn, "given_name").text = given
92
  etree.SubElement(pn, "surname").text = family
93
 
94
- # Titles
95
  titles = etree.SubElement(ci, "titles")
96
  etree.SubElement(titles, "title").text = title
97
 
@@ -100,11 +204,12 @@ def mods_to_content_item(mods_path):
100
  p = etree.SubElement(jats_abs, f"{{{JATS_NS}}}p")
101
  p.text = abstract
102
 
103
- # Publication date
104
  pub = etree.SubElement(ci, "publication_date", media_type="online")
105
- etree.SubElement(pub, "year").text = year
 
106
 
107
- # Pages
108
  if first_page or last_page:
109
  pages = etree.SubElement(ci, "pages")
110
  if first_page:
@@ -112,128 +217,741 @@ def mods_to_content_item(mods_path):
112
  if last_page:
113
  etree.SubElement(pages, "last_page").text = last_page
114
 
115
- # DOI block
 
 
 
 
 
 
 
116
  if doi:
117
  doi_data = etree.SubElement(ci, "doi_data")
118
  etree.SubElement(doi_data, "doi").text = doi
119
-
120
- doi_tail = doi.split(":")[-1]
 
 
121
  etree.SubElement(
122
  doi_data,
123
  "resource"
124
- ).text = f"https://www.dora.lib4ri.ch/wsl/islandora/object/{doi_tail}"
125
 
126
- # Sorting helper: use first_page numeric value if available
127
  try:
128
  page_number = int(first_page)
129
- except:
130
  page_number = 999999
131
 
132
  return ci, page_number
133
 
134
 
135
- # =====================================================================
136
- # MAIN: Assemble full Crossref XML
137
- # =====================================================================
138
-
139
def assemble_crossref(mods_dir, output_path):
    """Assemble the Crossref <book> document and write it to *output_path*.

    The fixed book-level metadata is appended first. Every ``.xml`` file in
    *mods_dir* is then converted with ``mods_to_content_item`` and the
    resulting chapters are appended in order of their first-page number.
    """
    root_book = etree.Element("book", book_type="edited_book", nsmap=NSMAP)

    # Fixed book metadata (series, editors, title, date, publisher, DOI).
    fixed_metadata = etree.XML("""
    <book_series_metadata>
        <series_metadata>
            <titles><title>WSL Berichte</title></titles>
            <issn>22963456</issn>
        </series_metadata>

        <contributors>
            <person_name sequence="first" contributor_role="editor">
                <given_name>Alexander</given_name>
                <surname>Bast</surname>
            </person_name>
            <person_name sequence="additional" contributor_role="editor">
                <given_name>Michael</given_name>
                <surname>Bründl</surname>
            </person_name>
            <person_name sequence="additional" contributor_role="editor">
                <given_name>Marcia</given_name>
                <surname>Phillips</surname>
            </person_name>
        </contributors>

        <titles>
            <title>WSL research programme Climate Change Impacts on Alpine Mass Movements - CCAMM project report</title>
        </titles>

        <publication_date media_type="online">
            <month>12</month>
            <day>08</day>
            <year>2025</year>
        </publication_date>

        <noisbn reason="archive_volume"/>

        <publisher>
            <publisher_name>Swiss Federal Institute for Forest, Snow and Landscape Research, WSL</publisher_name>
        </publisher>

        <doi_data>
            <doi>10.55419/wsl:41891</doi>
            <resource>https://www.dora.lib4ri.ch/wsl/islandora/object/wsl:41891</resource>
        </doi_data>
    </book_series_metadata>
    """, parser=etree.XMLParser(remove_blank_text=True))
    root_book.append(fixed_metadata)

    # Convert every MODS file; remember the page number used for ordering.
    collected = []
    for entry in sorted(os.listdir(mods_dir)):
        if not entry.lower().endswith(".xml"):
            continue
        mods_path = os.path.join(mods_dir, entry)
        print(f"Processing MODS file: {mods_path}")
        item, first_page_no = mods_to_content_item(mods_path)
        collected.append((first_page_no, item))

    # Stable sort: chapters with equal page numbers keep file-name order.
    for _, item in sorted(collected, key=lambda pair: pair[0]):
        root_book.append(item)

    serialized = etree.tostring(
        root_book,
        pretty_print=True,
        encoding="UTF-8",
        xml_declaration=True,
    )
    with open(output_path, "wb") as out_file:
        out_file.write(serialized)

    print("Crossref XML successfully written to:", output_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
 
233
  # =====================================================================
234
- # RUN SCRIPT
235
  # =====================================================================
236
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
# Script entry point: build the Crossref file from the configured MODS
# directory and report completion on stdout.
if __name__ == "__main__":
    assemble_crossref(MODS_DIR, OUTPUT_XML)
    print("DONE.")
 
1
+ import datetime
2
+ from urllib.parse import quote
3
+
4
+ import requests
5
  from lxml import etree
6
+ import streamlit as st
7
 
8
  # =====================================================================
9
+ # Namespaces
10
  # =====================================================================
11
 
12
# Crossref deposit schema 4.4.2; declared as the default namespace of <doi_batch>.
CROSSREF_NS = "http://www.crossref.org/schema/4.4.2"
# XML Schema instance namespace, needed for the xsi:schemaLocation attribute.
XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"
# JATS namespace used for abstract markup inside content items.
JATS_NS = "http://www.ncbi.nlm.nih.gov/JATS1"
XML_NS = "http://www.w3.org/XML/1998/namespace"
# Crossref AccessIndicators namespace (license_ref elements).
AI_NS = "http://www.crossref.org/AccessIndicators.xsd"
# MODS v3 namespace; bound to the "mods" prefix when a record does not declare it.
MODS_NS = "http://www.loc.gov/mods/v3"
# Clark-notation ("{uri}local") name of the xml:lang attribute.
XML_LANG = f"{{{XML_NS}}}lang"
19
 
 
 
 
 
20
 
21
  # =====================================================================
22
+ # Hilfsfunktionen
23
  # =====================================================================
24
 
25
def clean_text(text: str) -> str:
    """Remove soft hyphens and flatten line breaks; leave everything else as is.

    Falsy input (``None`` or an empty string) yields ``""``. The result has
    leading and trailing whitespace stripped.
    """
    if text:
        # One pass: U+00AD (soft hyphen) is deleted, each line feed becomes
        # a single space.
        return text.translate({0x00AD: None, 0x000A: " "}).strip()
    return ""
34
 
35
 
 
 
 
 
36
def get_text(node, xpath, ns):
    """Return the cleaned text of the first element matching *xpath*.

    ``node.find`` is called with *ns* as the namespace map. When nothing
    matches, or the match has no text, an empty string is returned.
    """
    found = node.find(xpath, namespaces=ns)
    if found is None:
        return ""
    raw = found.text
    return clean_text(raw) if raw else ""
39
 
40
 
41
def build_dora_mods_url(base_url: str, repo_code: str, object_or_url: str) -> str:
    """Return the MODS download URL for a DORA object.

    Args:
        base_url: Root of the DORA host; a trailing slash is ignored.
        repo_code: Repository path segment, e.g. ``wsl``.
        object_or_url: A DORA ID such as ``wsl:41900``, or a complete
            http(s) URL.

    Returns:
        The given URL (whitespace-trimmed) when *object_or_url* already is an
        http(s) URL, otherwise
        ``{base_url}/{repo_code}/islandora/object/{id}/datastream/MODS/download``
        with the ID fully percent-encoded.
    """
    # The value is typed or pasted by a user. Surrounding whitespace would
    # otherwise be percent-encoded into the path or defeat the URL check.
    target = object_or_url.strip()
    if target.startswith(("http://", "https://")):
        return target
    encoded = quote(target, safe="")
    root = base_url.strip().rstrip("/")
    return f"{root}/{repo_code}/islandora/object/{encoded}/datastream/MODS/download"
52
+
53
+
54
def build_persistent_url(repo_code: str, object_id: str) -> str:
    """Return the persistent item URL ``https://www.dora.lib4ri.ch/{repo}/item/{id}``.

    The public host is always used, regardless of which host the metadata
    was downloaded from.
    """
    return f"https://www.dora.lib4ri.ch/{repo_code}/item/{object_id}"
62
+
63
+
64
def fetch_mods_xml(mods_url: str, timeout: float = 30) -> etree._Element:
    """Download a MODS record and return its parsed root element.

    Args:
        mods_url: URL of the MODS datastream.
        timeout: Seconds passed to ``requests.get``. Without it a stalled
            server would block the request indefinitely.

    Returns:
        The root element of the parsed document.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
        ValueError: If the response could not be parsed into any element.
    """
    resp = requests.get(mods_url, timeout=timeout)
    resp.raise_for_status()
    # recover=True tolerates malformed XML (e.g. unescaped HTML in notes).
    parser = etree.XMLParser(recover=True, remove_blank_text=True)
    root = etree.fromstring(resp.content, parser=parser)
    if root is None:
        # A recovering parser can return no element at all for unusable
        # input; fail here instead of with an AttributeError in a caller.
        raise ValueError(f"No parsable XML returned by {mods_url}")
    return root
71
+
72
+
73
def parse_book_mods(book_root: etree._Element, repo_base_url: str) -> dict:
    """Extract book-level metadata from a book MODS record.

    Args:
        book_root: Root element of the MODS document.
        repo_base_url: Base URL ending in the repository code; its last path
            segment is the fallback repository code when the record ID has
            no ``repo:`` prefix.

    Returns:
        A dict with the keys ``book_title``, ``series_title``,
        ``series_issn``, ``publisher_name``, ``pub_year`` (int),
        ``pub_month`` and ``pub_day`` (strings, today's date),
        ``noisbn_reason``, ``book_doi``, ``book_resource``,
        ``report_number`` (always ``""`` here), ``editors`` and ``authors``
        (lists of ``{"given", "family"}`` dicts).
    """
    ns = book_root.nsmap.copy()
    if "mods" not in ns:
        ns["mods"] = MODS_NS

    book_title = get_text(book_root, ".//mods:titleInfo/mods:title", ns)

    # Series (if present)
    series_title = get_text(
        book_root,
        ".//mods:relatedItem[@type='series']/mods:titleInfo/mods:title",
        ns,
    )
    series_issn = get_text(
        book_root,
        ".//mods:relatedItem[@type='series']/mods:identifier[@type='issn']",
        ns,
    )

    # Editors, plus authors at book level (used for monographs).
    people = {"editor": [], "author": []}
    for name in book_root.findall(".//mods:name[@type='personal']", ns):
        role = name.find("mods:role/mods:roleTerm", ns)
        if role is None:
            continue
        # An empty <roleTerm/> has text None; treat it as "no known role".
        role_text = (role.text or "").strip().lower()
        if role_text in people:
            people[role_text].append({
                "given": get_text(name, "mods:namePart[@type='given']", ns),
                "family": get_text(name, "mods:namePart[@type='family']", ns),
            })

    publisher_name = get_text(book_root, ".//mods:originInfo/mods:publisher", ns)

    # Publication year: prefer the w3cdtf key date, else any dateIssued.
    pub_year = get_text(
        book_root,
        ".//mods:originInfo/mods:dateIssued[@encoding='w3cdtf'][@keyDate='yes']",
        ns,
    )
    if not pub_year:
        pub_year = get_text(book_root, ".//mods:originInfo/mods:dateIssued", ns)

    # Missing date parts default to the current date.
    today = datetime.date.today()
    try:
        pub_year_value = int(pub_year[:4]) if pub_year else today.year
    except ValueError:
        # Free-text dates such as "[n.d.]" must not abort the whole import.
        pub_year_value = today.year

    book_doi = get_text(book_root, ".//mods:identifier[@type='doi']", ns)

    # Persistent URL, e.g. https://www.dora.lib4ri.ch/psi/item/psi:84778
    book_id = get_text(book_root, ".//mods:identifier[@type='local']", ns)
    if not book_id and book_doi:
        # Fall back to the DOI suffix, which carries the object ID.
        book_id = book_doi.split("/")[-1]

    if ":" in book_id:
        # Repository code is the ID prefix ('psi' in 'psi:84778').
        current_repo = book_id.split(":")[0]
    else:
        # rstrip so a trailing slash does not produce an empty repo code.
        current_repo = repo_base_url.rstrip("/").split("/")[-1]
    book_resource = build_persistent_url(current_repo, book_id) if book_id else ""

    # Without an ISBN a <noisbn> reason is recorded instead.
    isbn_val = get_text(book_root, ".//mods:identifier[@type='isbn']", ns)
    noisbn_reason = "" if isbn_val else "archive_volume"

    return {
        "book_title": book_title,
        "series_title": series_title or "",
        "series_issn": series_issn or "",
        "publisher_name": publisher_name,
        "pub_year": pub_year_value,
        "pub_month": str(today.month),
        "pub_day": str(today.day),
        "noisbn_reason": noisbn_reason,
        "book_doi": book_doi or "",
        "book_resource": book_resource or "",
        "report_number": "",
        "editors": people["editor"],
        "authors": people["author"],
    }
160
+
161
+
162
+ def mods_to_content_item(mods_root: etree._Element, repo_base_url: str) -> tuple[etree._Element, int]:
163
+ """Wandelt ein Kapitel-MODS in ein Crossref <content_item> um."""
164
+ ns = mods_root.nsmap.copy()
165
+ if "mods" not in ns:
166
+ ns["mods"] = MODS_NS
167
+
168
+ title = get_text(mods_root, ".//mods:titleInfo/mods:title", ns)
169
+ doi = get_text(mods_root, ".//mods:identifier[@type='doi']", ns)
170
+ year = get_text(mods_root, ".//mods:originInfo/mods:dateIssued", ns)
171
+ abstract = get_text(mods_root, ".//mods:abstract", ns)
172
+ first_page = get_text(mods_root, ".//mods:extent[@unit='page']/mods:start", ns)
173
+ last_page = get_text(mods_root, ".//mods:extent[@unit='page']/mods:end", ns)
174
+
175
+ # Autoren
176
+ authors = []
177
+ for name in mods_root.findall(".//mods:name[@type='personal']", ns):
178
  role = name.find("mods:role/mods:roleTerm", ns)
179
  if role is not None and role.text == "author":
180
  given = get_text(name, "mods:namePart[@type='given']", ns)
181
  family = get_text(name, "mods:namePart[@type='family']", ns)
182
  authors.append((given, family))
183
 
 
 
 
184
  ci = etree.Element("content_item", component_type="chapter")
185
 
186
  # Contributors
 
195
  etree.SubElement(pn, "given_name").text = given
196
  etree.SubElement(pn, "surname").text = family
197
 
198
+ # Titel
199
  titles = etree.SubElement(ci, "titles")
200
  etree.SubElement(titles, "title").text = title
201
 
 
204
  p = etree.SubElement(jats_abs, f"{{{JATS_NS}}}p")
205
  p.text = abstract
206
 
207
+ # Publikationsdatum
208
  pub = etree.SubElement(ci, "publication_date", media_type="online")
209
+ if year:
210
+ etree.SubElement(pub, "year").text = year[:4]
211
 
212
+ # Seiten
213
  if first_page or last_page:
214
  pages = etree.SubElement(ci, "pages")
215
  if first_page:
 
217
  if last_page:
218
  etree.SubElement(pages, "last_page").text = last_page
219
 
220
+ # License information (AccessIndicators) - must come before doi_data
221
+ ai_program = etree.SubElement(ci, f"{{{AI_NS}}}program", name="AccessIndicators")
222
+ license_ref = etree.SubElement(ai_program, f"{{{AI_NS}}}license_ref")
223
+ license_ref.text = "https://creativecommons.org/licenses/by/4.0/"
224
+ license_ref.set("applies_to", "vor")
225
+ license_ref.set("start_date", year[:4] + "-01-01" if year else "")
226
+
227
+ # DOI
228
  if doi:
229
  doi_data = etree.SubElement(ci, "doi_data")
230
  etree.SubElement(doi_data, "doi").text = doi
231
+
232
+ # New persistent URL format
233
+ chapter_id = doi.split("/")[-1] if "/" in doi else doi
234
+ repo_code_extracted = chapter_id.split(":")[0] if ":" in chapter_id else repo_base_url.split("/")[-1]
235
  etree.SubElement(
236
  doi_data,
237
  "resource"
238
+ ).text = build_persistent_url(repo_code_extracted, chapter_id)
239
 
240
+ # Sortierung nach first_page
241
  try:
242
  page_number = int(first_page)
243
+ except Exception:
244
  page_number = 999999
245
 
246
  return ci, page_number
247
 
248
 
249
def _add_series_metadata(parent, book_meta: dict) -> None:
    """Append <series_metadata> holding whichever of title and ISSN is set."""
    series_metadata = etree.SubElement(parent, "series_metadata")
    if book_meta.get("series_title"):
        stitles = etree.SubElement(series_metadata, "titles")
        etree.SubElement(stitles, "title").text = book_meta["series_title"]
    if book_meta.get("series_issn"):
        etree.SubElement(series_metadata, "issn").text = book_meta["series_issn"]


def _add_date_part(pub, tag: str, value) -> None:
    """Append a zero-padded <month>/<day>; skip blank or non-numeric values."""
    if not value:
        return
    # str() first: the value may be an int rather than text.
    text = str(value).strip()
    if not text:
        return
    try:
        padded = f"{int(text):02d}"
    except ValueError:
        return
    etree.SubElement(pub, tag).text = padded


def build_doi_batch_xml(
    book_meta: dict,
    depositor_meta: dict,
    chapter_items: list[tuple[etree._Element, int]],
    book_type: str = "edited_book",
) -> bytes:
    """Build the Crossref ``<doi_batch>`` document and return it serialized.

    Args:
        book_meta: Book-level metadata. Reads ``book_title``, ``pub_year``,
            ``publisher_name`` and, optionally, the series, date, noisbn,
            report-number, DOI and contributor entries.
        depositor_meta: Needs ``doi_batch_id``, ``depositor_name``,
            ``depositor_email`` and ``registrant``.
        chapter_items: ``(content_item, page_number)`` pairs. They are
            appended in ascending page order; the list itself is not modified.
        book_type: ``'edited_book'``, ``'monograph'``, or the internal flag
            ``'report-paper'``, which selects the report structure.

    Returns:
        UTF-8 encoded, pretty-printed XML including the XML declaration.
    """
    doi_batch = etree.Element(
        "doi_batch",
        nsmap={
            None: CROSSREF_NS,
            "xsi": XSI_NS,
            "jats": JATS_NS,
            "ai": AI_NS,
        },
    )
    doi_batch.set("version", "4.4.2")
    doi_batch.set(
        f"{{{XSI_NS}}}schemaLocation",
        "http://www.crossref.org/schema/4.4.2 "
        "http://www.crossref.org/schema/deposit/crossref4.4.2.xsd",
    )

    # HEAD
    head = etree.SubElement(doi_batch, "head")
    etree.SubElement(head, "doi_batch_id").text = depositor_meta["doi_batch_id"]

    # Timezone-aware replacement for the deprecated datetime.utcnow().
    ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
    etree.SubElement(head, "timestamp").text = ts

    depositor = etree.SubElement(head, "depositor")
    etree.SubElement(depositor, "depositor_name").text = depositor_meta["depositor_name"]
    etree.SubElement(depositor, "email_address").text = depositor_meta["depositor_email"]

    etree.SubElement(head, "registrant").text = depositor_meta["registrant"]

    # BODY
    body = etree.SubElement(doi_batch, "body")

    is_report = book_type == "report-paper"
    has_series = bool(book_meta.get("series_title") or book_meta.get("series_issn"))

    # The *_series_metadata variant is used only when series data exists.
    if is_report:
        container = etree.SubElement(body, "report-paper")
        metadata_tag = "report-paper_series_metadata" if has_series else "report-paper_metadata"
    else:
        container = etree.SubElement(body, "book", book_type=book_type)
        metadata_tag = "book_series_metadata" if has_series else "book_metadata"
    metadata_root = etree.SubElement(container, metadata_tag)

    # 1. SERIES METADATA
    if has_series:
        _add_series_metadata(metadata_root, book_meta)

    # 2. CONTRIBUTORS: authors for monographs and reports, editors otherwise.
    if book_type in ("monograph", "report-paper"):
        contributors_list = book_meta.get("authors", [])
        role = "author"
    else:
        contributors_list = book_meta.get("editors", [])
        role = "editor"

    if contributors_list:
        contribs = etree.SubElement(metadata_root, "contributors")
        for idx, person in enumerate(contributors_list):
            pn = etree.SubElement(
                contribs,
                "person_name",
                sequence="first" if idx == 0 else "additional",
                contributor_role=role,
            )
            # NOTE(review): the Crossref schema is understood to make
            # given_name optional but non-empty, so an empty one is omitted.
            if person["given"]:
                etree.SubElement(pn, "given_name").text = person["given"]
            etree.SubElement(pn, "surname").text = person["family"]

    # 3. TITLES
    titles = etree.SubElement(metadata_root, "titles")
    etree.SubElement(titles, "title").text = book_meta["book_title"]

    # 4. PUBLICATION DATE (month, day, then year)
    pub = etree.SubElement(metadata_root, "publication_date", media_type="online")
    _add_date_part(pub, "month", book_meta.get("pub_month"))
    _add_date_part(pub, "day", book_meta.get("pub_day"))
    etree.SubElement(pub, "year").text = str(book_meta["pub_year"])

    # 5. NOISBN (only for books)
    if not is_report and book_meta.get("noisbn_reason"):
        etree.SubElement(metadata_root, "noisbn", reason=book_meta["noisbn_reason"])

    # 6. PUBLISHER
    pub_node = etree.SubElement(metadata_root, "publisher")
    etree.SubElement(pub_node, "publisher_name").text = book_meta["publisher_name"]

    # 7. PUBLISHER ITEM (report number), only for report-paper
    if is_report and book_meta.get("report_number"):
        publisher_item = etree.SubElement(metadata_root, "publisher_item")
        etree.SubElement(
            publisher_item, "identifier", id_type="report-number"
        ).text = book_meta["report_number"]

    # 8. DOI DATA
    if book_meta.get("book_doi") or book_meta.get("book_resource"):
        doi_data = etree.SubElement(metadata_root, "doi_data")
        if book_meta.get("book_doi"):
            etree.SubElement(doi_data, "doi").text = book_meta["book_doi"]
        if book_meta.get("book_resource"):
            etree.SubElement(doi_data, "resource").text = book_meta["book_resource"]

    # 9. COMPONENTS (chapters): children of <book> or <report-paper>.
    # sorted() leaves the caller's list untouched.
    for ci, _page in sorted(chapter_items, key=lambda pair: pair[1]):
        container.append(ci)

    return etree.tostring(
        doi_batch,
        pretty_print=True,
        encoding="UTF-8",
        xml_declaration=True,
    )
408
+
409
+
410
class CrossrefSchemaResolver(etree.Resolver):
    """Resolver that downloads XSD files included by the Crossref schema."""

    # Bare MathML file names mapped to their W3C location. Built once instead
    # of on every resolve() call.
    _SCHEMA_MAP = {
        'mathml3-content.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-content.xsd',
        'mathml3-presentation.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-presentation.xsd',
        'mathml3-strict-content.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-strict-content.xsd',
        'mathml3-common.xsd': 'http://www.w3.org/Math/XMLSchema/mathml3/mathml3-common.xsd',
    }

    def resolve(self, url, id, context):
        """Fetch the schema behind *url* and hand its content to the parser.

        Absolute http(s) URLs are fetched as given. Known MathML file names
        go through ``_SCHEMA_MAP``. Any other name is looked up under
        ``https://www.crossref.org/schemas/``.

        Returns:
            The resolved document, or ``None`` (default lxml handling) when
            there is no URL or the download fails.
        """
        # `id` is kept although it shadows the builtin: the parameter name is
        # part of the overridden signature.
        if not url:
            # Nothing to fetch; let lxml fall back to its default behaviour.
            return None

        if url.startswith(("http://", "https://")):
            schema_url = url
        elif url in self._SCHEMA_MAP:
            schema_url = self._SCHEMA_MAP[url]
        else:
            schema_url = f"https://www.crossref.org/schemas/{url}"

        try:
            response = requests.get(schema_url, timeout=15)
            response.raise_for_status()
        except requests.RequestException:
            # Best effort: only network/HTTP failures are swallowed, so
            # programming errors still surface.
            return None
        return self.resolve_string(response.content, context)
437
+
438
+
439
def validate_crossref_xml(xml_bytes: bytes) -> tuple[bool, list[str]]:
    """Validate Crossref XML against the Crossref 4.4.2 XSD.

    The compiled schema is downloaded on first use and kept in
    ``st.session_state`` under ``crossref_schema``.

    Returns:
        tuple: ``(is_valid, error_messages)``. All failures, including schema
        download problems and XML syntax errors, are reported through the
        message list; no exception is raised.
    """
    messages = []

    try:
        document = etree.fromstring(xml_bytes)
        xsd_location = "https://www.crossref.org/schemas/crossref4.4.2.xsd"

        if 'crossref_schema' not in st.session_state:
            try:
                # The resolver lets the parser fetch schemas the XSD includes.
                xsd_parser = etree.XMLParser()
                xsd_parser.resolvers.add(CrossrefSchemaResolver())

                xsd_response = requests.get(xsd_location, timeout=30)
                xsd_response.raise_for_status()

                xsd_tree = etree.fromstring(xsd_response.content, xsd_parser)
                st.session_state.crossref_schema = etree.XMLSchema(xsd_tree)
            except Exception as e:
                messages.append(f"Fehler beim Laden des XSD Schemas: {e}")
                return False, messages

        compiled = st.session_state.crossref_schema
        outcome = compiled.validate(document)
        if not outcome:
            messages.extend(
                f"Zeile {entry.line}: {entry.message}"
                for entry in compiled.error_log
            )
        return outcome, messages

    except etree.XMLSyntaxError as e:
        messages.append(f"XML Syntax Fehler: {e}")
    except Exception as e:
        messages.append(f"Unerwarteter Fehler: {e}")
    return False, messages
490
 
491
 
492
  # =====================================================================
493
+ # REPOSITORY CONFIGURATION
494
  # =====================================================================
495
 
496
# One row per repository: (code, institution name, DOI prefix, role).
# The institution name serves as both publisher and registrant.
_REPO_ROWS = (
    ("wsl", "Swiss Federal Institute for Forest, Snow and Landscape Research, WSL", "10.55419", "wslx"),
    ("psi", "Paul Scherrer Institute, PSI", "10.55402", "psit"),
    ("empa", "Swiss Federal Laboratories for Materials Science and Technology, Empa", "10.55368", "empa"),
    ("eawag", "Swiss Federal Institute of Aquatic Science and Technology, Eawag", "10.55408", "eawa"),
)

# Per-repository settings keyed by repository code, in the order listed above.
REPO_CONFIG = {
    code: {
        "publisher": institution,
        "registrant": institution,
        "prefix": doi_prefix,
        "role": role,
    }
    for code, institution, doi_prefix, role in _REPO_ROWS
}
522
+
523
+ def main():
524
+ st.title("Crossref XML Generator/Uploader")
525
+
526
+ st.markdown(
527
+ "Dieses Dashboard lädt **MODS-Metadaten direkt aus DORA** mittels IDs "
528
+ "und erzeugt ein vollständiges Crossref-XML (`doi_batch`) für Reports (WSL Berichte und PSI Berichte) und Edited Books/Conference Proceedings."
529
+ )
530
+
531
+ st.subheader("Konfiguration & Quelle")
532
+
533
+ col_config, col_source = st.columns(2)
534
+
535
+ with col_config:
536
+ st.markdown("#### Verbindung & Typ")
537
+ base_url = st.text_input(
538
+ "DORA Basis-URL",
539
+ value="https://admin.dora.lib4ri.ch"
540
+ )
541
+
542
+ repo_list = list(REPO_CONFIG.keys())
543
+ repo_code = st.selectbox(
544
+ "Repository-Code",
545
+ options=repo_list,
546
+ index=0,
547
+ format_func=lambda x: x.upper()
548
+ )
549
+
550
+ repo_config = REPO_CONFIG[repo_code]
551
+ repo_base_url = f"{base_url.rstrip('/')}/{repo_code}"
552
+
553
+ pub_type = st.radio(
554
+ "Publikationstyp",
555
+ ("Edited Book", "Report (WSL, Monograph Series)", "Report (Eawag, PSI, Paper Series)"),
556
+ horizontal=False
557
+ )
558
+
559
+ # Mapping auf Crossref book_type / report type
560
+ cr_book_type = "edited_book"
561
+ if "Monograph" in pub_type:
562
+ cr_book_type = "monograph"
563
+ elif "Paper Series" in pub_type:
564
+ cr_book_type = "report-paper"
565
+
566
+ with col_source:
567
+ st.markdown("#### MODS-Quelle")
568
+ # Dynamic default ID based on repo
569
+ default_id = "41891"
570
+ if repo_code == "psi":
571
+ default_id = "84057"
572
+
573
+ book_id_or_url = st.text_input(
574
+ "DORA-ID oder MODS-URL",
575
+ value=f"{repo_code}:{default_id}",
576
+ help="Beispiel: wsl:41900 oder komplette URL"
577
+ )
578
+
579
+ st.write("") # Spacer
580
+ if st.button("Metadaten laden", type="primary"):
581
+ try:
582
+ mods_url = build_dora_mods_url(base_url, repo_code, book_id_or_url)
583
+ st.info(f"Lade MODS von: {mods_url}")
584
+ book_root = fetch_mods_xml(mods_url)
585
+ meta = parse_book_mods(book_root, repo_base_url)
586
+
587
+ # --- Attempt to extract report number from MODS ---
588
+ ns = book_root.nsmap.copy()
589
+ if "mods" not in ns:
590
+ ns["mods"] = MODS_NS
591
+ report_num = get_text(book_root, ".//mods:identifier[@type='report number']", ns)
592
+ if not report_num:
593
+ report_num = get_text(book_root, ".//mods:identifier[@type='report-number']", ns)
594
+
595
+ if not report_num:
596
+ # Check <note type="report number">
597
+ report_num = get_text(book_root, ".//mods:note[@type='report number']", ns)
598
+
599
+ if report_num:
600
+ meta["report_number"] = report_num
601
+ st.info(f"Report Number gefunden: {report_num}")
602
+ # --------------------------------------------------
603
+
604
+ # Update flat fields in session state for widgets
605
+ for k, v in meta.items():
606
+ if k in ["book_title", "series_title", "series_issn", "publisher_name",
607
+ "pub_year", "pub_month", "pub_day", "noisbn_reason",
608
+ "book_doi", "book_resource", "report_number"]:
609
+ st.session_state[k] = v
610
+ st.session_state.book_meta[k] = v
611
+
612
+ # Special handling for persons text area
613
+ if cr_book_type in ["monograph", "report-paper"]:
614
+ current_list = meta.get("authors", [])
615
+ else:
616
+ current_list = meta.get("editors", [])
617
+ st.session_state["persons_input"] = "\n".join(f"{e['given']};{e['family']}" for e in current_list)
618
+
619
+ st.session_state.book_meta_loaded = True
620
+ st.success("Metadaten erfolgreich geladen.")
621
+ st.rerun()
622
+ except Exception as e:
623
+ st.error(f"Fehler beim Laden der MODS: {e}")
624
+ import traceback
625
+ st.text(traceback.format_exc())
626
+
627
+ # Session State Init Logic (unchanged but placed after UI definition for clarity in reading flow, strictly it runs before inputs generally)
628
+ if "book_meta_loaded" not in st.session_state:
629
+ st.session_state.book_meta_loaded = False
630
+
631
+ # Current date for defaults
632
+ today = datetime.date.today()
633
+
634
+ # Initialize session state keys for widgets if not present
635
+ if "book_title" not in st.session_state:
636
+ st.session_state.book_title = ""
637
+ if "series_title" not in st.session_state:
638
+ st.session_state.series_title = ""
639
+ if "series_issn" not in st.session_state:
640
+ st.session_state.series_issn = ""
641
+ if "publisher_name" not in st.session_state:
642
+ st.session_state.publisher_name = repo_config["publisher"]
643
+ if "pub_year" not in st.session_state:
644
+ st.session_state.pub_year = today.year
645
+ if "pub_month" not in st.session_state:
646
+ st.session_state.pub_month = str(today.month)
647
+ if "pub_day" not in st.session_state:
648
+ st.session_state.pub_day = str(today.day)
649
+ if "noisbn_reason" not in st.session_state:
650
+ st.session_state.noisbn_reason = ""
651
+ if "book_doi" not in st.session_state:
652
+ st.session_state.book_doi = ""
653
+ if "book_resource" not in st.session_state:
654
+ st.session_state.book_resource = ""
655
+ if "report_number" not in st.session_state:
656
+ st.session_state.report_number = ""
657
+ if "persons_input" not in st.session_state:
658
+ st.session_state.persons_input = ""
659
+
660
+ if "book_meta" not in st.session_state:
661
+ st.session_state.book_meta = {
662
+ "book_title": "",
663
+ "series_title": "",
664
+ "series_issn": "",
665
+ "publisher_name": repo_config["publisher"],
666
+ "pub_year": today.year,
667
+ "pub_month": str(today.month),
668
+ "pub_day": str(today.day),
669
+ "noisbn_reason": "",
670
+ "book_doi": "",
671
+ "book_resource": "",
672
+ "report_number": "",
673
+ "editors": [],
674
+ "authors": [],
675
+ }
676
+
677
+ # CHECK: has the repo code changed since last run?
678
+ if "last_repo_code" not in st.session_state:
679
+ st.session_state.last_repo_code = repo_code
680
+ st.session_state.registrant = repo_config["registrant"]
681
+ st.session_state.cr_role = repo_config.get("role", "")
682
+
683
+ if st.session_state.last_repo_code != repo_code:
684
+ # Repo changed! Update defaults
685
+ st.session_state.publisher_name = repo_config["publisher"]
686
+ st.session_state.book_meta["publisher_name"] = repo_config["publisher"]
687
+ st.session_state.registrant = repo_config["registrant"]
688
+
689
+ # If the user hasn't typed anything yet or if we force update?
690
+ # Let's force update the role in session state so the input widget picks it up
691
+ st.session_state.cr_role = repo_config.get("role", "")
692
+
693
+ st.session_state.last_repo_code = repo_code
694
+
695
+ st.markdown("---")
696
+ st.subheader("Metadaten & Inhalte")
697
+
698
+ # Use expander for metadata editing to keep UI clean
699
+ with st.expander("Metadaten bearbeiten", expanded=True):
700
+ bm = st.session_state.book_meta
701
+
702
+ col_b1, col_b2 = st.columns(2)
703
+ with col_b1:
704
+ st.text_input("Titel", key="book_title")
705
+ st.text_input("Serientitel", key="series_title")
706
+ st.text_input("Serien-ISSN", key="series_issn")
707
+ st.text_input("Publisher Name", key="publisher_name")
708
+
709
+ if cr_book_type == "report-paper":
710
+ st.text_input("Report Number", key="report_number")
711
+
712
+ with col_b2:
713
+ c_y, c_m, c_d = st.columns(3)
714
+ with c_y:
715
+ st.number_input("Jahr", min_value=1900, max_value=2100, key="pub_year")
716
+ with c_m:
717
+ st.text_input("Monat", key="pub_month")
718
+ with c_d:
719
+ st.text_input("Tag", key="pub_day")
720
+
721
+ if cr_book_type != "report-paper":
722
+ st.text_input("noisbn reason", key="noisbn_reason")
723
+
724
+ st.markdown("##### Identifikatoren")
725
+ col_id1, col_id2 = st.columns(2)
726
+ with col_id1:
727
+ st.text_input("DOI", key="book_doi")
728
+ with col_id2:
729
+ st.text_input("Resource URL", key="book_resource")
730
+
731
+ st.caption(f"Basis DOI Prefix: {repo_config['prefix']}")
732
+
733
+ st.markdown("##### Mitwirkende")
734
+ # Decide label based on type
735
+ if cr_book_type in ["monograph", "report-paper"]:
736
+ st.info("Bitte **Autoren** eintragen (Vorname;Nachname).")
737
+ label = "Autoren"
738
+ else:
739
+ st.info("Bitte **Editoren** eintragen (Vorname;Nachname).")
740
+ label = "Editoren"
741
+
742
+ persons_text = st.text_area(label, key="persons_input", height=100)
743
+
744
+ # Parse and save back
745
+ new_persons = []
746
+ for line in persons_text.splitlines():
747
+ line = line.strip()
748
+ if not line:
749
+ continue
750
+ parts = [p.strip() for p in line.split(";")]
751
+ if len(parts) == 2:
752
+ new_persons.append({"given": parts[0], "family": parts[1]})
753
+
754
+ if cr_book_type in ["monograph", "report-paper"]:
755
+ bm["authors"] = new_persons
756
+ else:
757
+ bm["editors"] = new_persons
758
+
759
+ st.markdown("---")
760
+ st.subheader("Depositor & Batch Info")
761
+
762
+ with st.expander("Depositor Details", expanded=False):
763
+ col_d1, col_d2 = st.columns(2)
764
+ with col_d1:
765
+ depositor_name = st.text_input(
766
+ "Depositor Name",
767
+ value="Lib4RI - Library for the Research Institutes within the ETH Domain: Eawag, Empa, PSI & WSL"
768
+ )
769
+ with col_d2:
770
+ depositor_email = st.text_input("Depositor Email", value="dora@lib4ri.ch")
771
+
772
+ ts = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
773
+
774
+ batch_prefix = "book"
775
+ if cr_book_type == "report-paper":
776
+ batch_prefix = "report"
777
+ elif cr_book_type == "monograph":
778
+ batch_prefix = "monograph"
779
+
780
+ doi_batch_id = st.text_input(
781
+ "DOI Batch ID",
782
+ value=f"{batch_prefix}_{ts}",
783
+ help="Wird im XML-Header verwendet. Sollte eindeutig sein."
784
+ )
785
+
786
+ if "registrant" not in st.session_state:
787
+ st.session_state.registrant = repo_config["registrant"]
788
+
789
+ registrant = st.text_input("Registrant", value=st.session_state.registrant)
790
+ st.session_state.registrant = registrant
791
+
792
+ depositor_meta = {
793
+ "depositor_name": depositor_name,
794
+ "depositor_email": depositor_email,
795
+ "registrant": st.session_state.registrant,
796
+ "doi_batch_id": doi_batch_id
797
+ }
798
+
799
+ st.subheader("Kapitel / Inhalte")
800
+ st.caption("Ein Eintrag pro Zeile: ID (z.B. wsl:12345) oder URL")
801
+
802
+ st.markdown(
803
+ "Gib **eine DORA-ID** (z.B. `wsl:41900`) oder eine **komplette MODS-URL** "
804
+ "pro Zeile ein."
805
+ )
806
+
807
+ chapters_text = st.text_area("Kapitel-Liste", height=200, help="Liste der IDs oder URLs")
808
+
809
+ st.markdown("---")
810
+ st.subheader("XML Generierung")
811
+
812
+ if st.button("Crossref XML generieren", type="primary"):
813
+ try:
814
+ chapter_items = []
815
+
816
+ for line in chapters_text.splitlines():
817
+ line = line.strip()
818
+ if not line:
819
+ continue
820
+ mods_url = build_dora_mods_url(base_url, repo_code, line)
821
+ st.write(f"Lade Kapitel-MODS von: {mods_url}")
822
+ mods_root = fetch_mods_xml(mods_url)
823
+ ci, page_no = mods_to_content_item(mods_root, repo_base_url)
824
+ chapter_items.append((ci, page_no))
825
+
826
+ if not chapter_items and cr_book_type == "edited_book":
827
+ st.warning("Keine Kapitel angegeben! Ein Edited Book sollte normalerweise Kapitel enthalten.")
828
+
829
+ # book_meta aus session state / widgets zusammenbauen
830
+ book_meta = {
831
+ "book_title": st.session_state.book_title,
832
+ "series_title": st.session_state.series_title,
833
+ "series_issn": st.session_state.series_issn,
834
+ "publisher_name": st.session_state.publisher_name,
835
+ "pub_year": int(st.session_state.pub_year) if st.session_state.get("pub_year") else 0,
836
+ "pub_month": st.session_state.pub_month,
837
+ "pub_day": st.session_state.pub_day,
838
+ "noisbn_reason": st.session_state.get("noisbn_reason", ""),
839
+ "book_doi": st.session_state.book_doi,
840
+ "book_resource": st.session_state.book_resource,
841
+ "report_number": st.session_state.get("report_number", ""),
842
+ "editors": new_persons if cr_book_type not in ["monograph", "report-paper"] else [],
843
+ "authors": new_persons if cr_book_type in ["monograph", "report-paper"] else [],
844
+ }
845
+
846
+ xml_bytes = build_doi_batch_xml(book_meta, depositor_meta, chapter_items, book_type=cr_book_type)
847
+
848
+ # Store in session state
849
+ st.session_state.crossref_xml = xml_bytes
850
+ st.session_state.crossref_filename = "crossref_edited_book.xml"
851
+
852
+ st.success("Crossref XML erfolgreich erzeugt!")
853
+
854
+ # Validierung gegen Crossref XSD Schema
855
+ st.subheader("XML Validierung")
856
+ with st.spinner("Validiere XML gegen Crossref Schema..."):
857
+ is_valid, validation_errors = validate_crossref_xml(xml_bytes)
858
+
859
+ if is_valid:
860
+ st.success("✓ XML ist valide und bereit für Crossref!")
861
+ else:
862
+ st.error("✗ XML Validierung fehlgeschlagen:")
863
+ for error in validation_errors:
864
+ st.error(f" • {error}")
865
+ st.warning("Das XML kann trotzdem heruntergeladen werden, wird aber möglicherweise von Crossref abgelehnt.")
866
+
867
+ except Exception as e:
868
+ st.error(f"Fehler bei der Erzeugung des XML: {e}")
869
+ import traceback
870
+ st.text(traceback.format_exc())
871
+
872
+ # Display Download and Upload if XML exists in session state
873
+ if "crossref_xml" in st.session_state:
874
+ xml_bytes = st.session_state.crossref_xml
875
+
876
+ # Download Button
877
+ st.download_button(
878
+ label="XML herunterladen",
879
+ data=xml_bytes,
880
+ file_name=st.session_state.crossref_filename,
881
+ mime="application/xml"
882
+ )
883
+
884
+ # ---------------------------------------------------------
885
+ # Crossref Upload Section
886
+ # ---------------------------------------------------------
887
+ st.markdown("---")
888
+ st.subheader("Automatischer Upload zu Crossref")
889
+
890
+ # Determine default role if not in session state
891
+ if "cr_role" not in st.session_state:
892
+ st.session_state.cr_role = REPO_CONFIG.get(st.session_state.last_repo_code, {}).get("role", "")
893
+
894
+ col_u1, col_u2 = st.columns(2)
895
+ with col_u1:
896
+ cr_user = st.text_input("Crossref Username", value="dora@lib4ri.ch")
897
+ # Use key to bind to session state
898
+ cr_role = st.text_input("Crossref Role (wslx, empa, eawa, psit)", key="cr_role")
899
+ with col_u2:
900
+ cr_pass = st.text_input("Crossref Password", type="password")
901
+
902
+ if st.button("Upload to Crossref"):
903
+ if not cr_user or not cr_pass:
904
+ st.error("Bitte Username und Passwort für Crossref angeben.")
905
+ else:
906
+ with st.spinner("Lade zu Crossref hoch..."):
907
+ res = upload_to_crossref(xml_bytes, cr_user, cr_pass, cr_role)
908
+
909
+ if isinstance(res, str) and res.startswith("Exception"):
910
+ st.error(f"Upload fehlgeschlagen: {res}")
911
+ else:
912
+ # Crossref returns 200 even on some logic errors, text contains details
913
+ if res.status_code == 200:
914
+ if "successfully received" in res.text:
915
+ st.success("Upload erfolgreich! Crossref hat die Datei empfangen.")
916
+ with st.expander("Server-Antwort ansehen"):
917
+ st.text(res.text)
918
+ else:
919
+ st.warning("Upload technisch erfolgreich (HTTP 200), aber Crossref meldet eventuell Fehler.")
920
+ with st.expander("Server-Antwort ansehen (Fehleranalyse)"):
921
+ st.text(res.text)
922
+ else:
923
+ st.error(f"HTTP Fehler: {res.status_code}")
924
+ st.text(res.text)
925
+
926
+
927
def upload_to_crossref(xml_content, username, password, role=None):
    """Submit a deposit XML document to the Crossref deposit servlet.

    The XML is sent as the ``fname`` file part (file name
    ``crossref_submission.xml``, content type ``application/xml``) alongside
    the form fields ``operation=doMDUpload``, ``login_id`` and
    ``login_passwd``. The request uses a 60 second timeout.

    Args:
        xml_content: The XML payload to upload as the file part.
        username: Crossref account name.
        password: Crossref account password.
        role: Optional role. When it is non-empty after stripping
            whitespace, the login id becomes ``"<username>/<role>"``;
            otherwise the bare username is used.

    Returns:
        The response object returned by ``requests.post`` when the request
        completes, or the string ``"Exception: <message>"`` when the request
        raises. Nothing is raised from the request itself, so callers must
        check whether the result is a ``str``.
    """
    endpoint = "https://doi.crossref.org/servlet/deposit"

    # A blank or missing role leaves the login id as the plain username.
    role_suffix = role.strip() if role else ""
    login_id = f"{username}/{role_suffix}" if role_suffix else username

    # Plain form fields that accompany the uploaded file.
    form_fields = {
        'operation': 'doMDUpload',
        'login_id': login_id,
        'login_passwd': password,
    }
    # File part: (filename, file_content, content_type).
    upload = {
        'fname': ('crossref_submission.xml', xml_content, 'application/xml'),
    }

    try:
        return requests.post(endpoint, files=upload, data=form_fields, timeout=60)
    except Exception as exc:
        # Failures are reported as a string instead of propagating.
        return f"Exception: {exc}"
955
+
956
# Invoke main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()