NurseCitizenDeveloper committed on
Commit
2bd80fd
·
verified ·
1 Parent(s): 5e159ec

Upload streamlit_app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. streamlit_app.py +48 -4
streamlit_app.py CHANGED
@@ -20,6 +20,8 @@ try:
20
  except ImportError:
21
  _PDF_AVAILABLE = False
22
 
 
 
23
  sys.path.insert(0, os.path.dirname(__file__))
24
  from wiki.starter import get_starter_wiki
25
  from core.compiler import compile_source, rebuild_index
@@ -143,6 +145,14 @@ def add_or_update_article(article: dict):
143
  wiki["metadata"]["article_count"] = len(wiki["articles"])
144
 
145
 
 
 
 
 
 
 
 
 
146
  def extract_pdf_text(file_bytes: bytes) -> tuple[str, int]:
147
  """Extract all text from a PDF. Returns (text, page_count)."""
148
  reader = PdfReader(io.BytesIO(file_bytes))
@@ -416,20 +426,51 @@ Large PDFs (100+ pages) are supported; text is extracted from every page automat
416
  src_title = st.text_input("Source title", placeholder="e.g. NICE NG51 β€” Sepsis (2016)")
417
  src_type = st.selectbox("Type", ["Clinical Guideline", "Research Paper", "NMC Document", "NHS Protocol", "Textbook", "Other"])
418
 
419
- input_method = st.radio("Input method", ["Upload PDF", "Paste text"], horizontal=True)
 
 
 
 
 
420
 
421
  src_content = ""
422
  pdf_meta = None
423
 
424
- if input_method == "Upload PDF":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
425
  if not _PDF_AVAILABLE:
426
  st.error("pypdf not installed β€” PDF upload unavailable.")
427
  else:
 
428
  uploaded_pdf = st.file_uploader(
429
- "Upload PDF (up to 500 MB)",
430
  type=["pdf"],
431
  key="pdf_upload",
432
- help="Text is extracted from every page. Large documents are fully supported.",
433
  )
434
  if uploaded_pdf is not None:
435
  with st.spinner(f"Extracting text from {uploaded_pdf.name}..."):
@@ -445,6 +486,7 @@ Large PDFs (100+ pages) are supported; text is extracted from every page automat
445
  st.error(f"PDF extraction failed: {e}")
446
  if not src_title and uploaded_pdf:
447
  src_title = uploaded_pdf.name.replace(".pdf", "").replace("_", " ")
 
448
  else:
449
  src_content = st.text_area(
450
  "Paste text here",
@@ -468,6 +510,8 @@ Large PDFs (100+ pages) are supported; text is extracted from every page automat
468
  entry["pdf_size_kb"] = pdf_meta["size_kb"]
469
  wiki["sources"][src_id] = entry
470
  log(f"ingest | Added source: {src_title} ({len(src_content):,} chars)")
 
 
471
  st.success(f"Source added: **{src_title}**")
472
  st.rerun()
473
 
 
20
  except ImportError:
21
  _PDF_AVAILABLE = False
22
 
23
+ import requests as _requests
24
+
25
  sys.path.insert(0, os.path.dirname(__file__))
26
  from wiki.starter import get_starter_wiki
27
  from core.compiler import compile_source, rebuild_index
 
145
  wiki["metadata"]["article_count"] = len(wiki["articles"])
146
 
147
 
148
def fetch_pdf_from_url(url: str, timeout: int = 60) -> bytes:
    """Fetch a PDF from a URL server-side (bypasses HF proxy upload limits).

    Args:
        url: Direct link to the PDF document.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The complete response body as bytes.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection, timeout or malformed-URL
            errors raised by ``requests``.
    """
    headers = {"User-Agent": "NursingKnowledgeBase/1.0 (nursing education tool)"}
    # With stream=True the connection is only handed back to the pool once the
    # body has been fully read or the response is closed. The context manager
    # guarantees the release even when raise_for_status() raises.
    with _requests.get(url, headers=headers, timeout=timeout, stream=True) as resp:
        resp.raise_for_status()
        return resp.content
154
+
155
+
156
  def extract_pdf_text(file_bytes: bytes) -> tuple[str, int]:
157
  """Extract all text from a PDF. Returns (text, page_count)."""
158
  reader = PdfReader(io.BytesIO(file_bytes))
 
426
  src_title = st.text_input("Source title", placeholder="e.g. NICE NG51 β€” Sepsis (2016)")
427
  src_type = st.selectbox("Type", ["Clinical Guideline", "Research Paper", "NMC Document", "NHS Protocol", "Textbook", "Other"])
428
 
429
+ input_method = st.radio(
430
+ "Input method",
431
+ ["PDF from URL", "Upload PDF", "Paste text"],
432
+ horizontal=True,
433
+ help="Use 'PDF from URL' for large files β€” the server fetches it directly.",
434
+ )
435
 
436
  src_content = ""
437
  pdf_meta = None
438
 
439
+ if input_method == "PDF from URL":
440
+ st.caption("Paste a direct link to any PDF β€” NICE guidelines, NMC documents, research papers, etc. The server fetches it, so there is no size limit.")
441
+ pdf_url = st.text_input(
442
+ "PDF URL",
443
+ placeholder="https://www.nice.org.uk/guidance/ng51/resources/sepsis-pdf-...",
444
+ key="pdf_url",
445
+ )
446
+ if pdf_url and st.button("Fetch & Extract", key="fetch_pdf"):
447
+ with st.spinner("Fetching PDF from URL..."):
448
+ try:
449
+ raw_bytes = fetch_pdf_from_url(pdf_url)
450
+ extracted, page_count = extract_pdf_text(raw_bytes)
451
+ src_content = extracted
452
+ pdf_meta = {"pages": page_count, "size_kb": len(raw_bytes) // 1024}
453
+ st.session_state["fetched_pdf_content"] = extracted
454
+ st.session_state["fetched_pdf_meta"] = pdf_meta
455
+ st.success(f"Fetched {page_count} pages / {len(extracted):,} characters")
456
+ with st.expander("Preview extracted text"):
457
+ st.text(extracted[:1500] + ("..." if len(extracted) > 1500 else ""))
458
+ except Exception as e:
459
+ st.error(f"Fetch failed: {e}")
460
+ # Persist fetched content across reruns
461
+ if not src_content and st.session_state.get("fetched_pdf_content"):
462
+ src_content = st.session_state["fetched_pdf_content"]
463
+ pdf_meta = st.session_state.get("fetched_pdf_meta")
464
+
465
+ elif input_method == "Upload PDF":
466
  if not _PDF_AVAILABLE:
467
  st.error("pypdf not installed β€” PDF upload unavailable.")
468
  else:
469
+ st.caption("For large PDFs (>50 MB) use 'PDF from URL' instead β€” HF Spaces limits browser uploads.")
470
  uploaded_pdf = st.file_uploader(
471
+ "Upload PDF",
472
  type=["pdf"],
473
  key="pdf_upload",
 
474
  )
475
  if uploaded_pdf is not None:
476
  with st.spinner(f"Extracting text from {uploaded_pdf.name}..."):
 
486
  st.error(f"PDF extraction failed: {e}")
487
  if not src_title and uploaded_pdf:
488
  src_title = uploaded_pdf.name.replace(".pdf", "").replace("_", " ")
489
+
490
  else:
491
  src_content = st.text_area(
492
  "Paste text here",
 
510
  entry["pdf_size_kb"] = pdf_meta["size_kb"]
511
  wiki["sources"][src_id] = entry
512
  log(f"ingest | Added source: {src_title} ({len(src_content):,} chars)")
513
+ st.session_state.pop("fetched_pdf_content", None)
514
+ st.session_state.pop("fetched_pdf_meta", None)
515
  st.success(f"Source added: **{src_title}**")
516
  st.rerun()
517