TEZv's picture
Update chatbot.py
fbb0103 verified
"""
K R&D Lab — Research Assistant (RAG Chatbot)
Author: Oksana Kolisnyk | kosatiks-group.pp.ua
Repo: github.com/TEZv/K-RnD-Lab-PHYLO-03_2026
RAG pipeline: sentence-transformers + FAISS (no API key required)
Indexes 20 curated papers: LNP delivery, protein corona, cancer variants, liquid biopsy
Confidence flags: HIGH / MEDIUM / SPECULATIVE
Never answers outside indexed papers.
"""
import os
import json
import time
import hashlib
import datetime
import requests
import gradio as gr
import numpy as np
# ─────────────────────────────────────────────
# PAPER CORPUS — 20 curated PMIDs
# Topics: LNP/brain delivery, protein corona, cancer variants
# ─────────────────────────────────────────────
# PubMed IDs of the indexed papers, grouped by topic. The IDs appear in the same
# order as the entries of PAPER_CORPUS below; keep the two in sync when editing.
# NOTE(review): nothing in the visible code reads this list — confirm whether
# app.py or another module imports it before removing or reordering.
PAPER_PMIDS = [
    # LNP delivery (5) — all PubMed-verified
    "34394960", # Hou X — LNP mRNA delivery review (Nat Rev Mater 2021)
    "32251383", # Cheng Q — SORT LNPs organ selectivity (Nat Nanotechnol 2020)
    "29653760", # Sabnis S — novel amino lipid series for mRNA (Mol Ther 2018)
    "22782619", # Jayaraman M — ionizable lipid siRNA LNP potency (Angew Chem 2012)
    "33208369", # Rosenblum D — CRISPR-Cas9 LNP cancer therapy (Sci Adv 2020)
    # Protein corona (5)
    "18809927", # Lundqvist M — nanoparticle size/surface protein corona (PNAS 2008)
    "22086677", # Walkey CD — nanomaterial-protein interactions (Chem Soc Rev 2012)
    "31565943", # Park M — accessible surface area within nanoparticle corona (Nano Lett 2019)
    "33754708", # Sebastiani F — ApoE binding drives LNP rearrangement (ACS Nano 2021)
    "20461061", # Akinc A — endogenous ApoE-mediated LNP liver delivery (Mol Ther 2010)
    # Cancer variants & precision oncology (5)
    "30096302", # Bailey MH — cancer driver genes TCGA (Cell 2018)
    "30311387", # Landrum MJ — ClinVar at five years (Hum Mutat 2018)
    "32461654", # Karczewski KJ — gnomAD mutational constraint 141,456 humans (Nature 2020)
    "27328919", # Bouaoun L — TP53 variations IARC database (Hum Mutat 2016)
    "31820981", # Lanman BA — KRAS G12C covalent inhibitor AMG 510 (J Med Chem 2020)
    # RNA cancer vaccines & anti-PEG immunity (3)
    "28678784", # Sahin U — personalized RNA mutanome vaccines (Nature 2017)
    "31348638", # Kozma GT — anti-PEG IgM complement activation (ACS Nano 2019)
    "33016924", # Cafri G — mRNA neoantigen T cell immunity GI cancer (J Clin Invest 2020)
    # Liquid biopsy (2)
    "31142840", # Cristiano S — genome-wide cfDNA fragmentation in cancer (Nature 2019)
    "33883548", # Larson MH — cell-free transcriptome tissue biomarkers (Nat Commun 2021)
]
# Curated abstracts / key content for each PMID
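# PMIDs verified against the PubMed esummary + efetch API — 2026-03-07.
# All PMIDs confirmed real. Most abstracts were fetched directly from NCBI, but
# several are truncated mid-sentence, and the entries marked "[Summary — ...]"
# (plus, apparently, PMID 31142840) are hand-written summaries rather than the
# PubMed abstract text.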
PAPER_CORPUS = [
{
"pmid": "34394960",
"title": "Lipid nanoparticles for mRNA delivery.",
"abstract": (
"Messenger RNA (mRNA) has emerged as a new category of therapeutic agent to prevent and treat "
"various diseases. To function in vivo, mRNA requires safe, effective and stable delivery "
"systems that protect the nucleic acid from degradation and that allow cellular uptake and "
"mRNA release. Lipid nanoparticles have successfully entered the clinic for the delivery of "
"mRNA; in particular, lipid nanoparticle-mRNA vaccines are now in clinical use against "
"coronavirus disease 2019 (COVID-19), which marks a milestone for mRNA therapeutics. In this "
"Review, we discuss the design of lipid nanoparticles for mRNA delivery and examine "
"physiological barriers and possible administration routes for lipid nanoparticle-mRNA "
"systems. We then consider key points for the clinical translation of lipid nanoparticle-mRNA "
"formulations, including good manufacturing practice, stability, storage and safety, and "
"highlight preclinical and clinical studies of lipid nanoparticle-mRNA therapeutics for "
"infectious diseases, cancer and genetic disorders. Finally, we give an outlook to future "
"possibilities and remaining challenges for this promising technology."
),
"journal": "Nat Rev Mater",
"year": 2021,
"topic": "LNP mRNA delivery",
},
{
"pmid": "32251383",
"title": "Selective organ targeting (SORT) nanoparticles for tissue-specific mRNA delivery and CRISPR-Cas gene editing.",
"abstract": (
"CRISPR-Cas gene editing and messenger RNA-based protein replacement therapy hold tremendous "
"potential to effectively treat disease-causing mutations with diverse cellular origin. "
"However, it is currently impossible to rationally design nanoparticles that selectively "
"target specific tissues. Here, we report a strategy termed selective organ targeting (SORT) "
"wherein multiple classes of lipid nanoparticles are systematically engineered to exclusively "
"edit extrahepatic tissues via addition of a supplemental SORT molecule. Lung-, spleen- and "
"liver-targeted SORT lipid nanoparticles were designed to selectively edit therapeutically "
"relevant cell types including epithelial cells, endothelial cells, B cells, T cells and "
"hepatocytes. SORT is compatible with multiple gene editing techniques, including mRNA, Cas9 "
"mRNA/single guide RNA and Cas9 ribonucleoprotein complexes, and is envisioned to aid the "
"development of protein replacement and gene correction therapeutics in targeted tissues."
),
"journal": "Nat Nanotechnol",
"year": 2020,
"topic": "LNP organ selectivity",
},
{
"pmid": "29653760",
"title": "A Novel Amino Lipid Series for mRNA Delivery: Improved Endosomal Escape and Sustained Pharmacology and Safety in Non-human Primates.",
"abstract": (
"The success of mRNA-based therapies depends on the availability of a safe and efficient "
"delivery vehicle. Lipid nanoparticles have been identified as a viable option. However, "
"there are concerns whether an acceptable tolerability profile for chronic dosing can be "
"achieved. The efficiency and tolerability of lipid nanoparticles has been attributed to the "
"amino lipid. Therefore, we developed a new series of amino lipids that address this concern. "
"Clear structure-activity relationships were developed that resulted in a new amino lipid "
"that affords efficient mRNA delivery in rodent and primate models with optimal "
"pharmacokinetics. A 1-month toxicology evaluation in rat and non-human primate demonstrated "
"no adverse events with the new lipid nanoparticle system. Mechanistic studies demonstrate "
"that the improved efficiency can be attributed to increased endosomal escape. This effort "
"has resulted in the first example of the ability to safely repeat dose mRNA-containing lipid "
"nanoparticles in non-human primate at therapeutically relevant levels."
),
"journal": "Mol Ther",
"year": 2018,
"topic": "LNP ionizable lipid",
},
{
"pmid": "22782619",
"title": "Maximizing the potency of siRNA lipid nanoparticles for hepatic gene silencing in vivo.",
"abstract": (
"Special (lipid) delivery: The role of the ionizable lipid pK(a) in the in vivo delivery of "
"siRNA by lipid nanoparticles has been studied with a large number of head group "
"modifications to the lipids. A tight correlation between the lipid pK(a) value and silencing "
"of the mouse FVII gene (FVII ED(50) ) was found, with an optimal pK(a) range of 6.2-6.5. The "
"most potent cationic lipid from this study has ED(50) levels around 0.005 mg kg(-1) in mice "
"and less than 0.03 mg kg(-1) in non-human primates."
),
"journal": "Angew Chem Int Ed Engl",
"year": 2012,
"topic": "LNP ionizable lipid siRNA",
},
{
"pmid": "33208369",
"title": "CRISPR-Cas9 genome editing using targeted lipid nanoparticles for cancer therapy.",
"abstract": (
"Harnessing CRISPR-Cas9 technology for cancer therapeutics has been hampered by low editing "
"efficiency in tumors and potential toxicity of existing delivery systems. Here, we describe "
"a safe and efficient lipid nanoparticle (LNP) for the delivery of Cas9 mRNA and sgRNAs that "
"use a novel amino-ionizable lipid. A single intracerebral injection of CRISPR-LNPs against"
),
"journal": "Sci Adv",
"year": 2020,
"topic": "LNP cancer CRISPR",
},
{
"pmid": "18809927",
"title": "Nanoparticle size and surface properties determine the protein corona with possible implications for biological impacts.",
"abstract": (
"Nanoparticles in a biological fluid (plasma, or otherwise) associate with a range of "
"biopolymers, especially proteins, organized into the \"protein corona\" that is associated "
"with the nanoparticle and continuously exchanging with the proteins in the environment. "
"Methodologies to determine the corona and to understand its dependence on nanomaterial "
"properties are likely to become important in bionanoscience. Here, we study the long-lived "
"(\"hard\") protein corona formed from human plasma for a range of nanoparticles that differ "
"in surface properties and size. Six different polystyrene nanoparticles were studied: three "
"different surface chemistries (plain PS, carboxyl-modified, and amine-modified) and two "
"sizes of each (50 and 100 nm), enabling us to perform systematic studies of the effect of "
"surface properties and size on the detailed protein coronas. Proteins in the corona that are "
"conserved and unique across the nanoparticle types were identified and classified according "
"to the protein functional properties. Remarkably, both size and surface properties were "
"found to play a very significant role in determining the nanoparticle coronas on the "
"different particles of identical materials"
),
"journal": "Proc Natl Acad Sci U S A",
"year": 2008,
"topic": "protein corona",
},
{
"pmid": "22086677",
"title": "Understanding and controlling the interaction of nanomaterials with proteins in a physiological environment.",
"abstract": (
"Nanomaterials hold promise as multifunctional diagnostic and therapeutic agents. However, "
"the effective application of nanomaterials is hampered by limited understanding and control "
"over their interactions with complex biological systems. When a nanomaterial enters a "
"physiological environment, it rapidly adsorbs proteins forming what is known as the protein "
"\'corona\'. The protein corona alters the size and interfacial composition of a "
"nanomaterial, giving it a biological identity that is distinct from its synthetic identity. "
"The biological identity determines the physiological response including signalling, "
"kinetics, transport, accumulation, and toxicity. The structure and composition of the "
"protein corona depends on the synthetic identity of the nanomaterial (size, shape, and "
"composition), the nature of the physiological environment (blood, interstitial fluid, cell "
"cytoplasm, etc.), and the duration of exposure. In this critical review, we discuss the "
"formation of the protein corona, its structure and composition, and its influence on the "
"physiological response. We also present an \'adsorbome\' of 125 plasma proteins that are "
"known to associate with nanomaterials. We further describe"
),
"journal": "Chem Soc Rev",
"year": 2012,
"topic": "protein corona",
},
{
"pmid": "31565943",
"title": "Measuring the Accessible Surface Area within the Nanoparticle Corona Using Molecular Probe Adsorption.",
"abstract": (
"The corona phase-the adsorbed layer of polymer, surfactant, or stabilizer molecules around a "
"nanoparticle-is typically utilized to disperse nanoparticles into a solution or solid phase. "
"However, this phase also controls molecular access to the nanoparticle surface, a property "
"important for catalytic activity and sensor applications. Unfortunately, few methods can "
"directly probe the structure of this corona phase, which is subcategorized as either a hard, "
"immobile corona or a soft, transient corona in exchange with components in the bulk "
"solution. In this work, we introduce a molecular probe adsorption (MPA) method for measuring "
"the accessible nanoparticle surface area using a titration of a quenchable fluorescent "
"molecule. For example, riboflavin is utilized to measure the surface area of gold "
"nanoparticle standards, as well as corona phases on dispersed single-walled carbon nanotubes "
"and graphene sheets. A material balance on the titration yields certain surface coverage "
"parameters, including the ratio of the surface area to dissociation constant of the "
"fluorophore,"
),
"journal": "Nano Lett",
"year": 2019,
"topic": "protein corona hard/soft",
},
{
"pmid": "33754708",
"title": "Apolipoprotein E Binding Drives Structural and Compositional Rearrangement of mRNA-Containing Lipid Nanoparticles.",
"abstract": (
"Emerging therapeutic treatments based on the production of proteins by delivering mRNA have "
"become increasingly important in recent times. While lipid nanoparticles (LNPs) are approved "
"vehicles for small interfering RNA delivery, there are still challenges to use this "
"formulation for mRNA delivery. LNPs are typically a mixture of a cationic lipid, "
"distearoylphosphatidylcholine (DSPC), cholesterol, and a PEG-lipid. The structural "
"characterization of mRNA-containing LNPs (mRNA-LNPs) is crucial for a full understanding of "
"the way in which they function, but this information alone is not enough to predict their "
"fate upon entering the bloodstream. The biodistribution and cellular uptake of LNPs are "
"affected by their surface composition as well as by the extracellular proteins present at "
"the site of LNP administration,"
),
"journal": "ACS Nano",
"year": 2021,
"topic": "ApoE LNP corona",
},
{
"pmid": "20461061",
"title": "Targeted delivery of RNAi therapeutics with endogenous and exogenous ligand-based mechanisms.",
"abstract": (
"Lipid nanoparticles (LNPs) have proven to be highly efficient carriers of short-interfering "
"RNAs (siRNAs) to hepatocytes in vivo; however, the precise mechanism by which this efficient "
"delivery occurs has yet to be elucidated. We found that apolipoprotein E (apoE), which plays "
"a major role in the clearance and hepatocellular uptake of physiological lipoproteins, also "
"acts as an endogenous targeting ligand for ionizable LNPs (iLNPs), but not cationic LNPs "
"(cLNPs). The role of apoE was investigated using both in vitro studies employing recombinant "
"apoE and in vivo studies in wild-type and apoE(-/-) mice. Receptor dependence was explored "
"in vitro and in vivo using low-density lipoprotein receptor (LDLR(-/-))-deficient mice. As "
"an alternative to endogenous apoE-based targeting, we developed a targeting approach using "
"an exogenous ligand containing a multivalent N-acetylgalactosamine (GalNAc)-cluster, which "
"binds with high affinity to the asialoglycoprotein receptor (ASGPR) expressed on "
"hepatocytes. Both apoE-based endogenous and GalNAc-based exogenous targeting appear to be "
"highly effective strategies for the delivery of iLNPs to liver."
),
"journal": "Mol Ther",
"year": 2010,
"topic": "ApoE LNP liver delivery",
},
{
"pmid": "30096302",
"title": "Comprehensive Characterization of Cancer Driver Genes and Mutations.",
"abstract": (
"[Summary — abstract not available in PubMed XML] Bailey MH et al. analyzed 9,423 tumors across 33 cancer types from TCGA to identify 299 "
"cancer driver genes using 26 computational tools. The study found that most cancers have 2-6 "
"driver gene mutations. TP53 is the most frequently mutated driver gene (42% of cancers). "
"KRAS mutations dominate in PDAC (92%), LUAD (33%), and COAD (43%). Oncogenes are "
"predominantly activated by missense mutations at hotspots; tumor suppressors are inactivated "
"by truncating mutations or deletions. The pan-cancer driver landscape varies substantially "
"across cancer types, with rare cancers often having unique driver profiles. This resource "
"provides a comprehensive reference for cancer genomics and therapeutic target "
"identification."
),
"journal": "Cell",
"year": 2018,
"topic": "cancer driver genes",
},
{
"pmid": "30311387",
"title": "ClinVar at five years: Delivering on the promise.",
"abstract": (
"The increasing application of genetic testing for determining the causes underlying "
"Mendelian, pharmacogenetic, and somatic phenotypes has accelerated the discovery of novel "
"variants by clinical genetics laboratories, resulting in a critical need for interpreting "
"the significance of these variants and presenting considerable challenges. Launched in 2013 "
"at the National Center for Biotechnology Information, National Institutes of Health, ClinVar "
"is a public database for clinical laboratories, researchers, expert panels, and others to "
"share their interpretations of variants with their evidence. The database holds 600,000 "
"submitted records from 1,000 submitters, representing 430,000 unique variants. ClinVar "
"encourages submissions of variants reviewed by expert panels, as expert consensus confers a "
"high standard. Aggregating data from many groups in a single database allows comparison of "
"interpretations, providing transparency into the concordance or discordance of "
"interpretations. In its first five years, ClinVar has successfully provided a gateway for "
"the submission of medically relevant variants and interpretations of their significance to "
"disease. It has become an invaluable resour"
),
"journal": "Hum Mutat",
"year": 2018,
"topic": "ClinVar variant classification",
},
{
"pmid": "32461654",
"title": "The mutational constraint spectrum quantified from variation in 141,456 humans.",
"abstract": (
"Genetic variants that inactivate protein-coding genes are a powerful source of information "
"about the phenotypic consequences of gene disruption: genes that are crucial for the "
"function of an organism will be depleted of such variants in natural populations, whereas "
"non-essential genes will tolerate their accumulation. However, predicted loss-of-function "
"variants are enriched for annotation errors, and tend to be found at extremely low "
"frequencies, so their analysis requires careful variant annotation and very large sample "
"sizes"
),
"journal": "Nature",
"year": 2020,
"topic": "gnomAD population variants",
},
{
"pmid": "27328919",
"title": "TP53 Variations in Human Cancers: New Lessons from the IARC TP53 Database and Genomics Data.",
"abstract": (
"TP53 gene mutations are one of the most frequent somatic events in cancer. The IARC TP53 "
"Database (http://p53.iarc.fr) is a popular resource that compiles occurrence and phenotype "
"data on TP53 germline and somatic variations linked to human cancer. The deluge of data "
"coming from cancer genomic studies generates new data on TP53 variations and attracts a "
"growing number of database users for the interpretation of TP53 variants. Here, we present "
"the current contents and functionalities of the IARC TP53 Database and perform a systematic "
"analysis of TP53 somatic mutation data extracted from this database and from genomic data "
"repositories. This analysis showed that IARC has more TP53 somatic mutation data than "
"genomic repositories (29,000 vs. 4,000). However, the more complete screening achieved by "
"genomic studies highlighted some overlooked facts about TP53 mutations, such as the presence "
"of a significant number of mutations occurring outside the DNA-binding domain in specific "
"cancer types. We also provide an update on TP53 inherited variants including the ones that "
"should be considered as neutral frequent variations. We thus provide an update of current "
"knowledge on TP53 variations in"
),
"journal": "Hum Mutat",
"year": 2016,
"topic": "TP53 mutations cancer",
},
{
"pmid": "31820981",
"title": "Discovery of a Covalent Inhibitor of KRAS(G12C) (AMG 510) for the Treatment of Solid Tumors.",
"abstract": (
"[Summary — abstract not available in PubMed XML] KRASG12C has emerged as a promising target in solid tumors. Lanman BA et al. report the "
"discovery of AMG 510 (sotorasib), a covalent inhibitor targeting the mutant cysteine-12 "
"residue of KRAS G12C. The authors exploited a cryptic pocket (H95/Y96/Q99) identified in "
"KRASG12C using structure-based design, leading to a novel quinazolinone scaffold. AMG 510 is "
"highly potent, selective, and well-tolerated. It entered phase I clinical trials "
"(NCT03600883) and subsequently received FDA approval as sotorasib (Lumakras) for KRAS "
"G12C-mutant NSCLC. This work established the first clinically viable direct KRAS inhibitor, "
"overcoming decades of the \'undruggable\' KRAS paradigm. Resistance mechanisms include "
"secondary KRAS mutations and bypass pathway activation via EGFR, MET, and RET."
),
"journal": "J Med Chem",
"year": 2020,
"topic": "KRAS G12C inhibitor",
},
{
"pmid": "28678784",
"title": "Personalized RNA mutanome vaccines mobilize poly-specific therapeutic immunity against cancer.",
"abstract": (
"T cells directed against mutant neo-epitopes drive cancer immunity. However, spontaneous "
"immune recognition of mutations is inefficient. We recently introduced the concept of "
"individualized mutanome vaccines and implemented an RNA-based poly-neo-epitope approach to "
"mobilize immunity against a spectrum of cancer mutations. Here we report the first-in-human "
"application of this concept in melanoma. We set up a process comprising comprehensive "
"identification of individual mutations, computational prediction of neo-epitopes, and design "
"and manufacturing of a vaccine unique for each patient. All patients developed T cell "
"responses against multiple vaccine neo-epitopes at up to high single-digit percentages. "
"Vaccine-induced T cell infiltration and neo-epitope-specific killing of autologous tumour "
"cells were shown in post-vaccination resected metastases from two patients. The cumulative "
"rate of metastatic events was highly significantly reduced after the start of vaccination, "
"resulting in a sustained progression-free survival. Two of the five patients with metastatic "
"disease experienced vaccine-related objective responses. One of these patients had a late "
"relapse owing to outgrowth of β2-m"
),
"journal": "Nature",
"year": 2017,
"topic": "mRNA cancer vaccine",
},
{
"pmid": "31348638",
"title": "Pseudo-anaphylaxis to Polyethylene Glycol (PEG)-Coated Liposomes: Roles of Anti-PEG IgM and Complement Activation in a Porcine Model of Human Infusion Reactions.",
"abstract": (
"Polyethylene glycol (PEG)-coated nanopharmaceuticals can cause mild to severe "
"hypersensitivity reactions (HSRs), which can occasionally be life threatening or even "
"lethal. The phenomenon represents an unsolved immune barrier to the use of these drugs, yet "
"its mechanism is poorly understood. This study showed that a single i.v. injection in pigs "
"of a low dose of PEGylated liposomes (Doxebo) induced a massive rise of anti-PEG IgM in "
"blood, peaking at days 7-9 and declining over 6 weeks. Bolus injections of PEG-liposomes "
"during seroconversion resulted in anaphylactoid shock (pseudo-anaphylaxis) within 2-3 min, "
"although similar treatments of naı̈ve animals led to only mild hemodynamic disturbance. "
"Parallel measurement of pulmonary arterial pressure (PAP) and sC5b-9 in blood, taken as "
"measures of HSR and complement activation, respectively, showed a concordant rise of the two "
"variables within 3 min and a decline within 15 min, suggesting a causal relationship between "
"complement activation and pulmonary hypertension. We also observed a rapid decline of "
"anti-PEG IgM in the blood within minutes, increased binding of PEGylated liposomes to IgM"
),
"journal": "ACS Nano",
"year": 2019,
"topic": "anti-PEG immunity LNP",
},
{
"pmid": "33016924",
"title": "mRNA vaccine-induced neoantigen-specific T cell immunity in patients with gastrointestinal cancer.",
"abstract": (
"BACKGROUNDTherapeutic vaccinations against cancer have mainly targeted differentiation "
"antigens, cancer-testis antigens, and overexpressed antigens and have thus far resulted in "
"little clinical benefit. Studies conducted by multiple groups have demonstrated that T cells "
"recognizing neoantigens are present in most cancers and offer a specific and highly "
"immunogenic target for personalized vaccination.METHODSWe recently developed a process using "
"tumor-infiltrating lymphocytes to identify the specific immunogenic mutations expressed in "
"patients\' tumors. Here, validated, defined neoantigens, predicted neoepitopes, and "
"mutations of driver genes were concatenated into a single mRNA construct to vaccinate "
"patients with metastatic gastrointestinal cancer.RESULTSThe vaccine was safe and elicited "
"mutation-specific T cell responses against predicted neoepitopes not detected before "
"vaccination. Furthermore, we were able to isolate and verify T cell receptors targeting "
"KRASG12D mutation. We observed no objective clinical responses in the 4 patients treated in "
"this trial.CONCLUSIONThis vaccine was safe, and potential future combination of such "
"vaccines with checkpoint inhibitors or adoptive T ce"
),
"journal": "J Clin Invest",
"year": 2020,
"topic": "mRNA neoantigen vaccine",
},
{
"pmid": "31142840",
"title": "Genome-wide cell-free DNA fragmentation in patients with cancer.",
"abstract": (
"Cristiano S et al. developed DELFI (DNA EvaLuation of Fragments for early Interception), a "
"genome-wide approach analyzing cell-free DNA fragmentation patterns in plasma. Fragmentation "
"profiles across ~1 million regions reflect chromatin organization of tumor cells of origin. "
"Machine learning models trained on fragmentation patterns detected cancer in 74% of 208 "
"patients across 7 cancer types (lung, breast, colorectal, ovarian, liver, gastric, "
"pancreatic) at 98% specificity. Early-stage detection sensitivity was 57% for Stage I/II. "
"The approach provides tissue-of-origin information and outperforms single-analyte ctDNA "
"mutation detection for early-stage cancers. cfDNA fragmentation is a promising non-invasive "
"biomarker for multi-cancer early detection liquid biopsy."
),
"journal": "Nature",
"year": 2019,
"topic": "cfDNA liquid biopsy",
},
{
"pmid": "33883548",
"title": "A comprehensive characterization of the cell-free transcriptome reveals tissue- and subtype-specific biomarkers for cancer detection.",
"abstract": (
"Cell-free RNA (cfRNA) is a promising analyte for cancer detection. However, a comprehensive "
"assessment of cfRNA in individuals with and without cancer has not been conducted. We "
"perform the first transcriptome-wide characterization of cfRNA in cancer (stage III breast "
"[n = 46], lung [n = 30]) and non-cancer (n = 89) participants from the Circulating Cell-free "
"Genome Atlas (NCT02889978). Of 57,820 annotated genes, 39,564 (68%) are not detected in "
"cfRNA from non-cancer individuals. Within these low-noise regions, we identify tissue- and "
"cancer-specific genes, defined as \"dark channel biomarker\" (DCB) genes, that are "
"recurrently detected in individuals with cancer. DCB levels in plasma correlate with tumor "
"shedding rate and RNA expression in matched tissue, suggesting that DCBs with high "
"expression in tumor tissue could enhance cancer detection in patients with low levels of "
"circulating tumor DNA. Overall, cfRNA provides a unique opportunity to detect cancer, "
"predict the tumor tissue of origin, and determine the cancer subtype."
),
"journal": "Nat Commun",
"year": 2021,
"topic": "cfRNA liquid biopsy",
},
]
# ─────────────────────────────────────────────
# RAG ENGINE
# ─────────────────────────────────────────────
# Module-level RAG state. The three objects below start as None and are
# assigned by _build_index(); rag_query() triggers the build on first use.
_rag_index = None  # FAISS inner-product index over the paper embeddings
_rag_embeddings = None  # L2-normalized embedding matrix, one row per PAPER_CORPUS entry
_rag_model = None  # SentenceTransformer used to encode both the corpus and queries
EMBED_MODEL = "all-MiniLM-L6-v2" # 80 MB, runs on CPU, no API key
def _build_index():
    """Build the FAISS index over PAPER_CORPUS and publish it in the module globals.

    Each paper becomes one text chunk (title, abstract, journal and year) that is
    embedded with the ``EMBED_MODEL`` sentence-transformers model. The embeddings
    are L2-normalized so that the inner-product index returns cosine similarity.

    The model, embeddings and index are built in local variables and assigned to
    ``_rag_model`` / ``_rag_embeddings`` / ``_rag_index`` only once every step has
    succeeded, so a failed build never leaves half-initialised state behind.

    Returns:
        tuple[bool, str]: ``(True, summary)`` on success; ``(False, reason)`` when
        the optional dependencies are missing or the model/index cannot be built.
    """
    global _rag_index, _rag_embeddings, _rag_model
    try:
        from sentence_transformers import SentenceTransformer
        import faiss
    except ImportError:
        return False, "sentence-transformers or faiss-cpu not installed. Run: pip install sentence-transformers faiss-cpu"

    try:
        model = SentenceTransformer(EMBED_MODEL)
        # One chunk per paper: title + abstract + journal/year.
        texts = [
            f"Title: {paper['title']}\nAbstract: {paper['abstract']}\nJournal: {paper['journal']} ({paper['year']})"
            for paper in PAPER_CORPUS
        ]
        embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=False)
        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)  # normalize
        dim = embeddings.shape[1]
        index = faiss.IndexFlatIP(dim)  # Inner product = cosine similarity on normalized vectors
        index.add(embeddings.astype(np.float32))
    except Exception as exc:
        # Callers rely on the (ok, msg) contract to show a friendly message, so
        # a model download/load or encoding failure is reported, not raised.
        return False, f"failed to build index: {exc}"

    _rag_model = model
    _rag_embeddings = embeddings
    _rag_index = index
    return True, f"Index built: {len(PAPER_CORPUS)} papers, {dim}-dim embeddings"
def _confidence_flag(score: float, n_results: int) -> str:
    """Map the top retrieval score and the number of hits to a confidence label.

    HIGH needs both a score of at least 0.55 and at least two retrieved papers;
    otherwise a score of at least 0.35 is MEDIUM and anything lower is
    SPECULATIVE.
    """
    strong_match = score >= 0.55 and n_results >= 2
    if strong_match:
        return "🟢 HIGH"
    return "🟡 MEDIUM" if score >= 0.35 else "🔴 SPECULATIVE"
def rag_query(question: str, top_k: int = 3) -> str:
    """Retrieve the indexed papers closest to *question* and format them as Markdown.

    The index is built lazily on the first call. The question is embedded with
    the same model as the corpus, L2-normalized, and searched against the FAISS
    inner-product index; hits scoring below 0.20 are discarded.

    Args:
        question: Free-text user question. A blank or ``None`` question yields
            the "no relevant information" message without touching the model.
        top_k: Maximum number of papers to retrieve; values below 1 are treated
            as 1.

    Returns:
        A Markdown string: a confidence header followed by the retrieved papers
        (title, citation, stored abstract, score) and a disclaimer; or a
        "no relevant information" / "RAG system unavailable" message.
    """
    no_match_message = (
        "❌ **No relevant information found in the indexed papers.**\n\n"
        "This assistant only answers questions based on 20 indexed papers on:\n"
        "- LNP drug delivery (brain/GBM focus)\n"
        "- Protein corona biology\n"
        "- Cancer variants and precision oncology\n"
        "- Liquid biopsy biomarkers\n\n"
        "Please rephrase your question or ask about these topics."
    )

    # Nothing to retrieve for an empty question; skip the (slow) model load too.
    if not question or not question.strip():
        return no_match_message

    if _rag_index is None:
        ok, msg = _build_index()
        if not ok:
            return f"⚠️ RAG system unavailable: {msg}"

    # A non-positive k is not a valid search size; fall back to a single hit.
    k = max(1, int(top_k))

    # Encode query
    q_emb = _rag_model.encode([question], convert_to_numpy=True, show_progress_bar=False)
    q_emb = q_emb / np.linalg.norm(q_emb, axis=1, keepdims=True)

    # Search
    scores, indices = _rag_index.search(q_emb.astype(np.float32), k)

    # Filter: only use results above minimum threshold. Negative indices are
    # skipped because they do not refer to a PAPER_CORPUS entry.
    MIN_SCORE = 0.20
    valid = [(s, i) for s, i in zip(scores[0], indices[0]) if s >= MIN_SCORE and i >= 0]
    if not valid:
        return no_match_message

    top_score = valid[0][0]
    confidence = _confidence_flag(top_score, len(valid))

    # Build answer from retrieved chunks
    answer_parts = [f"**Confidence: {confidence}** (retrieval score: {top_score:.3f})\n"]
    for rank, (score, idx) in enumerate(valid, 1):
        paper = PAPER_CORPUS[idx]
        answer_parts.append(
            f"### [{rank}] {paper['title']}\n"
            f"*{paper['journal']}, {paper['year']} | PMID: {paper['pmid']}*\n\n"
            f"{paper['abstract']}\n"
            f"*(Relevance score: {score:.3f})*"
        )
    answer_parts.append(
        "\n---\n"
        "⚠️ *This answer is grounded exclusively in the 20 indexed papers. "
        "For clinical decisions, consult primary literature and domain experts.*"
    )
    return "\n\n".join(answer_parts)
# ─────────────────────────────────────────────
# GRADIO TAB BUILDER
# ─────────────────────────────────────────────
def build_chatbot_tab():
    """Called from app.py to inject the chatbot into Tab A6.

    Creates the chat UI (status text, chat window, question box, Send/Clear
    buttons and a side panel describing topics and confidence flags) and wires
    the buttons to ``rag_query``. In standalone mode this is called inside a
    ``gr.Blocks`` context. Returns nothing.
    """
    gr.Markdown(
        "**Status:** Model loads on first query (~30s)...\n\n"
        "Ask questions about LNP delivery, protein corona, cancer variants, or liquid biopsy. "
        "Answers are grounded in 20 indexed papers — never fabricated."
    )
    with gr.Row():
        # Left column: conversation and input controls.
        with gr.Column(scale=3):
            # NOTE(review): `bubble_full_width` and the tuple-style history used
            # by respond() depend on the installed Gradio version — confirm
            # against the version pinned for this app.
            chatbox = gr.Chatbot(
                label="Research Assistant",
                height=420,
                bubble_full_width=False,
            )
            with gr.Row():
                user_input = gr.Textbox(
                    placeholder="Ask about LNP delivery, protein corona, cancer variants...",
                    label="Your question",
                    lines=2,
                    scale=4,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)
            clear_btn = gr.Button("🗑️ Clear conversation", size="sm")
        # Right column: static help text.
        with gr.Column(scale=1):
            gr.Markdown("### 📚 Indexed Topics")
            # NOTE(review): some bullets (e.g. "Vroman effect kinetics",
            # "ctDNA methylation") do not obviously match any abstract in
            # PAPER_CORPUS — confirm the list reflects the indexed papers.
            gr.Markdown(
                "**LNP Delivery**\n"
                "- mRNA-LNP formulation\n"
                "- Ionizable lipids & pKa\n"
                "- Brain/GBM delivery\n"
                "- Organ selectivity (SORT)\n"
                "- PEG & anti-PEG immunity\n\n"
                "**Protein Corona**\n"
                "- Hard vs soft corona\n"
                "- Vroman effect kinetics\n"
                "- ApoE/LDLR targeting\n\n"
                "**Cancer Variants**\n"
                "- TP53 mutation spectrum\n"
                "- KRAS G12C resistance\n"
                "- ClinVar classification\n\n"
                "**Liquid Biopsy**\n"
                "- ctDNA methylation\n"
                "- cfRNA biomarkers"
            )
            # The thresholds mirror _confidence_flag(); note that HIGH there
            # additionally requires at least two retrieved papers.
            gr.Markdown(
                "### 🔑 Confidence Flags\n"
                "🟢 **HIGH** — strong match (≥0.55)\n"
                "🟡 **MEDIUM** — moderate match (0.35–0.55)\n"
                "🔴 **SPECULATIVE** — weak match (<0.35)\n\n"
                "*Only answers from indexed papers are shown.*"
            )

    def respond(message, history):
        # Event handler for Send / Enter. Returns (updated history, new textbox
        # value); the empty string clears the question box.
        if not message.strip():
            # Blank submission: leave the conversation untouched.
            return history, ""
        answer = rag_query(message.strip())
        history = history or []
        # Each turn is stored as a (user message, assistant answer) pair.
        history.append((message, answer))
        return history, ""

    # Both the Send button and pressing Enter in the textbox submit a question.
    send_btn.click(respond, inputs=[user_input, chatbox], outputs=[chatbox, user_input])
    user_input.submit(respond, inputs=[user_input, chatbox], outputs=[chatbox, user_input])
    # Clear resets the conversation and the question box.
    clear_btn.click(lambda: ([], ""), outputs=[chatbox, user_input])
# ─────────────────────────────────────────────
# STANDALONE MODE
# ─────────────────────────────────────────────
if __name__ == "__main__":
    # Standalone entry point: pre-build the index so the first query is fast,
    # then serve the chatbot tab on its own page.
    print("Building RAG index...")
    _built, status_message = _build_index()
    # The status is printed whether or not the build succeeded; on failure
    # rag_query() retries the build on the first question.
    print(status_message)
    with gr.Blocks(title="K R&D Lab — Research Assistant") as demo:
        gr.Markdown("# 🤖 K R&D Lab Research Assistant\n*Standalone mode*")
        build_chatbot_tab()
    demo.launch(share=False)