from concurrent.futures import ThreadPoolExecutor, as_completed

import nltk
import requests
from nltk.tokenize import sent_tokenize
from transformers import pipeline

from config import MY_PUBMED_EMAIL

# Fetch the sentence-tokenizer data; cached locally after the first run.
# (Some newer NLTK releases ship this resource as 'punkt_tab' instead.)
nltk.download('punkt', quiet=True)

# Load the summarization pipeline once at module import so repeated calls
# reuse the same model instance instead of reloading the weights.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    tokenizer="facebook/bart-large-cnn"
)
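
# The checkpoint above is swappable: any seq2seq summarization model on the
# Hugging Face Hub should drop in here. As one alternative (not part of the
# original setup), "sshleifer/distilbart-cnn-12-6" trades some summary
# quality for noticeably faster CPU inference.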


def search_pubmed(query, max_results=3):
    """
    Searches PubMed via ESearch and returns a list of PMIDs.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
        # NCBI asks automated clients to identify themselves.
        "tool": "ElysiumRAG",
        "email": MY_PUBMED_EMAIL
    }
    resp = requests.get(base_url, params=params, timeout=10)
    resp.raise_for_status()
    data = resp.json()
    return data.get("esearchresult", {}).get("idlist", [])
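
# Minimal usage sketch (the query string below is illustrative only, and the
# call assumes network access plus a valid contact email in config.py):
#
#   pmids = search_pubmed("glioblastoma immunotherapy", max_results=3)
#   # -> a list of up to three PMID strings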


def fetch_one_abstract(pmid):
    """
    Fetches a single abstract for the given PMID via EFetch.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "retmode": "text",
        "rettype": "abstract",
        "id": pmid,
        "tool": "ElysiumRAG",
        "email": MY_PUBMED_EMAIL
    }
    resp = requests.get(base_url, params=params, timeout=10)
    resp.raise_for_status()
    raw_text = resp.text.strip() or "No abstract text found."
    return (pmid, raw_text)


def fetch_pubmed_abstracts(pmids):
    """
    Fetches multiple abstracts in parallel. Returns a {pmid: text} dict.
    """
    if not pmids:
        return {}
    results_map = {}
    with ThreadPoolExecutor(max_workers=min(len(pmids), 5)) as executor:
        future_to_pmid = {
            executor.submit(fetch_one_abstract, pmid): pmid for pmid in pmids
        }
        for future in as_completed(future_to_pmid):
            pmid = future_to_pmid[future]
            try:
                pmid_result, text = future.result()
                results_map[pmid_result] = text
            except Exception as e:
                # Record the failure for this PMID instead of aborting the batch.
                results_map[pmid] = f"Error: {e}"
    return results_map
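
# A note on rate limits (not enforced here): NCBI E-utilities allow up to
# 3 requests/second without an API key and 10/second with one (passed as an
# "api_key" request parameter), so a larger worker pool than the one above
# can start drawing HTTP 429 responses on long PMID lists.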


def chunk_and_summarize(abstract_text, chunk_size=512):
    """
    Splits a long abstract into sentence-aligned chunks, summarizes each
    chunk, then concatenates the partial summaries.
    """
    sentences = sent_tokenize(abstract_text)
    chunks = []

    # Greedily pack sentences into chunks of at most ~chunk_size words.
    # This counts whitespace-separated words, not model tokens, so it is
    # only a rough proxy for BART's 1024-token input limit.
    current_chunk = []
    current_length = 0
    for sent in sentences:
        tokens_in_sent = len(sent.split())
        if current_chunk and current_length + tokens_in_sent > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sent)
        current_length += tokens_in_sent

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    summarized_pieces = []
    for c in chunks:
        summary_out = summarizer(
            c,
            max_length=100,
            min_length=30,
            do_sample=False
        )
        summarized_pieces.append(summary_out[0]['summary_text'])

    return " ".join(summarized_pieces).strip()
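

# End-to-end sketch wiring the three helpers together. The query string is a
# placeholder for illustration; running this assumes config.py provides
# MY_PUBMED_EMAIL and that the summarization weights can be downloaded.
if __name__ == "__main__":
    pmids = search_pubmed("semaglutide cardiovascular outcomes", max_results=3)
    abstracts = fetch_pubmed_abstracts(pmids)
    for pmid, abstract in abstracts.items():
        print(f"PMID {pmid}")
        print(chunk_and_summarize(abstract))
        print("-" * 60)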