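"""PubMed question answering demo built on stanford-crfm/BioMedLM.

The app searches PubMed through the NCBI E-utilities (ESearch + EFetch),
collects the abstracts of the top hits, and asks BioMedLM to generate an
answer over that retrieved context, behind a small Gradio interface.
"""
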
import time
import xml.etree.ElementTree as ET

import gradio as gr
import requests
import torch
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForCausalLM

# Run the model on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BioMedLM has a 1024-token context window, so cap tokenizer output there.
tokenizer = AutoTokenizer.from_pretrained("stanford-crfm/BioMedLM", model_max_length=1024)
model = AutoModelForCausalLM.from_pretrained("stanford-crfm/BioMedLM").to(device)

# NCBI E-utilities API key; consider loading this from an environment
# variable rather than hardcoding it in the source.
api_key = '2c78468d6246082d456a140bb1de415ed108'
num_results = 10  # number of PubMed hits to retrieve per search


def extract_longer_answers_from_paragraphs(paragraphs, query, tokenizer, model):
    # Concatenate the retrieved abstracts and append the question as a prompt.
    context = " ".join(paragraphs)
    question = f" What is the mechanism of {query}?"
    context += question

    top_p = 0.9   # nucleus-sampling cutoff; adjust as needed
    max_new = 50  # number of tokens to generate; adjust as needed

    # Truncate the prompt so prompt + generated tokens still fit BioMedLM's
    # 1024-token context window.
    inputs = tokenizer(
        context,
        return_tensors="pt",
        add_special_tokens=False,
        truncation=True,
        max_length=1024 - max_new,
    ).to(device)
    outputs = model.generate(
        **inputs,
        do_sample=True,          # required for top_p to take effect
        top_p=top_p,
        max_new_tokens=max_new,  # max_length would count the prompt tokens too
        no_repeat_ngram_size=2,  # suppress repetitive phrasing
    )
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return answer


def retrieve_and_answer(query1, query2):
    # Combine both queries with a boolean AND for the PubMed search.
    combined_query = f"({query1}) AND ({query2})"
    answer = fetch_and_generate(query1, combined_query, tokenizer, model)

    return answer

def fetch_and_generate(query, combined_query, tokenizer, model):
    # ESearch: look up the IDs of the most relevant PubMed articles.
    # Passing the query via params lets requests URL-encode it safely.
    esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    response = requests.get(esearch_url, params={
        "db": "pubmed", "api_key": api_key, "term": combined_query,
        "retmax": num_results, "sort": "relevance",
    })

    if response.status_code == 200:
        root = ET.fromstring(response.text)
        paragraphs = []

        for article_id in root.find('IdList'):
            article_id = article_id.text
            # EFetch: pull the full record for each article ID as XML.
            efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
            response = requests.get(efetch_url, params={
                "db": "pubmed", "api_key": api_key, "id": article_id,
                "retmode": "xml",
            })

            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'xml')

                for article in soup.find_all('PubmedArticle'):
                    abstract = article.find('AbstractText')
                    if article.find('ArticleTitle') and abstract:
                        paragraphs.append(abstract.text)
            else:
                print("Error:", response.status_code)
                time.sleep(3)  # back off briefly before the next request

        answer = extract_longer_answers_from_paragraphs(paragraphs, query, tokenizer, model)
        return answer

    else:
        print("Error:", response.status_code)
        return "Error fetching articles."


# Gradio interface: two query boxes in, one generated answer out.
iface = gr.Interface(
    fn=retrieve_and_answer,
    inputs=[gr.Textbox(placeholder="Enter Query 1", label='query1'),
            gr.Textbox(placeholder="Enter Query 2", label='query2')],
    outputs=gr.Textbox(label="Answer from BioMedLM"),
    live=True,  # note: live mode re-runs the full pipeline on every keystroke
    title="PubMed Question Answering: Stanford/BioMedLM",
    description="Enter two queries to retrieve PubMed articles and generate an answer.",
    examples=[
        ["sertraline", "mechanism"],
        ["cancer", "treatment"]
    ]
)

iface.launch()
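
# A minimal usage sketch, bypassing the web UI (assumes the NCBI endpoints
# are reachable and the model fits in available memory):
#
#   answer = retrieve_and_answer("sertraline", "mechanism")
#   print(answer)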