Update app.py
app.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
 import requests
 import torch
 from transformers import AutoTokenizer, AutoModel
+import xml.etree.ElementTree as ET
 
 # Load SciBERT pre-trained model and tokenizer
 model_name = "allenai/scibert_scivocab_uncased"
@@ -9,44 +10,59 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModel.from_pretrained(model_name)
 
 def calculate_similarity(claim, document):
+    if not claim or not document:
+        return 0.0
     # Tokenize claim and document
     inputs = tokenizer.encode_plus(claim, document, return_tensors='pt', padding=True, truncation=True)
 
-    # Generate embeddings for claim
+    # Generate embeddings for claim
     with torch.no_grad():
        claim_embeddings = model(**inputs)['pooler_output']
 
+    # Generate embeddings for document
+    inputs_doc = tokenizer.encode_plus(document, return_tensors='pt', padding=True, truncation=True)
+    with torch.no_grad():
+        document_embeddings = model(**inputs_doc)['pooler_output']
+
    # Compute cosine similarity between embeddings
    similarity = torch.cosine_similarity(claim_embeddings, document_embeddings).item()
 
    return similarity
 
-def search_papers(…):
-    …
-    search_results = [
-        {
-            'title': 'Paper 1 Title',
-            'abstract': 'Paper 1 Abstract',
-            'authors': ['Author 1', 'Author 2'],
-            'url': 'https://example.com/paper1'
-        },
-        {
-            'title': 'Paper 2 Title',
-            'abstract': 'Paper 2 Abstract',
-            'authors': ['Author 3', 'Author 4'],
-            'url': 'https://example.com/paper2'
-        },
-        {
-            'title': 'Paper 3 Title',
-            'abstract': 'Paper 3 Abstract',
-            'authors': ['Author 5', 'Author 6'],
-            'url': 'https://example.com/paper3'
-        }
-    ]
+def search_arxiv(query, max_results=3):
+    base_url = "http://export.arxiv.org/api/query?"
+    query = f"search_query=all:{query}&start=0&max_results={max_results}&sortBy=relevance&sortOrder=descending"
+
+    try:
+        response = requests.get(base_url + query)
+        if response.status_code == 200:
+            data = response.text
+
+            # Parse the XML response
+            root = ET.fromstring(data)
+
+            search_results = []
+            for entry in root.findall("{http://www.w3.org/2005/Atom}entry"):
+                result = {}
+
+                # Extract information from each entry
+                result["title"] = entry.find("{http://www.w3.org/2005/Atom}title").text
+                result["abstract"] = entry.find("{http://arxiv.org/schemas/atom}summary").text
+
+                authors = []
+                for author in entry.findall("{http://www.w3.org/2005/Atom}author"):
+                    authors.append(author.find("{http://www.w3.org/2005/Atom}name").text)
+                result["authors"] = authors
+
+                search_results.append(result)
+
+        return search_results
+    except:
+        return None
+
+def search_papers(user_input):
+    # Use the desired search function, e.g., search_arxiv
+    search_results = search_arxiv(user_input)
     return search_results
 
 st.title('The Substantiator')
@@ -56,12 +72,13 @@ user_input = st.text_input('Input your claim')
 if st.button('Substantiate'):
     search_results = search_papers(user_input)
     if search_results is not None and len(search_results) > 0:
-        for …:
-            …
+        with st.spinner('Searching for relevant research papers...'):
+            for result in search_results:
+                st.write(f"<a href='javascript:void(0)' onclick='window.open(\"{result['link']}\", \"_blank\");return false;'>{result['title']}</a>", unsafe_allow_html=True)
+                st.write(result["abstract"])
+                st.write("Authors: ", ", ".join(result["authors"]))
+                similarity = calculate_similarity(user_input, result["abstract"])
+                st.write("Similarity Score: ", similarity)
+                st.write("-----")
     else:
-        st.write("No results found.")
+        st.write("No results found.")
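
A note on calculate_similarity: the first embedding comes from tokenizer.encode_plus(claim, document), so the claim and the document are encoded together as one sequence pair, and that pooled vector is then compared against an embedding of the document alone. If the intent is to compare the claim and the abstract as two independent SciBERT embeddings, a minimal sketch (reusing the tokenizer, model, and torch already loaded at the top of app.py) could look like this:

    # Sketch only; assumes tokenizer, model, and torch from the top of app.py.
    def calculate_similarity(claim, document):
        if not claim or not document:
            return 0.0

        def embed(text):
            # Encode one text on its own and take the pooled [CLS] representation.
            inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
            with torch.no_grad():
                return model(**inputs).pooler_output

        claim_embedding = embed(claim)
        document_embedding = embed(document)
        return torch.cosine_similarity(claim_embedding, document_embedding).item()

The pooled [CLS] vector is one choice; mean-pooling the token embeddings is a common alternative.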
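
A note on the arXiv parsing: in the Atom feed returned by export.arxiv.org, summary is a standard Atom element, so it lives under the http://www.w3.org/2005/Atom namespace rather than http://arxiv.org/schemas/atom (that namespace covers arXiv-specific fields such as arxiv:comment and arxiv:primary_category). The display loop also reads result['link'], which search_arxiv never populates, and the raw query string is not URL-encoded. A possible revision is sketched below; using the entry's id element as the paper URL, the timeout, and the narrowed exception handling are assumptions, not part of this commit:

    # Sketch only; same libraries as app.py plus urllib.parse for query encoding.
    import urllib.parse
    import requests
    import xml.etree.ElementTree as ET

    ATOM = "{http://www.w3.org/2005/Atom}"

    def search_arxiv(query, max_results=3):
        url = (
            "http://export.arxiv.org/api/query?"
            f"search_query=all:{urllib.parse.quote(query)}"
            f"&start=0&max_results={max_results}&sortBy=relevance&sortOrder=descending"
        )
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            return None

        try:
            root = ET.fromstring(response.text)
        except ET.ParseError:
            return None

        search_results = []
        for entry in root.findall(ATOM + "entry"):
            search_results.append({
                "title": entry.findtext(ATOM + "title", default="").strip(),
                # summary is a plain Atom element, like title and author
                "abstract": entry.findtext(ATOM + "summary", default="").strip(),
                "authors": [a.findtext(ATOM + "name", default="")
                            for a in entry.findall(ATOM + "author")],
                # the entry id is the abs page URL; a link element would also work
                "link": entry.findtext(ATOM + "id", default=""),
            })
        return search_results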
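
A note on the Streamlit section: the spinner wraps the rendering loop, which only runs after search_papers has already returned, and the title link is emitted as an inline-JavaScript anchor. One simpler arrangement, assuming each result dict carries 'title', 'abstract', 'authors', and a 'link' URL (the committed search_arxiv does not set 'link' yet), is to wrap the search call in the spinner and render an ordinary markdown link:

    # Sketch only; assumes st, user_input, search_papers, and calculate_similarity
    # are defined earlier in app.py, and that each result includes a 'link' URL.
    if st.button('Substantiate'):
        with st.spinner('Searching for relevant research papers...'):
            search_results = search_papers(user_input)

        if search_results:
            for result in search_results:
                st.markdown(f"[{result['title']}]({result['link']})")  # plain markdown link
                st.write(result["abstract"])
                st.write("Authors: ", ", ".join(result["authors"]))
                similarity = calculate_similarity(user_input, result["abstract"])
                st.write("Similarity Score: ", similarity)
                st.write("-----")
        else:
            st.write("No results found.")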