File size: 4,890 Bytes
c6adab2 92f0bd3 c6adab2 f0f56d8 c6adab2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import datetime
import math
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import gradio as gr
def boolean_search(paragraph, query):
    """Evaluate a simple left-to-right boolean word query against a paragraph.

    The query is a whitespace-separated alternation of words and operators,
    e.g. ``"chatgpt and summarisation"`` or ``"chatgpt not gpt3"``. Matching is
    case-insensitive and on whole whitespace-delimited tokens only (no phrase
    or substring support). Operators are applied strictly left to right with
    no precedence: ``and``/``or`` combine as usual, and ``a not b`` means
    "contains a and does not contain b".

    Args:
        paragraph: Text to search in (typically an abstract).
        query: Boolean query string; an empty query returns False.

    Returns:
        bool: True when the paragraph satisfies the query.
    """
    # A set gives O(1) membership tests; the original dict.fromkeys(..., True)
    # was a set in disguise.
    vocab = set(paragraph.lower().split())
    tokens = query.lower().split()
    if not tokens:
        # Guard: the original raised IndexError on an empty query.
        return False
    result = tokens[0] in vocab
    # Operators sit at odd positions, operands at the following even position.
    # Stopping at len(tokens) - 1 ignores a dangling trailing operator instead
    # of raising IndexError.
    for i in range(1, len(tokens) - 1, 2):
        operator = tokens[i]
        operand = tokens[i + 1] in vocab
        if operator == 'and':
            result = result and operand
        elif operator == 'or':
            result = result or operand
        elif operator == 'not':
            result = result and not operand
    return result
def parse_retrieved(retrieved_examples, scores, filters, k, result_keys=None):
    """Convert raw FAISS retrieval output into filtered result dicts.

    Args:
        retrieved_examples: Columnar dict from ``get_nearest_examples`` —
            each key maps to a list aligned with ``scores``. Must contain the
            metadata columns in ``result_keys`` plus ``id`` and ``versions``.
        scores: Relevance scores, one per retrieved example.
        filters: Dict with keys ``limit2_pwc`` (bool, require a repo link),
            ``sy``/``ey`` (int, inclusive start/end year), and
            ``boolean_terms`` (str, passed to ``boolean_search``; "" disables).
        k: Maximum number of results to return.
        result_keys: Metadata columns copied verbatim into each result dict.
            Defaults to the module-level ``keys`` list.

    Returns:
        list: ``[results, repo_avail, in_date, boolmet]`` where ``results`` is
        the (at most k) surviving result dicts and the three ints count how
        many of ALL retrieved examples had a repo, fell in the date range, and
        met the boolean terms respectively.
    """
    if result_keys is None:
        result_keys = keys  # module-level default column list
    results = []
    n = len(scores)
    # Counters start at the retrieval size and are decremented per failure.
    repo_avail = in_date = boolmet = n
    for i in range(n):
        resdict = {key: retrieved_examples[key][i] for key in result_keys}
        arxiv_id = retrieved_examples['id'][i]
        resdict['arxiv_url'] = "https://arxiv.org/abs/{}".format(arxiv_id)
        resdict['pdf_url'] = "https://arxiv.org/pdf/{}.pdf".format(arxiv_id)
        # BUG FIX: the original indexed versions[0][0], stamping every result
        # with the FIRST hit's publication date; index row i instead.
        # versions[i][0] is the earliest (v1) submission of paper i.
        resdict['published'] = retrieved_examples['versions'][i][0]['created']
        resdict['year'] = datetime.datetime.strptime(
            resdict['published'], "%a, %d %b %Y %H:%M:%S %Z").year
        # Truncate to at most 5 chars so e.g. "0.912" stays compact.
        resdict['score'] = str(round(scores[i], 3))[:5]
        relevant = True
        if resdict['repo_url'] is None:
            repo_avail -= 1
            resdict['repo_url'] = ""  # template expects a string, not None
            if filters['limit2_pwc']:
                relevant = False
        if filters['sy'] > resdict['year'] or filters['ey'] < resdict['year']:
            relevant = False
            in_date -= 1
        if filters['boolean_terms'] != "":
            if not boolean_search(resdict['abstract'], filters['boolean_terms']):
                relevant = False
                boolmet -= 1
        if relevant:
            results.append(resdict)
    return [results[:k], repo_avail, in_date, boolmet]
def create_metadata_html(metadata_dict):
    """Render one search hit as a self-contained HTML card.

    Args:
        metadata_dict: Result dict providing the keys ``title``, ``score``,
            ``published``, ``authors``, ``categories``, ``year``,
            ``arxiv_url``, ``pdf_url``, ``abstract`` and ``repo_url``.

    Returns:
        str: The populated HTML fragment.
    """
    template = '''
    <div style="border: 1px solid #ccc; padding: 10px; background-color: #f9f9f9;">
        <h2>{title}</h2>
        <pre><p><strong>Relevance_score:</strong> {score}    <strong>Published:</strong> {published}</p></pre>
        <p><strong>Authors:</strong> {authors}</p>
        <pre><p><strong>Categories:</strong> {categories}    <strong>Year:</strong> {year}</p></pre>
        <pre><p><a href="{arxiv_url}"><strong>ArXiv URL</strong></a>    <a href="{pdf_url}"><strong>PDF URL</strong></a></p></pre>
        <p><strong>Abstract:</strong> {abstract}</p>
        <p><strong>Repo URL:</strong> <a href="{repo_url}">{repo_url}</a><p>
    </div>
    '''
    return template.format(**metadata_dict)
def search(query, boolean_terms, sy, ey, limit2_pwc):
    """Run a semantic search over the arXiv index and render results as HTML.

    Encodes the query with the module-level SentenceTransformer, fetches the
    100 nearest abstracts from the FAISS index, applies the user filters via
    ``parse_retrieved``, and returns a single HTML string: a summary header
    followed by one card per surviving result (rendered in reversed order, as
    in the original UI).

    Args:
        query: Free-text semantic query.
        boolean_terms: Boolean word filter on abstracts ("" disables).
        sy, ey: Inclusive start / end publication year.
        limit2_pwc: When True, keep only papers with a linked repo.

    Returns:
        str: HTML for the results panel.
    """
    top_k = 30
    query_vector = model.encode(query)
    nn_scores, nn_examples = ds['train'].get_nearest_examples(
        'embeddings', query_vector, k=100)
    active_filters = {
        'limit2_pwc': limit2_pwc,
        'sy': sy,
        'ey': ey,
        'boolean_terms': boolean_terms,
    }
    hits, n_repo, n_in_date, n_bool = parse_retrieved(
        nn_examples, nn_scores, active_filters, top_k)
    # Build cards in reversed hit order (matches the original divs.reverse()).
    cards = [create_metadata_html(hit) for hit in reversed(hits)]
    header = "<br><br><pre><strong>Articles with Repo:</strong> {}    <strong>Articles in date range:</strong> {}    <strong>Articles meeting boolean terms:</strong> {}</pre><br><strong>Top 30 results returned</strong><br>".format(str(n_repo), str(n_in_date), str(n_bool))
    return header + "<br>".join(cards)
# --- Module-level setup ----------------------------------------------------
# Note: the original had a `global keys` statement here, which is a no-op at
# module scope (all module-level names are already global) — removed.

# Metadata columns copied verbatim from each retrieved dataset row.
keys = ['title', 'authors', 'categories', 'abstract', 'repo_url', 'is_official', 'mentioned_in_paper']

# Pre-embedded arXiv CS dataset; build a FAISS index over the embedding
# column so nearest-neighbour lookups in search() are fast.
ds = load_dataset("Corran/Arxiv_V12July23_Post2013CS_AllMiniV2L6")
ds['train'].add_faiss_index(column='embeddings')

# Encoder for incoming queries — must match the model that produced the
# stored 'embeddings' column (all-MiniLM-L6-v2, per the dataset name).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# --- Gradio UI -------------------------------------------------------------
search_interface = gr.Blocks()
with search_interface:
    # Components instantiated inside the Blocks context are added to the
    # layout automatically. (The original also had a stray `fn = search,`
    # here — a dead tuple assignment — which has been removed.)
    inputs = [
        gr.Textbox(label="Query", value="", info="Search Query"),
        gr.Textbox(label="Boolean Terms", value="", info="Simple boolean conditions on words contained in the abstract (AND OR and NOT accepted for individual words, exact phrase isn't supported)"),
        gr.Slider(2013, 2023, step=1, value=2013, label="Start Year", info="Choose the earliest date for papers retrieved"),
        gr.Slider(2013, 2023, step=1, value=2023, label="End Year", info="Choose the latest date for papers retrieved"),
        gr.Checkbox(value=False, label="Limit results to those with a link to a github repo via pwc"),
    ]
    # gr.Button takes its caption as `value`, not `label`.
    run = gr.Button("Search")
    # gr.HTML is the Blocks-era component; gr.outputs.HTML is the deprecated
    # pre-Blocks API and breaks on current Gradio releases.
    output = gr.HTML()
    # NOTE(review): example inputs were defined but never wired into the UI in
    # the original; kept for reference — could be surfaced via gr.Examples().
    examples = [
        ["We research the use of chatgpt on scientific article summarisation. Summaries are of scientific articles", "chatgpt AND NOT gpt3", 2013, 2023, True],
    ]
    run.click(fn=search, inputs=inputs, outputs=output, api_name="Arxiv Semantic Search")
search_interface.launch()
|