File size: 4,890 Bytes
c6adab2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92f0bd3
c6adab2
 
 
 
 
 
 
 
 
 
 
f0f56d8
c6adab2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import datetime
import math

from datasets import load_dataset
from sentence_transformers import SentenceTransformer

import gradio as gr

def boolean_search(paragraph, query):
    """Evaluate a simple left-to-right boolean word query against a paragraph.

    The query is a whitespace-separated alternation of terms and operators,
    e.g. "cat and dog", "cat or dog", "cat not dog". Matching is
    case-insensitive on whole whitespace-delimited words; there is no
    operator precedence — the expression is folded strictly left to right.

    Args:
        paragraph: text to search (typically an abstract).
        query: boolean query string; "" or whitespace-only returns False.

    Returns:
        True if the paragraph satisfies the query, else False.
    """
    # Set membership gives O(1) word lookups (the original built a
    # dict with True values, which is just a slower set).
    words = set(paragraph.lower().split())

    query_words = query.lower().split()

    # Empty query matches nothing (the original raised IndexError here).
    if not query_words:
        return False

    result = query_words[0] in words

    # Walk (operator, term) pairs; stopping at len-1 also tolerates a
    # dangling trailing operator instead of raising IndexError.
    for i in range(1, len(query_words) - 1, 2):
        operator = query_words[i]
        operand = query_words[i + 1] in words

        if operator == 'and':
            result = result and operand
        elif operator == 'or':
            result = result or operand
        elif operator == 'not':
            # "a not b" is interpreted as "a AND NOT b".
            result = result and not operand

    return result

def parse_retrieved(retrieved_examples, scores, filters, k,
                    keys=('title', 'authors', 'categories', 'abstract',
                          'repo_url', 'is_official', 'mentioned_in_paper')):
  """Filter FAISS-retrieved arxiv records and build per-result dicts.

  Args:
    retrieved_examples: column-oriented dict of lists as returned by
      Dataset.get_nearest_examples (one list per column, aligned by index).
    scores: similarity scores, same length as each column list.
    filters: dict with 'limit2_pwc' (bool, require a repo link),
      'sy'/'ey' (inclusive start/end year) and 'boolean_terms'
      (query string for boolean_search; "" skips the boolean filter).
    k: maximum number of results to return.
    keys: record columns copied verbatim into each result dict. Defaults
      to the columns this app uses (previously read from a global).

  Returns:
    [results[:k], repo_avail, in_date, boolmet] where the three counters
    are how many retrieved records had a repo link, fell in the date
    range, and met the boolean terms, respectively.
  """
  results = []
  # Counters start at the full retrieval count and are decremented as
  # records fail each criterion.
  repo_avail = in_date = boolmet = len(scores)

  for i in range(len(scores)):
    resdict = {key: retrieved_examples[key][i] for key in keys}
    arxiv_id = retrieved_examples['id'][i]
    resdict['arxiv_url'] = "https://arxiv.org/abs/{}".format(arxiv_id)
    resdict['pdf_url'] = "https://arxiv.org/pdf/{}.pdf".format(arxiv_id)
    # BUG FIX: index the i-th record's version list ([i][0]); the original
    # used [0][0], giving every result the FIRST record's publication date.
    resdict['published'] = retrieved_examples['versions'][i][0]['created']
    resdict['year'] = datetime.datetime.strptime(
        resdict['published'], "%a, %d %b %Y %H:%M:%S %Z").year
    # Truncate to 5 chars so e.g. "0.988" stays compact in the UI.
    resdict['score'] = str(round(scores[i], 3))[:5]
    relevant = True

    if resdict['repo_url'] is None:
      repo_avail -= 1
      resdict['repo_url'] = ""
      if filters['limit2_pwc']:
        relevant = False

    if filters['sy'] > resdict['year'] or filters['ey'] < resdict['year']:
      relevant = False
      in_date -= 1

    if filters['boolean_terms'] != "":
      if not boolean_search(resdict['abstract'], filters['boolean_terms']):
        relevant = False
        boolmet -= 1

    if relevant:
      results.append(resdict)

  return [results[:k], repo_avail, in_date, boolmet]

def create_metadata_html(metadata_dict):
    """Render one retrieved-article record as an HTML card.

    `metadata_dict` must supply every placeholder referenced in the
    template (title, score, published, authors, categories, year,
    arxiv_url, pdf_url, abstract, repo_url); any extra keys are ignored.
    """
    template = '''
    <div style="border: 1px solid #ccc; padding: 10px; background-color: #f9f9f9;">
        <h2>{title}</h2>
        <pre><p><strong>Relevance_score:</strong> {score}    <strong>Published:</strong> {published}</p></pre>
        <p><strong>Authors:</strong> {authors}</p>
        <pre><p><strong>Categories:</strong> {categories}      <strong>Year:</strong> {year}</p></pre>
        <pre><p><a href="{arxiv_url}"><strong>ArXiv URL</strong></a>    <a href="{pdf_url}"><strong>PDF URL</strong></a></p></pre>     
        <p><strong>Abstract:</strong> {abstract}</p>
        <p><strong>Repo URL:</strong> <a href="{repo_url}">{repo_url}</a><p>
    </div>
    '''
    # format_map reads keys straight from the mapping — equivalent to
    # format(**metadata_dict) for a plain dict.
    rendered = template.format_map(metadata_dict)
    return rendered

def search(query, boolean_terms, sy, ey, limit2_pwc):
  """Run a semantic search over the indexed dataset and return result HTML.

  Encodes the query with the module-level sentence-transformer, fetches
  the 100 nearest records from the FAISS index, filters them via
  parse_retrieved, and renders up to 30 result cards plus a stats header.
  """
  top_k = 30

  # Nearest-neighbour lookup on the precomputed 'embeddings' column.
  query_vec = model.encode(query)
  scores, retrieved = ds['train'].get_nearest_examples('embeddings', query_vec, k=100)

  active_filters = {
      'limit2_pwc': limit2_pwc,
      'sy': sy,
      'ey': ey,
      'boolean_terms': boolean_terms,
  }

  hits, repo_avail, in_date, boolmet = parse_retrieved(retrieved, scores, active_filters, top_k)

  cards = [create_metadata_html(hit) for hit in hits]
  cards.reverse()

  header = "<br><br><pre><strong>Articles with Repo:</strong> {}    <strong>Articles in date range:</strong> {}    <strong>Articles meeting boolean terms:</strong> {}</pre><br><strong>Top 30 results returned</strong><br>".format(str(repo_avail), str(in_date), str(boolmet))
  return header + "<br>".join(cards)


# Dataset columns copied into each search-result dict.
# (`global` at module level is a no-op; kept as written.)
global keys
keys = ['title','authors','categories','abstract','repo_url','is_official','mentioned_in_paper']


# Load the pre-embedded arxiv dataset and build a FAISS index over its
# 'embeddings' column so get_nearest_examples() can do similarity search.
ds = load_dataset("Corran/Arxiv_V12July23_Post2013CS_AllMiniV2L6")
ds['train'].add_faiss_index(column='embeddings')

# Encoder for incoming queries — presumably the same model that produced
# the dataset embeddings (the dataset name suggests AllMiniV2L6); confirm.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


# Build the Gradio UI: query box, boolean filter, year-range sliders,
# a repo-only checkbox, a search button, and an HTML pane for results.
search_interface = gr.Blocks()

with search_interface:
    # (Removed dead statement `fn = search,` — it only built an unused tuple.)
    inputs = [
      gr.Textbox(label="Query",value="",info="Search Query"),
      gr.Textbox(label="Boolean Terms",value="",info="Simple boolean conditions on words contained in the abstract (AND OR and NOT accepted for individual words, exact phrase isn't supported)"),
      gr.Slider(2013, 2023,step=1, value=2013, label="Start Year", info="Choose the earliest date for papers retrieved"),
      gr.Slider(2013, 2023,step=1, value=2023, label="End Year", info="Choose the latest date for papers retrieved"),
      gr.Checkbox(value=False,label="Limit results to those with a link to a github repo via pwc")
    ]
    # gr.Button takes its caption as `value`, not `label`.
    run = gr.Button(value="Search")
    # NOTE(review): `examples` is currently unused — wire it up with
    # gr.Examples(examples=examples, inputs=inputs) if example queries
    # should be shown in the UI.
    examples=[
        ["We research the use of chatgpt on scientific article summarisation. Summaries are of scientific articles", "chatgpt AND NOT gpt3", 2013, 2023, True],
    ]
    # gr.outputs.HTML() is deprecated (removed in Gradio 4); gr.HTML() is
    # the equivalent component.
    output = gr.HTML()
    run.click(fn=search, inputs=inputs, outputs=output, api_name="Arxiv Semantic Search")

search_interface.launch()