File size: 4,890 Bytes
c6adab2 92f0bd3 c6adab2 f0f56d8 c6adab2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
import datetime
import math
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import gradio as gr
def boolean_search(paragraph, query):
    """Evaluate a simple left-to-right boolean word query against a paragraph.

    The query is a whitespace-separated alternation of words and operators,
    e.g. ``"chatgpt and summarisation"`` or ``"chatgpt not gpt3"``. Matching is
    case-insensitive and on whole whitespace-delimited tokens only (no phrase
    or substring support). Operators are applied strictly left to right with
    no precedence: ``and``/``or`` combine as usual, and ``a not b`` means
    "contains a and does not contain b".

    Args:
        paragraph: Text to search in (typically an abstract).
        query: Boolean query string; an empty query returns False.

    Returns:
        bool: True when the paragraph satisfies the query.
    """
    # A set gives O(1) membership tests; the original dict.fromkeys(..., True)
    # was a set in disguise.
    vocab = set(paragraph.lower().split())
    tokens = query.lower().split()
    if not tokens:
        # Guard: the original raised IndexError on an empty query.
        return False
    result = tokens[0] in vocab
    # Operators sit at odd positions, operands at the following even position.
    # Stopping at len(tokens) - 1 ignores a dangling trailing operator instead
    # of raising IndexError.
    for i in range(1, len(tokens) - 1, 2):
        operator = tokens[i]
        operand = tokens[i + 1] in vocab
        if operator == 'and':
            result = result and operand
        elif operator == 'or':
            result = result or operand
        elif operator == 'not':
            result = result and not operand
    return result
def parse_retrieved(retrieved_examples, scores, filters, k, result_keys=None):
    """Convert raw FAISS retrieval output into filtered result dicts.

    Args:
        retrieved_examples: Columnar dict from ``get_nearest_examples`` —
            each key maps to a list aligned with ``scores``. Must contain the
            metadata columns in ``result_keys`` plus ``id`` and ``versions``.
        scores: Relevance scores, one per retrieved example.
        filters: Dict with keys ``limit2_pwc`` (bool, require a repo link),
            ``sy``/``ey`` (int, inclusive start/end year), and
            ``boolean_terms`` (str, passed to ``boolean_search``; "" disables).
        k: Maximum number of results to return.
        result_keys: Metadata columns copied verbatim into each result dict.
            Defaults to the module-level ``keys`` list.

    Returns:
        list: ``[results, repo_avail, in_date, boolmet]`` where ``results`` is
        the (at most k) surviving result dicts and the three ints count how
        many of ALL retrieved examples had a repo, fell in the date range, and
        met the boolean terms respectively.
    """
    if result_keys is None:
        result_keys = keys  # module-level default column list
    results = []
    n = len(scores)
    # Counters start at the retrieval size and are decremented per failure.
    repo_avail = in_date = boolmet = n
    for i in range(n):
        resdict = {key: retrieved_examples[key][i] for key in result_keys}
        arxiv_id = retrieved_examples['id'][i]
        resdict['arxiv_url'] = "https://arxiv.org/abs/{}".format(arxiv_id)
        resdict['pdf_url'] = "https://arxiv.org/pdf/{}.pdf".format(arxiv_id)
        # BUG FIX: the original indexed versions[0][0], stamping every result
        # with the FIRST hit's publication date; index row i instead.
        # versions[i][0] is the earliest (v1) submission of paper i.
        resdict['published'] = retrieved_examples['versions'][i][0]['created']
        resdict['year'] = datetime.datetime.strptime(
            resdict['published'], "%a, %d %b %Y %H:%M:%S %Z").year
        # Truncate to at most 5 chars so e.g. "0.912" stays compact.
        resdict['score'] = str(round(scores[i], 3))[:5]
        relevant = True
        if resdict['repo_url'] is None:
            repo_avail -= 1
            resdict['repo_url'] = ""  # template expects a string, not None
            if filters['limit2_pwc']:
                relevant = False
        if filters['sy'] > resdict['year'] or filters['ey'] < resdict['year']:
            relevant = False
            in_date -= 1
        if filters['boolean_terms'] != "":
            if not boolean_search(resdict['abstract'], filters['boolean_terms']):
                relevant = False
                boolmet -= 1
        if relevant:
            results.append(resdict)
    return [results[:k], repo_avail, in_date, boolmet]
def create_metadata_html(metadata_dict):
    """Render one search hit as a self-contained HTML card.

    Args:
        metadata_dict: Result dict providing the keys ``title``, ``score``,
            ``published``, ``authors``, ``categories``, ``year``,
            ``arxiv_url``, ``pdf_url``, ``abstract`` and ``repo_url``.

    Returns:
        str: The populated HTML fragment.
    """
    template = '''
    <div style="border: 1px solid #ccc; padding: 10px; background-color: #f9f9f9;">
        <h2>{title}</h2>
        <pre><p><strong>Relevance_score:</strong> {score}    <strong>Published:</strong> {published}</p></pre>
        <p><strong>Authors:</strong> {authors}</p>
        <pre><p><strong>Categories:</strong> {categories}    <strong>Year:</strong> {year}</p></pre>
        <pre><p><a href="{arxiv_url}"><strong>ArXiv URL</strong></a>    <a href="{pdf_url}"><strong>PDF URL</strong></a></p></pre>
        <p><strong>Abstract:</strong> {abstract}</p>
        <p><strong>Repo URL:</strong> <a href="{repo_url}">{repo_url}</a><p>
    </div>
    '''
    return template.format(**metadata_dict)
def search(query, boolean_terms, sy, ey, limit2_pwc):
    """Run a semantic search over the arXiv index and render results as HTML.

    Encodes the query with the module-level SentenceTransformer, fetches the
    100 nearest abstracts from the FAISS index, applies the user filters via
    ``parse_retrieved``, and returns a single HTML string: a summary header
    followed by one card per surviving result (rendered in reversed order, as
    in the original UI).

    Args:
        query: Free-text semantic query.
        boolean_terms: Boolean word filter on abstracts ("" disables).
        sy, ey: Inclusive start / end publication year.
        limit2_pwc: When True, keep only papers with a linked repo.

    Returns:
        str: HTML for the results panel.
    """
    top_k = 30
    query_vector = model.encode(query)
    nn_scores, nn_examples = ds['train'].get_nearest_examples(
        'embeddings', query_vector, k=100)
    active_filters = {
        'limit2_pwc': limit2_pwc,
        'sy': sy,
        'ey': ey,
        'boolean_terms': boolean_terms,
    }
    hits, n_repo, n_in_date, n_bool = parse_retrieved(
        nn_examples, nn_scores, active_filters, top_k)
    # Build cards in reversed hit order (matches the original divs.reverse()).
    cards = [create_metadata_html(hit) for hit in reversed(hits)]
    header = "<br><br><pre><strong>Articles with Repo:</strong> {}    <strong>Articles in date range:</strong> {}    <strong>Articles meeting boolean terms:</strong> {}</pre><br><strong>Top 30 results returned</strong><br>".format(str(n_repo), str(n_in_date), str(n_bool))
    return header + "<br>".join(cards)
# --- Module-level setup ----------------------------------------------------
# Note: the original had a `global keys` statement here, which is a no-op at
# module scope (all module-level names are already global) — removed.

# Metadata columns copied verbatim from each retrieved dataset row.
keys = ['title', 'authors', 'categories', 'abstract', 'repo_url', 'is_official', 'mentioned_in_paper']

# Pre-embedded arXiv CS dataset; build a FAISS index over the embedding
# column so nearest-neighbour lookups in search() are fast.
ds = load_dataset("Corran/Arxiv_V12July23_Post2013CS_AllMiniV2L6")
ds['train'].add_faiss_index(column='embeddings')

# Encoder for incoming queries — must match the model that produced the
# stored 'embeddings' column (all-MiniLM-L6-v2, per the dataset name).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# --- Gradio UI -------------------------------------------------------------
search_interface = gr.Blocks()
with search_interface:
    # Components instantiated inside the Blocks context are added to the
    # layout automatically. (The original also had a stray `fn = search,`
    # here — a dead tuple assignment — which has been removed.)
    inputs = [
        gr.Textbox(label="Query", value="", info="Search Query"),
        gr.Textbox(label="Boolean Terms", value="", info="Simple boolean conditions on words contained in the abstract (AND OR and NOT accepted for individual words, exact phrase isn't supported)"),
        gr.Slider(2013, 2023, step=1, value=2013, label="Start Year", info="Choose the earliest date for papers retrieved"),
        gr.Slider(2013, 2023, step=1, value=2023, label="End Year", info="Choose the latest date for papers retrieved"),
        gr.Checkbox(value=False, label="Limit results to those with a link to a github repo via pwc"),
    ]
    # gr.Button takes its caption as `value`, not `label`.
    run = gr.Button("Search")
    # gr.HTML is the Blocks-era component; gr.outputs.HTML is the deprecated
    # pre-Blocks API and breaks on current Gradio releases.
    output = gr.HTML()
    # NOTE(review): example inputs were defined but never wired into the UI in
    # the original; kept for reference — could be surfaced via gr.Examples().
    examples = [
        ["We research the use of chatgpt on scientific article summarisation. Summaries are of scientific articles", "chatgpt AND NOT gpt3", 2013, 2023, True],
    ]
    run.click(fn=search, inputs=inputs, outputs=output, api_name="Arxiv Semantic Search")
search_interface.launch()
|