import gradio as gr from sentence_transformers import SentenceTransformer, util import torch import pandas as pd import faiss import numpy as np model = SentenceTransformer("Elise-hf/distilbert-base-pwc-multi-task") # Load the Faiss indexes and data files index = faiss.read_index("all_inst_index") tasks_index = faiss.read_index("tasks_index") methods_index = faiss.read_index("methods_index") labels = pd.read_json('lbl_gpt3_davinci_clean_with_counts.json') methods = pd.read_json('methods.json') papers = pd.read_json('title_url_clean.json') tasks_embeddings = np.load('tasks_embeddings.npy') methods_embeddings = np.load('methods_embeddings.npy') def search_faiss_single(index, inst_embeddings, top_k): # faiss.normalize_L2(inst_embeddings) D, I = index.search(inst_embeddings, top_k) return D, I def find_similar_papers_tasks_methods(title, abstract, k=100): # Load the pre-trained model # Add the title and the abstract together query = title + '' + abstract # Encode the query sentence into an embedding query_embedding = model.encode([query], convert_to_numpy=True) # Search for the top k most similar papers D, I = search_faiss_single(index, query_embedding, k) # Search for the top k most similar tasks D_tasks, I_tasks = search_faiss_single(tasks_index, query_embedding, k) norm = np.linalg.norm(tasks_embeddings[I_tasks[0]], axis=1) * np.linalg.norm(query_embedding, axis=1)[:, None] D_tasks /= norm # Search for the top k most similar methods D_methods, I_methods = search_faiss_single(methods_index, query_embedding, k) norm = np.linalg.norm(methods_embeddings[I_methods[0]], axis=1) * np.linalg.norm(query_embedding, axis=1)[:, None] D_methods /= norm # Create a dictionary of the top k similar tasks and their cosine similarities tasks_results = dict(zip(labels.loc[I_tasks[0]].title, D_tasks[0].tolist())) # Create a dictionary of the top k similar methods and their cosine similarities methods_results = dict(zip(methods.loc[I_methods[0]].title, D_methods[0].tolist())) # Return the dictionaries of the top k similar tasks and methods, and the dataframe of the top k similar papers return tasks_results, methods_results,papers.loc[I[0]] with gr.Blocks() as demo: with gr.TabItem("Task Search"): gr.Markdown( """ # Identify Relevant Tasks from Abstracts """ ) title = gr.components.Textbox(label="Enter an paper's title") abstract = gr.components.Textbox(label="Enter an abstract to discover relevant tasks from it") btn = gr.Button("Submit") with gr.Row(): tasks_table = gr.components.Label(label="Relevant Tasks from PapersWithCode") methods_table = gr.components.Label(label="Relevant Methods from PapersWithCode") output_df = gr.Dataframe( headers=["title", "paper_url"], datatype=["str", "str"], row_count=10, col_count=(2, "fixed"), label="Relevant papers from PapersWithCode" ) btn.click(fn=find_similar_papers_tasks_methods, inputs=[title, abstract], outputs=[tasks_table, methods_table,output_df]) # gr.Examples(examples, inputs=[title, abstract], cache_examples=True, fn=find_similar_papers_tasks_methods, # outputs=[tasks_table, methods_table,output_df]) demo.launch()