Spaces:
Runtime error
Runtime error
gamingflexer
committed on
Commit
·
3523c03
1
Parent(s):
cc93c45
app updated for search
Browse files- src/app.py +41 -0
src/app.py
CHANGED
@@ -5,6 +5,19 @@ from scrapper.main import ArxivPaper
|
|
5 |
from config import *
|
6 |
from db.db_functions import get_correct_author_name, insert_papers_data, fetch_papers_data, get_unquine_authors
|
7 |
from utils import compare_paper_ids
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
def plagiarism_checker(authors_name_fetch,number_of_results_fetch, progress=gr.Progress()):
|
10 |
number_of_results_fetch = int(number_of_results_fetch)
|
@@ -56,6 +69,24 @@ def fetch_papers_data_df(authors_name: str, progress=gr.Progress()):
|
|
56 |
progress(0.8, desc="Making DataFrame")
|
57 |
return pd.DataFrame(fetched_data)
|
58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
with gr.Blocks() as demo:
|
60 |
|
61 |
with gr.Tab("Get Papers Data"):
|
@@ -83,8 +114,18 @@ with gr.Blocks() as demo:
|
|
83 |
authors_name = gr.Textbox(label="Enter Author's Name")
|
84 |
number_of_results = gr.Number(label="Number of results - Min - 5")
|
85 |
submit_button = gr.Button("Start")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
submit_button_tab_1.click(fn=plagiarism_checker,inputs=[authors_name_fetch, number_of_results_fetch] ,outputs= completed)
|
88 |
submit_button_tab_2.click(fn=fetch_papers_data_df,inputs=[authors_name_paper] ,outputs=dataframe_output)
|
|
|
89 |
|
90 |
demo.launch()
|
|
|
5 |
from config import *
|
6 |
from db.db_functions import get_correct_author_name, insert_papers_data, fetch_papers_data, get_unquine_authors
|
7 |
from utils import compare_paper_ids
|
8 |
+
import chromadb
|
9 |
+
from chromadb.config import Settings
|
10 |
+
from chromadb.utils import embedding_functions
|
11 |
+
from config import OPENAI_API_KEY
|
12 |
+
|
13 |
+
# Name of the OpenAI embedding model handed to the embedding function below.
emmbedding_model = "text-embedding-3-large"
|
14 |
+
# Embedding function built from the model name above and OPENAI_API_KEY (imported from config).
openai_ef = embedding_functions.OpenAIEmbeddingFunction(model_name=emmbedding_model,api_key=OPENAI_API_KEY)
|
15 |
+
# NOTE(review): `deploy` is not defined in the visible code; presumably it comes from `from config import *` - confirm.
if deploy:
|
16 |
+
# When `deploy` is truthy, connect to a Chroma server over HTTP on localhost:8000.
chroma_client = chromadb.HttpClient(host='localhost', port=8000)
|
17 |
+
else:
|
18 |
+
# Otherwise use an on-disk persistent client rooted at this hard-coded path.
chroma_client = chromadb.PersistentClient(path="/home/ubuntu/research/data/emeddeings")
|
19 |
+
|
20 |
+
# Collection queried by the search code; created on first use if it does not exist yet.
collection_doc = chroma_client.get_or_create_collection(name="2024_document_lvl_test")
|
21 |
|
22 |
def plagiarism_checker(authors_name_fetch,number_of_results_fetch, progress=gr.Progress()):
|
23 |
number_of_results_fetch = int(number_of_results_fetch)
|
|
|
69 |
progress(0.8, desc="Making DataFrame")
|
70 |
return pd.DataFrame(fetched_data)
|
71 |
|
72 |
+
def embedding_searcher(embbed_text_search, top_k=4, progress=gr.Progress()):
    """Search the Chroma collection for entries closest to a text query.

    The query text is embedded with ``openai_ef`` and the nearest entries of
    ``collection_doc`` are returned as a table.

    Args:
        embbed_text_search: Free text to search for.
        top_k: Number of results to request. The UI wires a ``gr.Number`` to
            this parameter, so the value may arrive as a float or ``None``;
            ``None`` falls back to the default of 4, other values are
            converted with ``int()`` and raised to at least 1.
        progress: Gradio progress tracker. Unused here; kept so the event
            handler signature stays unchanged.

    Returns:
        A ``pandas.DataFrame`` with the columns ``ID``, ``Distance``,
        ``Title``, ``Authors`` and ``Source``, one row per hit. The frame is
        empty when the query text is missing or blank. Metadata fields that
        a hit does not carry are reported as ``None``.
    """
    columns = ['ID', 'Distance', 'Title', 'Authors', 'Source']

    # Nothing to embed: return an empty table instead of calling the
    # embedding API with blank input.
    if not embbed_text_search or not str(embbed_text_search).strip():
        return pd.DataFrame(columns=columns)

    # The query call needs a positive integer, but a numeric UI field can
    # hand over a float (or None when left empty).
    n_results = 4 if top_k is None else max(1, int(top_k))

    data = collection_doc.query(
        query_embeddings=openai_ef([embbed_text_search]),
        n_results=n_results,
    )

    # Results are nested per query; only one query is sent, so take index 0.
    # A hit without metadata is treated as an empty mapping.
    metadata_list = [metadata or {} for metadata in data['metadatas'][0]]

    return pd.DataFrame(
        {
            'ID': data['ids'][0],
            'Distance': data['distances'][0],
            'Title': [metadata.get('title') for metadata in metadata_list],
            'Authors': [metadata.get('authors') for metadata in metadata_list],
            'Source': [metadata.get('source') for metadata in metadata_list],
        },
        columns=columns,
    )
|
89 |
+
|
90 |
with gr.Blocks() as demo:
|
91 |
|
92 |
with gr.Tab("Get Papers Data"):
|
|
|
114 |
authors_name = gr.Textbox(label="Enter Author's Name")
|
115 |
number_of_results = gr.Number(label="Number of results - Min - 5")
|
116 |
submit_button = gr.Button("Start")
|
117 |
+
|
118 |
+
with gr.Tab("Open Embeddings Search"):
|
119 |
+
with gr.Row():
|
120 |
+
embbed_text_search = gr.Textbox(label="Enter Text")
|
121 |
+
with gr.Row():
|
122 |
+
top_k = gr.Number(label="Number of results - Min 2")
|
123 |
+
with gr.Row():
|
124 |
+
submit_button_tab_4 = gr.Button("Start")
|
125 |
+
dataframe_output_tab_4 = gr.Dataframe(headers=['ID', 'Distance', 'Title', 'Authors', 'Source'])
|
126 |
|
127 |
submit_button_tab_1.click(fn=plagiarism_checker,inputs=[authors_name_fetch, number_of_results_fetch] ,outputs= completed)
|
128 |
submit_button_tab_2.click(fn=fetch_papers_data_df,inputs=[authors_name_paper] ,outputs=dataframe_output)
|
129 |
+
submit_button_tab_4.click(fn=embedding_searcher,inputs=[embbed_text_search, top_k] ,outputs= dataframe_output_tab_4)
|
130 |
|
131 |
demo.launch()
|