gamingflexer committed on
Commit
3523c03
·
1 Parent(s): cc93c45

app updated for search

Browse files
Files changed (1) hide show
  1. src/app.py +41 -0
src/app.py CHANGED
@@ -5,6 +5,19 @@ from scrapper.main import ArxivPaper
5
  from config import *
6
  from db.db_functions import get_correct_author_name, insert_papers_data, fetch_papers_data, get_unquine_authors
7
  from utils import compare_paper_ids
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  def plagiarism_checker(authors_name_fetch,number_of_results_fetch, progress=gr.Progress()):
10
  number_of_results_fetch = int(number_of_results_fetch)
@@ -56,6 +69,24 @@ def fetch_papers_data_df(authors_name: str, progress=gr.Progress()):
56
  progress(0.8, desc="Making DataFrame")
57
  return pd.DataFrame(fetched_data)
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  with gr.Blocks() as demo:
60
 
61
  with gr.Tab("Get Papers Data"):
@@ -83,8 +114,18 @@ with gr.Blocks() as demo:
83
  authors_name = gr.Textbox(label="Enter Author's Name")
84
  number_of_results = gr.Number(label="Number of results - Min - 5")
85
  submit_button = gr.Button("Start")
 
 
 
 
 
 
 
 
 
86
 
87
  submit_button_tab_1.click(fn=plagiarism_checker,inputs=[authors_name_fetch, number_of_results_fetch] ,outputs= completed)
88
  submit_button_tab_2.click(fn=fetch_papers_data_df,inputs=[authors_name_paper] ,outputs=dataframe_output)
 
89
 
90
  demo.launch()
 
5
  from config import *
6
  from db.db_functions import get_correct_author_name, insert_papers_data, fetch_papers_data, get_unquine_authors
7
  from utils import compare_paper_ids
8
+ import chromadb
9
+ from chromadb.config import Settings
10
+ from chromadb.utils import embedding_functions
11
+ from config import OPENAI_API_KEY
12
+
13
# Embedding model name and the OpenAI-backed embedding function used to
# vectorise query text for the search tab.
emmbedding_model = "text-embedding-3-large"
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
    model_name=emmbedding_model,
)

# `deploy` is not defined in this file's visible lines; presumably it is
# provided by `from config import *` -- confirm. When truthy, talk to a Chroma
# server over HTTP; otherwise open the on-disk store directly.
chroma_client = (
    chromadb.HttpClient(host='localhost', port=8000)
    if deploy
    else chromadb.PersistentClient(path="/home/ubuntu/research/data/emeddeings")
)

# Collection queried by the embeddings search; created on first use if absent.
collection_doc = chroma_client.get_or_create_collection(name="2024_document_lvl_test")
21
 
22
  def plagiarism_checker(authors_name_fetch,number_of_results_fetch, progress=gr.Progress()):
23
  number_of_results_fetch = int(number_of_results_fetch)
 
69
  progress(0.8, desc="Making DataFrame")
70
  return pd.DataFrame(fetched_data)
71
 
72
def embedding_searcher(embbed_text_search, top_k=4, progress=gr.Progress()):
    """Search the document-level Chroma collection for entries similar to a text.

    The query text is embedded with ``openai_ef`` and the nearest entries of
    ``collection_doc`` are returned as a table.

    Args:
        embbed_text_search: Free text to embed and search for.
        top_k: Number of nearest results requested. The Gradio ``Number``
            input wired to this callback may deliver a float, or ``None``
            when the field is left empty, so the value is coerced to a
            positive int (falling back to 4 when missing) before it is
            handed to Chroma.
        progress: Gradio progress tracker; accepted so the callback
            signature matches the other tabs, but not used here.

    Returns:
        pandas.DataFrame with the columns ``ID``, ``Distance``, ``Title``,
        ``Authors`` and ``Source``. It is empty when the query text is blank.
    """
    columns = ['ID', 'Distance', 'Title', 'Authors', 'Source']

    # A blank query has nothing to embed; skip the API call and return an
    # empty table with the headers the output component expects.
    if embbed_text_search is None or not str(embbed_text_search).strip():
        return pd.DataFrame(columns=columns)

    # n_results must be a positive integer for the collection query.
    n_results = 4 if top_k is None else max(1, int(top_k))

    data = collection_doc.query(
        query_embeddings=openai_ef([embbed_text_search]),
        n_results=n_results,
    )

    # Results are nested per query; only one query text is sent, so take
    # index 0. Entries stored without metadata are treated as empty dicts so
    # one incomplete record does not abort the whole search.
    metadata_list = [metadata or {} for metadata in data['metadatas'][0]]

    return pd.DataFrame(
        {
            'ID': data['ids'][0],
            'Distance': data['distances'][0],
            'Title': [metadata.get('title') for metadata in metadata_list],
            'Authors': [metadata.get('authors') for metadata in metadata_list],
            'Source': [metadata.get('source') for metadata in metadata_list],
        },
        columns=columns,
    )
89
+
90
  with gr.Blocks() as demo:
91
 
92
  with gr.Tab("Get Papers Data"):
 
114
  authors_name = gr.Textbox(label="Enter Author's Name")
115
  number_of_results = gr.Number(label="Number of results - Min - 5")
116
  submit_button = gr.Button("Start")
117
+
118
+ with gr.Tab("Open Embeddings Search"):
119
+ with gr.Row():
120
+ embbed_text_search = gr.Textbox(label="Enter Text")
121
+ with gr.Row():
122
+ top_k = gr.Number(label="Number of results - Min 2")
123
+ with gr.Row():
124
+ submit_button_tab_4 = gr.Button("Start")
125
+ dataframe_output_tab_4 = gr.Dataframe(headers=['ID', 'Distance', 'Title', 'Authors', 'Source'])
126
 
127
  submit_button_tab_1.click(fn=plagiarism_checker,inputs=[authors_name_fetch, number_of_results_fetch] ,outputs= completed)
128
  submit_button_tab_2.click(fn=fetch_papers_data_df,inputs=[authors_name_paper] ,outputs=dataframe_output)
129
+ submit_button_tab_4.click(fn=embedding_searcher,inputs=[embbed_text_search, top_k] ,outputs= dataframe_output_tab_4)
130
 
131
  demo.launch()