kiyer commited on
Commit
036767e
•
1 Parent(s): 2b8b035

switching to gradio for better memory handling.

Browse files

streamlit doesn't handle heavy traffic as well, so keeping gradio's queue for now. access the original streamlit version at https://huggingface.co/spaces/kiyer/pathfinder_v3

Files changed (3) hide show
  1. README.md +3 -3
  2. app_gradio.py +550 -0
  3. prompts.py +63 -0
README.md CHANGED
@@ -3,9 +3,9 @@ title: Pathfinder
3
  emoji: 🔎
4
  colorFrom: yellow
5
  colorTo: blue
6
- sdk: streamlit
7
- sdk_version: 1.37.0
8
- app_file: app.py
9
  pinned: true
10
  license: mit
11
  ---
 
3
  emoji: 🔎
4
  colorFrom: yellow
5
  colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 4.40.0
8
+ app_file: app_gradio.py
9
  pinned: true
10
  license: mit
11
  ---
app_gradio.py ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ from abc import ABC, abstractmethod
4
+ from typing import List, Dict, Any, Tuple
5
+ from collections import defaultdict
6
+ import pandas as pd
7
+ from datetime import datetime, date
8
+ from datasets import load_dataset, load_from_disk
9
+ from collections import Counter
10
+
11
+ import yaml, json, requests, sys, os, time
12
+ import urllib.parse
13
+ import concurrent.futures
14
+
15
+ from langchain import hub
16
+ from langchain_openai import ChatOpenAI as openai_llm
17
+ from langchain_openai import OpenAIEmbeddings
18
+ from langchain_core.runnables import RunnableConfig, RunnablePassthrough, RunnableParallel
19
+ from langchain_core.prompts import PromptTemplate
20
+ from langchain_community.callbacks import StreamlitCallbackHandler
21
+ from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
22
+ from langchain_community.vectorstores import Chroma
23
+ from langchain_community.document_loaders import TextLoader
24
+ from langchain.agents import create_react_agent, Tool, AgentExecutor
25
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
26
+ from langchain_core.output_parsers import StrOutputParser
27
+ from langchain.callbacks import FileCallbackHandler
28
+ from langchain.callbacks.manager import CallbackManager
29
+ from langchain.schema import Document
30
+
31
+ import instructor
32
+ from pydantic import BaseModel, Field
33
+ from typing import List, Literal
34
+
35
+ from nltk.corpus import stopwords
36
+ import nltk
37
+ from openai import OpenAI
38
+ # import anthropic
39
+ import cohere
40
+ import faiss
41
+ import matplotlib.pyplot as plt
42
+ import spacy
43
+ from string import punctuation
44
+ import pytextrank
45
+ from prompts import *
46
+
47
+ openai_key = os.environ['openai_key']
48
+ cohere_key = os.environ['cohere_key']
49
+
50
def load_nlp():
    """Build the spaCy pipeline used for keyword extraction.

    Loads the ``en_core_web_sm`` model, appends the ``textrank`` pipe, and
    makes sure the NLTK English stop-word list can be read, downloading the
    corpus on first use.

    Returns:
        The configured spaCy pipeline object.
    """
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("textrank")
    try:
        stopwords.words('english')
    except LookupError:
        # NLTK signals a missing corpus with LookupError; catching only that
        # keeps unrelated failures (e.g. KeyboardInterrupt) from being
        # swallowed and turned into a download attempt.
        nltk.download('stopwords')
        stopwords.words('english')
    return nlp
59
+
60
# Module-level clients and models, constructed once at import time and shared
# by the functions defined below.
gen_llm = openai_llm(temperature=0, model_name='gpt-4o-mini', openai_api_key = openai_key)  # LLM step of the chain in run_rag_qa
consensus_client = instructor.patch(OpenAI(api_key=openai_key))  # patched client; called with response_model= in evaluate_overall_consensus
embed_client = OpenAI(api_key = openai_key)  # NOTE(review): not referenced elsewhere in the visible code
embed_model = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model = embed_model, api_key = openai_key)  # embedding function handed to Chroma in run_rag_qa
nlp = load_nlp()  # bound as the default ``nlp`` argument of get_keywords
66
+
67
+
68
def get_keywords(text, nlp=nlp):
    """Return the lower-cased proper nouns, adjectives and nouns found in ``text``.

    Tokens that are spaCy stop words, or whose text occurs inside
    ``string.punctuation``, are skipped. Order and duplicates are preserved.
    """
    wanted_pos = ['PROPN', 'ADJ', 'NOUN']
    keywords = []
    for tok in nlp(text.lower()):
        word = tok.text
        is_noise = word in nlp.Defaults.stop_words or word in punctuation
        if not is_noise and tok.pos_ in wanted_pos:
            keywords.append(word)
    return keywords
78
+
79
def load_arxiv_corpus():
    """Load the on-disk dataset from ``data/`` and attach its FAISS index.

    The index file ``data/astrophindex.faiss`` is registered under the name
    'embed', which is the index name the retrieval code searches.
    """
    corpus = load_from_disk('data/')
    corpus.load_faiss_index('embed', 'data/astrophindex.faiss')
    print('loading arxiv corpus from disk')
    return corpus
84
+
85
class RetrievalSystem():
    """Semantic retrieval over the arXiv corpus with optional HyDE and reranking.

    Per-request options are plain attributes that the caller sets before
    calling ``retrieve``:

    * ``toggles``: names of the weightings to apply
      ('Keywords', 'Time', 'Citations').
    * ``query_input_keywords``: extra keywords added to those extracted
      from the query.
    * ``hyde``: when true, embed LLM-generated abstracts instead of only
      the raw query.
    * ``rerank``: when true, rerank the candidates with Cohere.

    NOTE(review): these options live on the instance, and the module shares a
    single instance between requests, so concurrent requests can overwrite
    each other's settings. Confirm whether the Gradio queue serialises calls.
    """

    def __init__(self):
        self.dataset = arxiv_corpus
        self.client = OpenAI(api_key = openai_key)
        self.embed_model = "text-embedding-3-small"
        self.generation_client = openai_llm(temperature=0,model_name='gpt-4o-mini', openai_api_key = openai_key)
        self.hyde_client = openai_llm(temperature=0.5,model_name='gpt-4o-mini', openai_api_key = openai_key)
        self.cohere_client = cohere.Client(cohere_key)

        # Defaults for the per-request options, so the object is usable
        # before a caller has configured it (previously an AttributeError).
        self.toggles = []
        self.query_input_keywords = []
        self.hyde = False
        self.rerank = False
        self.max_doclen = 250
        self.generate_n = 1
        self.embed_query = True

    def make_embedding(self, text):
        """Return the embedding (list of floats) of a single string."""
        str_embed = self.client.embeddings.create(input = [text], model = self.embed_model).data[0].embedding
        return str_embed

    def embed_batch(self, texts: List[str]) -> List[np.ndarray]:
        """Embed several strings in one API call; returns float32 arrays."""
        embeddings = self.client.embeddings.create(input=texts, model=self.embed_model).data
        return [np.array(embedding.embedding, dtype=np.float32) for embedding in embeddings]

    def get_query_embedding(self, query):
        """Return the embedding of the raw query string."""
        return self.make_embedding(query)

    def calc_faiss(self, query_embedding, top_k = 100):
        """Search the 'embed' index.

        Returns:
            ``[indices, scores, rows]`` where ``rows`` is the dataset slice
            for ``indices``.
        """
        hits = self.dataset.search('embed', query_embedding, k=top_k)
        indices = np.asarray(hits.indices)
        scores = np.asarray(hits.scores)
        # FAISS pads the result with -1 when fewer than k neighbours exist;
        # indexing the dataset with -1 would silently return its last row.
        found = indices >= 0
        indices, scores = indices[found], scores[found]
        return [indices, scores, self.dataset[indices]]

    def rank_and_filter(self, query, query_embedding, top_k = 10, top_k_internal = 1000, return_scores=False):
        """Rank the nearest neighbours of ``query_embedding`` with optional weights.

        The inverse FAISS distance is multiplied by keyword, recency and
        citation weights (each normalised to a maximum of 1) according to
        ``self.toggles``.

        Returns:
            ``(top_results, rows)``. ``top_results`` is a list of dataset
            indices, or a ``{index: score}`` dict when ``return_scores`` is
            true; ``rows`` is the dataset slice for those indices.
        """
        toggles = self.toggles or []
        self.weight_keywords = 'Keywords' in toggles
        self.weight_date = 'Time' in toggles
        self.weight_citation = 'Citations' in toggles

        topk_indices, distances, small_corpus = self.calc_faiss(np.array(query_embedding), top_k = top_k_internal)
        # Size every per-hit array from the hits actually returned, not from
        # top_k_internal: the index may hold fewer rows than requested.
        n_hits = len(topk_indices)
        if n_hits == 0:
            return ({} if return_scores else []), small_corpus

        # Convert a distance (less is better) to a similarity (more is
        # better); the floor avoids dividing by zero on an exact match.
        similarities = 1.0 / np.maximum(distances, 1e-12)

        if self.weight_keywords:
            query_kws = get_keywords(query) + list(self.query_input_keywords)
            self.query_kws = query_kws
            needles = [k.lower() for k in query_kws]
            kw_weight = np.zeros((n_hits,)) + 0.1
            for i, paper_kws in enumerate(small_corpus['keywords']):
                haystack = [pk.lower() for pk in (paper_kws or [])]
                # +0.1 for every (query keyword, paper keyword) substring match.
                n_matches = sum(1 for k in needles for pk in haystack if k in pk)
                kw_weight[i] += 0.1 * n_matches
            kw_weight = kw_weight / np.amax(kw_weight)
        else:
            kw_weight = np.ones((n_hits,))

        if self.weight_date:
            today = datetime.now().date()
            age_years = np.array([(today - d).days / 365. for d in small_corpus['date']])
            # Logistic decay in paper age, with a 0.7-year scale.
            age_weight = (1 + np.exp(age_years/0.7))**(-1)
            age_weight = age_weight / np.amax(age_weight)
        else:
            age_weight = np.ones((n_hits,))

        if self.weight_citation:
            # Citation counts are capped at 300 before the logistic weight.
            capped = np.minimum(np.array(small_corpus['cites']), 300)
            cite_weight = (1 + np.exp((300-capped)/42.0))**(-1.)
            cite_weight = cite_weight / np.amax(cite_weight)
        else:
            cite_weight = np.ones((n_hits,))

        similarities = similarities * (kw_weight) * (age_weight) * (cite_weight)

        ranked = sorted(zip(topk_indices, similarities), key=lambda pair: pair[1], reverse=True)[:top_k]
        top_indices = [int(idx) for idx, _ in ranked]
        small_df = self.dataset[top_indices]

        if return_scores:
            return {int(idx): float(score) for idx, score in ranked}, small_df

        # Only keep the document IDs
        return top_indices, small_df

    def generate_doc(self, query: str):
        """Ask the HyDE client for a hypothetical abstract answering ``query``."""
        prompt = """You are an expert astronomer. Given a scientific query, generate the abstract of an expert-level research paper
that answers the question. Stick to a maximum length of {} tokens and return just the text of the abstract and conclusion.
Do not include labels for any section. Use research-specific jargon.""".format(self.max_doclen)

        messages = [("system",prompt,),("human", query),]
        return self.hyde_client.invoke(messages).content

    def generate_docs(self, query: str):
        """Generate ``self.generate_n`` hypothetical abstracts for ``query``."""
        return [self.generate_doc(query) for _ in range(self.generate_n)]

    def embed_docs(self, docs: List[str]):
        """Embed a list of documents (thin wrapper around ``embed_batch``)."""
        return self.embed_batch(docs)

    def retrieve(self, query, top_k, return_scores = False,
                 embed_query=True, max_doclen=250,
                 generate_n=1, temperature=0.5,
                 rerank_top_k = 250):
        """Retrieve the ``top_k`` papers for ``query``.

        Args:
            query: natural-language question.
            top_k: number of papers to return.
            return_scores: return ``{index: score}`` instead of an index list.
            embed_query: in HyDE mode, average the query embedding in with
                the generated abstracts.
            max_doclen: token budget quoted to the LLM for each abstract.
            generate_n: number of hypothetical abstracts to generate.
            temperature: sampling temperature for the HyDE client.
            rerank_top_k: number of candidates passed to the reranker.

        Returns:
            ``(top_results, rows)`` as described in ``rank_and_filter``.
            With reranking enabled the scores are Cohere relevance scores.

        Raises:
            ValueError: if ``max_doclen * generate_n`` exceeds 8191.
        """
        if max_doclen * generate_n > 8191:
            raise ValueError("Too many tokens. Please reduce max_doclen or generate_n.")

        top_k = int(top_k)
        query_embedding = self.get_query_embedding(query)

        if self.hyde:
            self.max_doclen = max_doclen
            self.generate_n = generate_n
            self.hyde_client.temperature = temperature
            self.embed_query = embed_query
            docs = self.generate_docs(query)
            doc_embeddings = self.embed_docs(docs)
            if self.embed_query:
                # Reuse the embedding computed above rather than paying for a
                # second API call on the same text.
                doc_embeddings.append(np.array(query_embedding, dtype=np.float32))
            query_embedding = np.mean(np.array(doc_embeddings), axis = 0)

        if not self.rerank:
            return self.rank_and_filter(query,
                                        query_embedding,
                                        top_k,
                                        return_scores = return_scores)

        candidate_ids, candidates = self.rank_and_filter(query,
                                                         query_embedding,
                                                         rerank_top_k,
                                                         return_scores = False)
        # Take whatever candidates exist; there may be fewer than rerank_top_k.
        docs_for_rerank = list(candidates['abstract'])
        if len(docs_for_rerank) == 0:
            # Same (results, rows) shape as every other path, so callers can
            # always unpack two values.
            return ({} if return_scores else []), candidates

        reranked_results = self.cohere_client.rerank(
            query=query,
            documents=docs_for_rerank,
            model='rerank-english-v3.0',
            top_n=min(top_k, len(docs_for_rerank))
        )
        scored = [(candidate_ids[result.index], float(result.relevance_score))
                  for result in reranked_results.results]
        final_indices = [doc_id for doc_id, _ in scored]
        if return_scores:
            return dict(scored), self.dataset[final_indices]
        return final_indices, self.dataset[final_indices]

    def return_formatted_df(self, top_results, small_df):
        """Build the results table shown in the UI.

        Args:
            top_results: ``{index: score}`` dict, or a plain list of indices
                (the 'Relevance' column is then NaN).
            small_df: dataset rows for those indices, in the same order.

        Returns:
            A DataFrame indexed from 1 with the columns listed below.
        """
        df = pd.DataFrame(small_df)
        df = df.drop(columns=['umap_x','umap_y','cite_bibcodes','ref_bibcodes'], errors='ignore')
        links = ['['+i+'](https://ui.adsabs.harvard.edu/abs/'+i+'/abstract)' for i in small_df['bibcode']]

        if isinstance(top_results, dict):
            indices = list(top_results.keys())
            scores = list(top_results.values())
        else:
            indices = list(top_results)
            scores = [float('nan')] * len(indices)

        df['ADS Link'] = links
        df['Relevance'] = scores
        df['indices'] = indices
        df = df[['ADS Link','Relevance','date','cites','title','authors','abstract','keywords','ads_id','indices','embed']]
        df.index += 1
        return df
281
+
282
# Load the corpus before instantiating RetrievalSystem: its __init__ reads the
# module-level ``arxiv_corpus``. ``ec`` is the shared retriever used as the
# default argument of run_pathfinder.
arxiv_corpus = load_arxiv_corpus()
ec = RetrievalSystem()
print('loaded retrieval system')
285
+
286
def Library(papers_df):
    """Concatenate every paper's title and abstract into one text block.

    Rows are looked up by the labels 1..len(papers_df), so ``papers_df`` is
    expected to carry a 1-based index.
    """
    entries = []
    for rank in range(1, len(papers_df) + 1):
        header = 'Paper %.0f:' % rank
        entries.append(header + papers_df['title'][rank] + '\n' + papers_df['abstract'][rank] + '\n\n')
    return ''.join(entries)
292
+
293
def run_rag_qa(query, papers_df, question_type):
    """Answer ``query`` with a small RAG chain built over the retrieved papers.

    Each paper's title and abstract are chunked, indexed in a temporary
    Chroma collection, and the six most similar chunks are fed to the prompt
    selected by ``question_type``.

    Args:
        query: the user's question.
        papers_df: DataFrame with 'title', 'abstract' and 'ads_id' columns.
        question_type: 'Bibliometric', 'Single-paper' or 'Broad but nuanced';
            any other value selects the regular prompt.

    Returns:
        The chain output: a dict with 'context', 'question' and 'answer'.
    """
    import uuid  # local import: only needed to name the scratch collection

    # Number papers by position, not by index label: the table handed in by
    # the UI is indexed from 1, so ``label + 1`` started the count at 2.
    documents = []
    for paper_no, (_, row) in enumerate(papers_df.iterrows(), start=1):
        content = f"Paper {paper_no}: {row['title']}\n{row['abstract']}\n\n"
        metadata = {"source": row['ads_id']}
        documents.append(Document(page_content=content, metadata=metadata))

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=50, add_start_index=True)
    splits = text_splitter.split_documents(documents)
    # A unique name per call keeps overlapping requests from sharing (and
    # deleting) each other's collection.
    collection_name = f"retdoc-{uuid.uuid4().hex}"
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings, collection_name=collection_name)

    try:
        retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

        templates = {
            'Bibliometric': bibliometric_prompt,
            'Single-paper': single_paper_prompt,
            'Broad but nuanced': deep_knowledge_prompt,
        }
        prompt = PromptTemplate.from_template(templates.get(question_type, regular_prompt))

        def format_docs(docs):
            return "\n\n".join(doc.page_content for doc in docs)

        rag_chain_from_docs = (
            RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
            | prompt
            | gen_llm
            | StrOutputParser()
        )

        rag_chain_with_source = RunnableParallel(
            {"context": retriever, "question": RunnablePassthrough()}
        ).assign(answer=rag_chain_from_docs)
        return rag_chain_with_source.invoke(query)
    finally:
        # Always drop the scratch collection, even when the chain raises.
        vectorstore.delete_collection()
340
+
341
def guess_question_type(query: str):
    """Ask the LLM to categorise ``query``; returns the raw reply text."""
    # A fresh temperature-0 chat client is built on every call.
    categorizer = openai_llm(temperature=0,model_name='gpt-4o-mini', openai_api_key = openai_key)
    conversation = [
        ("system", question_categorization_prompt),
        ("human", query),
    ]
    reply = categorizer.invoke(conversation)
    return reply.content
346
+
347
# Structured result of the consensus evaluation; passed as ``response_model``
# to the instructor-patched client in evaluate_overall_consensus.
# Documented with comments rather than a class docstring so that nothing is
# added to the model's generated schema.
class OverallConsensusEvaluation(BaseModel):
    # The query restated as a declarative statement.
    rewritten_statement: str = Field(
        ...,
        description="The query rewritten as a statement if it was initially a question"
    )
    # One of seven fixed labels, from strong agreement to strong disagreement.
    consensus: Literal[
        "Strong Agreement Between Abstracts and Query",
        "Moderate Agreement Between Abstracts and Query",
        "Weak Agreement Between Abstracts and Query",
        "No Clear Agreement/Disagreement Between Abstracts and Query",
        "Weak Disagreement Between Abstracts and Query",
        "Moderate Disagreement Between Abstracts and Query",
        "Strong Disagreement Between Abstracts and Query"
    ] = Field(
        ...,
        description="The overall level of consensus between the rewritten statement and the abstracts"
    )
    # Free-text justification for the chosen label.
    explanation: str = Field(
        ...,
        description="A detailed explanation of the consensus evaluation (maximum six sentences)"
    )
    # Validated to lie in [0, 1] by the ge/le constraints.
    relevance_score: float = Field(
        ...,
        description="A score from 0 to 1 indicating how relevant the abstracts are to the query overall",
        ge=0,
        le=1
    )
374
+
375
def evaluate_overall_consensus(query: str, abstracts: List[str]) -> OverallConsensusEvaluation:
    """Ask the LLM how strongly the retrieved abstracts agree with the query.

    Args:
        query: the user's question or statement.
        abstracts: abstract texts; numbered from 1 inside the prompt.

    Returns:
        The parsed ``OverallConsensusEvaluation`` produced by the
        instructor-patched client.
    """
    # The prompt below is runtime text sent to the model; its lines are kept
    # exactly as written.
    prompt = f"""
Query: {query}
You will be provided with {len(abstracts)} scientific abstracts. Your task is to do the following:
1. If the provided query is a question, rewrite it as a statement. This statement does not have to be true. Output this as 'Rewritten Statement:'.
2. Evaluate the overall consensus between the rewritten statement and the abstracts using one of the following levels:
- Strong Agreement Between Abstracts and Query
- Moderate Agreement Between Abstracts and Query
- Weak Agreement Between Abstracts and Query
- No Clear Agreement/Disagreement Between Abstracts and Query
- Weak Disagreement Between Abstracts and Query
- Moderate Disagreement Between Abstracts and Query
- Strong Disagreement Between Abstracts and Query
Output this as 'Consensus:'
3. Provide a detailed explanation of your consensus evaluation in maximum six sentences. Output this as 'Explanation:'
4. Assign a relevance score as a float between 0 to 1, where:
- 1.0: Perfect match in content and quality
- 0.8-0.9: Excellent, with minor differences
- 0.6-0.7: Good, captures main points but misses some details
- 0.4-0.5: Fair, partially relevant but significant gaps
- 0.2-0.3: Poor, major inaccuracies or omissions
- 0.0-0.1: Completely irrelevant or incorrect
Output this as 'Relevance Score:'
Here are the abstracts:
{' '.join([f"Abstract {i+1}: {abstract}" for i, abstract in enumerate(abstracts)])}
Provide your evaluation in the structured format described above.
"""

    # response_model makes the patched client return an
    # OverallConsensusEvaluation instance instead of a raw completion.
    response = consensus_client.chat.completions.create(
        model="gpt-4o-mini", # used to be "gpt-4",
        response_model=OverallConsensusEvaluation,
        messages=[
            {"role": "system", "content": """You are an assistant with expertise in astrophysics for question-answering tasks.
Evaluate the overall consensus of the retrieved scientific abstracts in relation to a given query.
If you don't know the answer, just say that you don't know.
Use six sentences maximum and keep the answer concise."""},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )

    return response
417
+
418
def calc_outlier_flag(papers_df, top_k, cutoff_adjust = 0.1):
    """Flag papers whose embedding lies far from the centroid of the result set.

    The threshold is entry ``top_k - 1`` of the array stored in
    'pfdr_arxiv_cutoff_distances.npy', lowered by ``cutoff_adjust``.

    Returns:
        Boolean array, one entry per row of ``papers_df``.
    """
    thresholds = np.load('pfdr_arxiv_cutoff_distances.npy') - cutoff_adjust
    points = np.array(papers_df['embed'].tolist())
    offsets = points - np.mean(points, 0)
    # Euclidean distance of every embedding from the centroid.
    radii = np.sqrt(np.sum(offsets**2, 1))
    return radii > thresholds[top_k-1]
427
+
428
def make_embedding_plot(papers_df, top_k, consensus_answer, arxiv_corpus=arxiv_corpus):
    """Plot the retrieved papers on top of the corpus-wide UMAP map.

    Args:
        papers_df: results table with 'indices' and 'embed' columns.
        top_k: number of retrieved papers (selects the outlier cutoff).
        consensus_answer: unused; kept for interface compatibility.
        arxiv_corpus: dataset providing the 'umap_x' / 'umap_y' columns.

    Returns:
        A matplotlib ``Figure``.
    """
    # Build the figure directly instead of through pyplot: pyplot keeps every
    # figure it creates alive until it is closed, which leaks one large
    # figure per request in a long-running server, and its "current figure"
    # state is shared between requests.
    from matplotlib.figure import Figure

    plt_indices = np.array(papers_df['indices'].tolist())

    xax = np.array(arxiv_corpus['umap_x'])
    yax = np.array(arxiv_corpus['umap_y'])

    # Outliers are drawn smaller and more transparent.
    outlier_flag = calc_outlier_flag(papers_df, top_k, cutoff_adjust=0.25)
    alphas = np.ones((len(plt_indices),)) * 0.9
    alphas[outlier_flag] = 0.5

    fig = Figure(figsize=(9*1.8,12*1.8))
    ax = fig.add_subplot()
    ax.scatter(xax,yax, s=1, alpha=0.01, c='k')

    # Topic labels; the context manager closes the .npz file handle.
    with np.load('kw_tags.npz') as clkws:
        all_x, all_y, all_topics, repeat_flag = clkws['all_x'], clkws['all_y'], clkws['all_topics'], clkws['repeat_flag']
    for x, y, topic, repeated in zip(all_x, all_y, all_topics, repeat_flag):
        if not repeated:
            ax.text(x, y, topic,fontsize=9,ha="center", va="center",
                    bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.3',alpha=0.81))

    # White halo underneath, coloured marker on top.
    ax.scatter(xax[plt_indices], yax[plt_indices], s=300*alphas**2, alpha=alphas, c='w',zorder=1000)
    ax.scatter(xax[plt_indices], yax[plt_indices], s=100*alphas**2, alpha=alphas, c='dodgerblue',zorder=1001)
    ax.axis([0,20,-4.2,18])
    ax.axis('off')
    return fig
454
+
455
def run_pathfinder(query, top_k, extra_keywords, toggles, prompt_type, rag_type, ec=ec, progress=gr.Progress()):
    """Run the full pipeline for one query, yielding partial results as they arrive.

    Every yield is a 5-tuple ``(papers table, answer, consensus, question
    type, figure)``; slots not yet computed are ``None``.

    Args:
        query: the user's question.
        top_k: number of papers to retrieve.
        extra_keywords: optional comma-separated keywords.
        toggles: weightings to apply ('Keywords', 'Time', 'Citations').
        prompt_type: prompt specialisation passed to ``run_rag_qa``.
        rag_type: 'Semantic Search', 'Semantic + HyDE' or
            'Semantic + HyDE + CoHERE'; anything else falls back to plain
            semantic search.
        ec: retrieval system to use.
        progress: Gradio progress tracker.
    """
    # Clear the outputs of any previous run.
    yield None, None, None, None, None

    search_text_list = ['rooting around in the paper pile...','looking for clarity...','scanning the event horizon...','peering into the abyss...','potatoes power this ongoing search...']
    gen_text_list = ['making the LLM talk to the papers...','invoking arcane rituals...','gone to library, please wait...','is there really an answer to this...']

    top_k = int(top_k)

    # Drop empty entries left by stray commas ("a,,b", "a,"): an empty keyword
    # is a substring of every paper keyword and would skew the weights.
    input_keywords = [kw.strip() for kw in extra_keywords.split(',') if kw.strip()] if extra_keywords else []
    query_keywords = get_keywords(query)
    # NOTE(review): rank_and_filter also extracts keywords from the query, so
    # they appear to be counted twice here; confirm whether that is intended.
    ec.query_input_keywords = input_keywords+query_keywords
    ec.toggles = toggles or []

    # (hyde, rerank) for each RAG method.
    rag_modes = {
        "Semantic Search": (False, False),
        "Semantic + HyDE": (True, False),
        "Semantic + HyDE + CoHERE": (True, True),
    }
    ec.hyde, ec.rerank = rag_modes.get(rag_type, (False, False))

    progress(0.2, desc=search_text_list[np.random.choice(len(search_text_list))])
    rs, small_df = ec.retrieve(query, top_k = top_k, return_scores=True)
    formatted_df = ec.return_formatted_df(rs, small_df)
    yield formatted_df, None, None, None, None

    progress(0.4, desc=gen_text_list[np.random.choice(len(gen_text_list))])
    rag_answer = run_rag_qa(query, formatted_df, prompt_type)
    yield formatted_df, rag_answer['answer'], None, None, None

    progress(0.6, desc="Generating consensus")
    # formatted_df is indexed from 1, hence the i+1 lookups.
    consensus_answer = evaluate_overall_consensus(query, [formatted_df['abstract'][i+1] for i in range(len(formatted_df))])
    consensus = '## Consensus \n'+consensus_answer.consensus + '\n\n'+consensus_answer.explanation + '\n\n > Relevance of retrieved papers to answer: %.1f' %consensus_answer.relevance_score
    yield formatted_df, rag_answer['answer'], consensus, None, None

    progress(0.8, desc="Analyzing question type")
    question_type_gen = guess_question_type(query)
    # Keep only the text between the <categorization> tags, when present.
    if '<categorization>' in question_type_gen:
        question_type_gen = question_type_gen.split('<categorization>')[1]
    if '</categorization>' in question_type_gen:
        question_type_gen = question_type_gen.split('</categorization>')[0]
    # Two trailing spaces force a markdown line break.
    question_type_gen = question_type_gen.replace('\n',' \n')
    qn_type = question_type_gen
    yield formatted_df, rag_answer['answer'], consensus, qn_type, None

    progress(1.0, desc="Visualizing embeddings")
    fig = make_embedding_plot(formatted_df, top_k, consensus_answer)

    yield formatted_df, rag_answer['answer'], consensus, qn_type, fig
504
+
505
def create_interface():
    """Build the Gradio Blocks UI and wire the run button to ``run_pathfinder``.

    Returns:
        The ``gr.Blocks`` app (not yet launched).
    """
    custom_css = """
#custom-slider-* {
background-color: #ffffff;
}
"""

    with gr.Blocks(css=custom_css) as demo:

        with gr.Tabs():
            with gr.Tab("pathfinder"):
                with gr.Accordion("What is Pathfinder? / How do I use it?", open=False):
                    gr.Markdown(pathfinder_text)

                with gr.Row():
                    query = gr.Textbox(label="Ask me anything")
                with gr.Row():
                    with gr.Column(scale=1, min_width=300):
                        top_k = gr.Slider(1, 30, step=1, value=10, label="top-k", info="Number of papers to retrieve")
                        keywords = gr.Textbox(label="Optional Keywords (comma-separated)",value="")
                        toggles = gr.CheckboxGroup(["Keywords", "Time", "Citations"], label="Weight by", info="weighting retrieved papers",value=['Keywords'])
                        prompt_type = gr.Radio(choices=["Single-paper", "Multi-paper", "Bibliometric", "Broad but nuanced"], label="Prompt Specialization", value='Multi-paper')
                        rag_type = gr.Radio(choices=["Semantic Search", "Semantic + HyDE", "Semantic + HyDE + CoHERE"], label="RAG Method",value='Semantic + HyDE + CoHERE')
                    with gr.Column(scale=2, min_width=300):
                        img1 = gr.Image("local_files/pathfinder_logo.png")
                        btn = gr.Button("Run pfdr!")
                        ret_papers = gr.Dataframe(label='top-k retrieved papers', datatype='markdown')
                        search_results_state = gr.Markdown(label='Generated Answer')
                        # run_pathfinder yields (table, answer, consensus,
                        # question type, figure). The consensus component is
                        # created first so it keeps its on-page position while
                        # each component now receives the content it is named for.
                        conc = gr.Markdown(label='Consensus')
                        qntype = gr.Markdown(label='Question type suggestion')
                        plot = gr.Plot(label='top-k in embedding space')

        inputs = [query, top_k, keywords, toggles, prompt_type, rag_type]
        outputs = [ret_papers, search_results_state, conc, qntype, plot]
        btn.click(fn=run_pathfinder, inputs=inputs, outputs=outputs)

    return demo
545
+
546
+
547
# Script entry point: build the Blocks app and start the Gradio server.
if __name__ == "__main__":

    pathfinder = create_interface()
    pathfinder.launch()
prompts.py CHANGED
@@ -142,3 +142,66 @@ Present your final answer in the following format:
142
  Category: [Selected category]
143
  Explanation: [Your explanation for the categorization]
144
  </categorization>"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  Category: [Selected category]
143
  Explanation: [Your explanation for the categorization]
144
  </categorization>"""
145
+
146
+
147
+ pathfinder_text = """# Welcome to Pathfinder
148
+
149
+ ## Discover the Universe Through AI-Powered Astronomy ReSearch
150
+
151
+ ### What is Pathfinder?
152
+
153
+ Pathfinder (https://pfdr.app) harnesses the power of modern large language models (LLMs) in combination with papers on the [arXiv](https://arxiv.org/) and [ADS](https://ui.adsabs.harvard.edu/) to navigate the vast expanse of astronomy literature.
154
+ Our tool empowers researchers, students, and astronomy enthusiasts to get started on their journeys to find answers to complex research questions quickly and efficiently.
155
+
156
+ To use the old streamlit pathfinder (with the ReAct agent), you can use the [pfdr streamlit mirror](https://huggingface.co/spaces/kiyer/pathfinder_v3/).
157
+
158
+ This is not meant to be a replacement to existing tools like the [ADS](https://ui.adsabs.harvard.edu/), [arxivsorter](https://www.arxivsorter.org/), semantic search or google scholar, but rather a supplement to find papers that otherwise might be missed during a literature survey. It is trained on astro-ph papers up to July 2024.
159
+
160
+ ### How to Use Pathfinder
161
+
162
+ You can use pathfinder to find papers of interest with natural-language questions, and generate basic answers to questions using the retrieved papers. Try asking it questions like
163
+
164
+ - What is the value of the Hubble Constant?
165
+ - Are there open source radiative transfer codes for planetary atmospheres?
166
+ - Can I predict a galaxy spectrum from an image cutout? Please reply in Hindi.
167
+ - How would galaxy evolution differ in a universe with no dark matter?
168
+
169
+ **👈 Use the sidebar to tweak the search parameters to get better results**. Changing the number of retrieved papers (**top-k**), weighting by keywords, time, or citations, or changing the prompt type might help better refine the paper search and synthesized answers for your specific question.
170
+
171
+ 1. **Enter Your Query**: Type your astronomy question in the search bar & hit `run pathfinder`.
172
+ 2. **Review Results**: Pathfinder will analyze relevant literature and present you with a concise answer.
173
+ 3. **Explore Further**: Click on provided links to delve deeper into the source material on ADS.
174
+ 4. **Refine Your Search**: Use our advanced filters to narrow down results by date, author, or topic.
175
+ 5. **Download results:** You can download the results of your query as a json file.
176
+
177
+ ### Why Use Pathfinder?
178
+
179
+ - **Time-Saving**: Get started finding answers that would take hours of manual research.
180
+ - **Comprehensive**: Access information from papers across a large database of astronomy literature.
181
+ - **User-Friendly**: Intuitive interface designed for researchers at all levels.
182
+ - **Constantly Updated**: Our database is regularly refreshed with the latest publications.
183
+
184
+ ### Learn More
185
+
186
+ - Read our paper on [arXiv](https://arxiv.org/abs/2408.01556) to understand the technology behind Pathfinder.
187
+ - Discover how Pathfinder was developed in collaboration with [UniverseTBD](https://www.universetbd.org) on its mission to democratise science for everyone, and [JSALT](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/).
188
+
189
+ ---
190
+
191
+ ### Copyright and Terms of Use
192
+
193
+ © 2024 Pathfinder. All rights reserved.
194
+
195
+ Pathfinder is provided "as is" without warranty of any kind. By using this service, you agree to our [Terms of Service] and [Privacy Policy].
196
+
197
+ ### Contact Us
198
+
199
+ Have questions or feedback? We'd love to hear from you!
200
+ - Email: pfdr@universetbd.org
201
+ - Twitter: [@universe_tbd](https://twitter.com/universe_tbd)
202
+ - Huggingface: [https://huggingface.co/spaces/kiyer/pathfinder/](https://huggingface.co/spaces/kiyer/pathfinder/)
203
+
204
+ ---
205
+
206
+ *Empowering astronomical discoveries, one query at a time.*
207
+ """