Spaces:

kiyer
/

synthesist

Sleeping

App Files Files Community

kiyer committed on Jul 23, 2024

Commit

4351936

1 Parent(s): 022c0b9

linked up everything, qn type, consensus

Browse files

Files changed (5) hide show

app.py +658 -441
data/data-00000-of-00001.arrow +3 -0
data/dataset_info.json +134 -0
data/state.json +13 -0
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod
 from typing import List, Dict, Any, Tuple
 from collections import defaultdict
 from tqdm import tqdm
-import pandas as pd
 from datetime import datetime, date
 from datasets import load_dataset, load_from_disk
 from collections import Counter
@@ -16,24 +16,27 @@ import concurrent.futures
 from langchain import hub
 from langchain_openai import ChatOpenAI as openai_llm
-from langchain_core.runnables import RunnableConfig
 from langchain_community.callbacks import StreamlitCallbackHandler
-from langchain.agents import create_react_agent, Tool, AgentExecutor
 from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
-ts = time.time()
-anthropic_key = st.secrets["anthropic_key"]
-openai_key = st.secrets["openai_key"]
 from nltk.corpus import stopwords
 import nltk
 from openai import OpenAI
-import anthropic
 import cohere
 import faiss
@@ -50,12 +53,28 @@ except:
     nltk.download('stopwords')
     stopwords.words('english')
 from bokeh.plotting import figure
 from bokeh.models import ColumnDataSource
 from bokeh.io import output_notebook
 from bokeh.palettes import Spectral5
 from bokeh.transform import linear_cmap
 st.image('local_files/pathfinder_logo.png')
@@ -63,15 +82,13 @@ st.expander("About", expanded=False).write(
         """
         Pathfinder v2.0 is a framework for searching and visualizing astronomy papers on the [arXiv](https://arxiv.org/) and [ADS](https://ui.adsabs.harvard.edu/) using the context
         sensitivity from modern large language models (LLMs) to better parse patterns in paper contexts.
         This tool was built during the [JSALT workshop](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/) to do awesome things.
-        **👈 Select a tool from the sidebar** to see some examples
-        of what this framework can do!
         ### Tool summary:
         - Please wait while the initial data loads and compiles, this takes about a minute initially.
-        - `Paper search` looks for relevant papers given an arxiv id or a question.
         This is not meant to be a replacement to existing tools like the
         [ADS](https://ui.adsabs.harvard.edu/),
@@ -79,33 +96,34 @@ st.expander("About", expanded=False).write(
         that otherwise might be missed during a literature survey.
         It is trained on astro-ph (astrophysics of galaxies) papers up to last-year-ish mined from arxiv and supplemented with ADS metadata,
         if you are interested in extending it please reach out!
-        Also add: more pages, actual generation, diff. toggles for retrieval/gen, feedback form, socials, literature, contact us, copyright, collaboration, etc.
         The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail
         using the `Arxiv embedding` page. The papers tend to cluster together by similarity, and result in an
         atlas that shows well studied (forests) and currently uncharted areas (water).
         """
     )
 if 'arxiv_corpus' not in st.session_state:
     with st.spinner('loading data...'):
-        try:
             arxiv_corpus = load_from_disk('data/')
         except:
             st.write('downloading data')
-            arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data',split='train')
             arxiv_corpus.save_to_disk('data/')
         arxiv_corpus.add_faiss_index('embed')
         st.session_state.arxiv_corpus = arxiv_corpus
         st.toast('loaded arxiv corpus')
 else:
     arxiv_corpus = st.session_state.arxiv_corpus
 if 'ids' not in st.session_state:
     st.session_state.ids = arxiv_corpus['ads_id']
     st.session_state.titles = arxiv_corpus['title']
@@ -114,8 +132,8 @@ if 'ids' not in st.session_state:
     st.session_state.years = arxiv_corpus['date']
     st.session_state.kws = arxiv_corpus['keywords']
     st.toast('done caching. time taken: %.2f sec' %(time.time()-ts))
 #---------------------------------------------------------------
 # A hack to "clear" the previous result when submitting a new prompt. This avoids
@@ -144,186 +162,33 @@ def with_clear_container(submit_clicked: bool) -> bool:
         return True
     return False
-#----------------------------------------------------------------
-class Filter():
-    def filter(self, query: str, arxiv_id: str) -> List[str]:
-        pass
-class CitationFilter(Filter): # can do it with all metadata
-    def __init__(self, corpus):
-        self.corpus = corpus
-        ids = ids
-        cites = cites
-        self.citation_counts = {ids[i]: cites[i] for i in range(len(ids))}
-    def citation_weight(self, x, shift, scale):
-        return 1 / (1 + np.exp(-1 * (x - shift) / scale)) # sigmoid function
-    def filter(self, doc_scores, weight = 0.1): # additive weighting
-        citation_count = np.array([self.citation_counts[doc[0]] for doc in doc_scores])
-        cmean, cstd = np.median(citation_count), np.std(citation_count)
-        citation_score = self.citation_weight(citation_count, cmean, cstd)
-        for i, doc in enumerate(doc_scores):
-            doc_scores[i][2] += weight * citation_score[i]
-class DateFilter(Filter): # include time weighting eventually
-    def __init__(self, document_dates):
-        self.document_dates = document_dates
-    def parse_date(self, arxiv_id: str) -> datetime: # only for documents
-        if arxiv_id.startswith('astro-ph'):
-            arxiv_id = arxiv_id.split('astro-ph')[1].split('_arXiv')[0]
-        try:
-            year = int("20" + arxiv_id[:2])
-            month = int(arxiv_id[2:4])
-        except:
-            year = 2023
-            month = 1
-        return date(year, month, 1)
-    def weight(self, time, shift, scale):
-        return 1 / (1 + np.exp((time - shift) / scale))
-    def evaluate_filter(self, year, filter_string):
-        try:
-            # Use ast.literal_eval to safely evaluate the expression
-            result = eval(filter_string, {"__builtins__": None}, {"year": year})
-            return result
-        except Exception as e:
-            print(f"Error evaluating filter: {e}")
-            return False
-    def filter(self, docs, boolean_date = None, min_date = None, max_date = None, time_score = 0):
-        filtered = []
-        if boolean_date is not None:
-            boolean_date = boolean_date.replace("AND", "and").replace("OR", "or")
-            for doc in docs:
-                if self.evaluate_filter(self.document_dates[doc[0]].year, boolean_date):
-                    filtered.append(doc)
-        else:
-            if min_date == None: min_date = date(1990, 1, 1)
-            if max_date == None: max_date = date(2024, 7, 3)
-            for doc in docs:
-                if self.document_dates[doc[0]] >= min_date and self.document_dates[doc[0]] <= max_date:
-                    filtered.append(doc)
-        if time_score is not None: # apply time weighting
-            for i, item in enumerate(filtered):
-                time_diff = (max_date - self.document_dates[filtered[i][0]]).days / 365
-                filtered[i][2] += time_score * 0.1 * self.weight(time_diff, 5, 5)
-        return filtered
-class KeywordFilter(Filter):
-    def __init__(self, corpus,
-                 remove_capitals: bool = True, metadata = None, ne_only = True, verbose = False):
-        self.index_path = 'keyword_index.json'
-        # self.metadata = metadata
-        self.remove_capitals = remove_capitals
-        self.ne_only = ne_only
-        self.stopwords = set(stopwords.words('english'))
-        self.verbose = verbose
-        self.index = None
-        self.kws = st.session_state.kws
-        self.ids = st.session_state.ids
-        self.titles = st.session_state.titles
-        self.load_or_build_index()
-    def preprocess_text(self, text: str) -> str:
-        text = ''.join(char for char in text if char.isalnum() or char.isspace())
-        if self.remove_capitals: text = text.lower()
-        return ' '.join(word for word in text.split() if word.lower() not in self.stopwords)
-    def build_index(self): # include the title in the index
-        print("Building index...")
-        self.index = {}
-        for i in range(len(self.kws)):
-            paper = self.ids[i]
-            title = self.titles[i]
-            title_keywords = set()
-            for keyword in set(self.kws[i]) | title_keywords:
-                term = ' '.join(word for word in keyword.lower().split() if word.lower() not in self.stopwords)
-                if term not in self.index:
-                    self.index[term] = []
-                self.index[term].append(self.ids[i])
-        with open(self.index_path, 'w') as f:
-            json.dump(self.index, f)
-    def load_index(self):
-        print("Loading existing index...")
-        with open(self.index_path, 'rb') as f:
-            self.index = json.load(f)
-        print("Index loaded successfully.")
-    def load_or_build_index(self):
-        if os.path.exists(self.index_path):
-            self.load_index()
-        else:
-            self.build_index()
-    def parse_doc(self, doc):
-        local_kws = []
-        for phrase in doc._.phrases:
-            local_kws.append(phrase.text.lower())
-        return [self.preprocess_text(word) for word in local_kws]
-    def get_propn(self, doc):
-        result = []
-        working_str = ''
-        for token in doc:
-            if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
-                if working_str != '':
-                    result.append(working_str.strip())
-                    working_str = ''
-            if(token.pos_ == "PROPN"):
-                working_str += token.text + ' '
-        if working_str != '': result.append(working_str.strip())
-        return [self.preprocess_text(word) for word in result]
-    def filter(self, query: str, doc_ids = None):
-        doc = nlp(query)
-        query_keywords = self.parse_doc(doc)
-        nouns = self.get_propn(doc)
-        if self.verbose: print('keywords:', query_keywords)
-        if self.verbose: print('proper nouns:', nouns)
-        filtered = set()
-        if len(query_keywords) > 0 and not self.ne_only:
-            for keyword in query_keywords:
-                if keyword != '' and keyword in self.index.keys(): filtered |= set(self.index[keyword])
-        if len(nouns) > 0:
-            ne_results = set()
-            for noun in nouns:
-                if noun in self.index.keys(): ne_results |= set(self.index[noun])
-            if self.ne_only: filtered = ne_results # keep only named entity results
-            else: filtered &= ne_results # take the intersection
-        if doc_ids is not None: filtered &= doc_ids # apply filter to results
-        return filtered
 class EmbeddingRetrievalSystem():
     def __init__(self, weight_citation = False, weight_date = False, weight_keywords = False):
         self.ids = st.session_state.ids
         self.years = st.session_state.years
         self.abstract = st.session_state.abstracts
@@ -331,7 +196,8 @@ class EmbeddingRetrievalSystem():
         self.embed_model = "text-embedding-3-small"
         self.dataset = arxiv_corpus
         self.kws = st.session_state.kws
         self.weight_citation = weight_citation
         self.weight_date = weight_date
         self.weight_keywords = weight_keywords
@@ -339,7 +205,7 @@ class EmbeddingRetrievalSystem():
         # self.citation_filter = CitationFilter(self.dataset)
         # self.date_filter = DateFilter(self.dataset['date'])
-        self.keyword_filter = KeywordFilter(corpus=self.dataset, remove_capitals=True)
     def parse_date(self, id):
         # indexval = np.where(self.ids == id)[0][0]
@@ -354,12 +220,6 @@ class EmbeddingRetrievalSystem():
         embeddings = self.client.embeddings.create(input=texts, model=self.embed_model).data
         return [np.array(embedding.embedding, dtype=np.float32) for embedding in embeddings]
-    def init_filters(self):
-        self.citation_filter = []
-        self.date_filter = []
-        self.keyword_filter = []
     def get_query_embedding(self, query):
         return self.make_embedding(query)
@@ -370,22 +230,77 @@ class EmbeddingRetrievalSystem():
         # xq = query_embedding.reshape(-1,1).T.astype('float32')
         # D, I = self.index.search(xq, top_k)
         # return I[0], D[0]
-        tmp = self.dataset.search('embed',query_embedding, k=top_k)
         return [tmp.indices, tmp.scores]
     def rank_and_filter(self, query, query_embedding, query_date, top_k = 10, return_scores=False, time_result=None):
-        topk_indices, similarities = self.calc_faiss(np.array(query_embedding), top_k = 300)
-        if self.weight_keywords:
-            keyword_matches = self.keyword_filter.filter(query)
-            kw_indices = np.zeros_like(similarities)
-            for s in keyword_matches:
-                if self.id_to_index[s] in topk_indices:
-                    # print('yes', self.id_to_index[s], topk_indices[np.where(topk_indices == self.id_to_index[s])[0]])
-                    similarities[np.where(topk_indices == self.id_to_index[s])[0]] = similarities[np.where(topk_indices == self.id_to_index[s])[0]] * 10.
-            similarities = similarities / 10.
         filtered_results = [[topk_indices[i], similarities[i]] for i in range(len(similarities))]
         top_results = sorted(filtered_results, key=lambda x: x[1], reverse=True)[:top_k]
@@ -395,43 +310,43 @@ class EmbeddingRetrievalSystem():
         # Only keep the document IDs
         top_results = [doc[0] for doc in top_results]
-        return top_results
     def retrieve(self, query, top_k, time_result=None, query_date = None, return_scores = False):
         query_embedding = self.get_query_embedding(query)
         # Judge time relevance
         if time_result is None:
-            if self.weight_date:
                 time_result, time_taken = self.analyze_temporal_query(query, self.anthropic_client)
-            else:
                 time_result = {'has_temporal_aspect': False, 'expected_year_filter': None, 'expected_recency_weight': None}
-        top_results = self.rank_and_filter(query,
-                                           query_embedding,
-                                           query_date,
-                                           top_k,
-                                           return_scores = return_scores,
                                            time_result = time_result)
         return top_results
 class HydeRetrievalSystem(EmbeddingRetrievalSystem):
-    def __init__(self, generation_model: str = "claude-3-haiku-20240307",
-                 embedding_model: str = "text-embedding-3-small",
-             temperature: float = 0.5,
-                 max_doclen: int = 500,
-                 generate_n: int = 1,
-                 embed_query = True,
                  conclusion = False, **kwargs):
         # Handle the kwargs for the superclass init -- filters/citation weighting
         super().__init__(**kwargs)
         if max_doclen * generate_n > 8191:
             raise ValueError("Too many tokens. Please reduce max_doclen or generate_n.")
         self.embedding_model = embedding_model
         self.generation_model = generation_model
@@ -442,58 +357,67 @@ class HydeRetrievalSystem(EmbeddingRetrievalSystem):
         self.embed_query = embed_query # embed the query vector?
         self.conclusion = conclusion # generate conclusion as well?
-        self.anthropic_key = anthropic_key
-        self.generation_client = anthropic.Anthropic(api_key = self.anthropic_key)
     def retrieve(self, query: str, top_k: int = 10, return_scores = False, time_result = None) -> List[Tuple[str, str, float]]:
         if time_result is None:
             if self.weight_date: time_result, time_taken = analyze_temporal_query(query, self.anthropic_client)
             else: time_result = {'has_temporal_aspect': False, 'expected_year_filter': None, 'expected_recency_weight': None}
         docs = self.generate_docs(query)
         doc_embeddings = self.embed_docs(docs)
-        if self.embed_query:
             query_emb = self.embed_docs([query])[0]
             doc_embeddings.append(query_emb)
         embedding = np.mean(np.array(doc_embeddings), axis = 0)
         top_results = self.rank_and_filter(query, embedding, query_date=None, top_k = top_k, return_scores = return_scores, time_result = time_result)
         return top_results
     def generate_doc(self, query: str):
-        prompt = """You are an expert astronomer. Given a scientific query, generate the abstract"""
-        if self.conclusion:
-            prompt += " and conclusion"
-        prompt += """ of an expert-level research paper
                             that answers the question. Stick to a maximum length of {} tokens and return just the text of the abstract and conclusion.
                             Do not include labels for any section. Use research-specific jargon.""".format(self.max_doclen)
-        message = self.generation_client.messages.create(
-                model = self.generation_model,
-                max_tokens = self.max_doclen,
-                temperature = self.temperature,
-                system = prompt,
-                messages=[{ "role": "user",
-                        "content": [{"type": "text", "text": query,}] }]
-            )
-        return message.content[0].text
     def generate_docs(self, query: str):
         docs = []
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            future_to_query = {executor.submit(self.generate_doc, query): query for i in range(self.generate_n)}
-            for future in concurrent.futures.as_completed(future_to_query):
-                query = future_to_query[future]
-                try:
-                    data = future.result()
-                    docs.append(data)
-                except Exception as exc:
-                    pass
         return docs
     def embed_docs(self, docs: List[str]):
@@ -503,35 +427,35 @@ class HydeCohereRetrievalSystem(HydeRetrievalSystem):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.cohere_key = "Of1MjzFjGmvzBAqdvNHTQLkAjecPcOKpiIPAnFMn"
         self.cohere_client = cohere.Client(self.cohere_key)
-    def retrieve(self, query: str,
-                 top_k: int = 10,
                  rerank_top_k: int = 250,
                  return_scores = False, time_result = None,
                  reweight = False) -> List[Tuple[str, str, float]]:
         if time_result is None:
             if self.weight_date: time_result, time_taken = analyze_temporal_query(query, self.anthropic_client)
             else: time_result = {'has_temporal_aspect': False, 'expected_year_filter': None, 'expected_recency_weight': None}
         top_results = super().retrieve(query, top_k = rerank_top_k, time_result = time_result)
         # doc_texts = self.get_document_texts(top_results)
         # docs_for_rerank = [f"Abstract: {doc['abstract']}\nConclusions: {doc['conclusions']}" for doc in doc_texts]
         docs_for_rerank = [self.abstract[i] for i in top_results]
         if len(docs_for_rerank) == 0:
             return []
         reranked_results = self.cohere_client.rerank(
             query=query,
             documents=docs_for_rerank,
             model='rerank-english-v3.0',
             top_n=top_k
         )
         final_results = []
         for result in reranked_results.results:
             doc_id = top_results[result.index]
@@ -542,9 +466,9 @@ class HydeCohereRetrievalSystem(HydeRetrievalSystem):
         if reweight:
             if time_result['has_temporal_aspect']:
                 final_results = self.date_filter.filter(final_results, time_score = time_result['expected_recency_weight'])
             if self.weight_citation: self.citation_filter.filter(final_results)
         if return_scores:
             return {result[0]: result[2] for result in final_results}
@@ -554,40 +478,113 @@ class HydeCohereRetrievalSystem(HydeRetrievalSystem):
         return self.embed_batch(docs)
 # ----------------------------------------------------------------
 if 'ec' not in st.session_state:
-    ec = EmbeddingRetrievalSystem(weight_keywords=True)
     st.session_state.ec = ec
     st.toast('loaded retrieval system')
 else:
     ec = st.session_state.ec
-# Function to simulate question answering (replace with actual implementation)
-def answer_question(question, top_k, keywords, toggles, method, question_type):
-    # Simulated answer (replace with actual logic)
-    # return f"Answer to '{question}' using method {method} for {question_type} question."
-    return run_ret(question, top_k)
-def get_papers(ids):
-    papers, scores, links = [], [], []
     for i in ids:
         papers.append(st.session_state.titles[i])
         scores.append(ids[i])
         links.append('https://ui.adsabs.harvard.edu/abs/'+st.session_state.arxiv_corpus['bibcode'][i]+'/abstract')
     return pd.DataFrame({
         'Title': papers,
         'Relevance': scores,
-        'Link': links
     })
 def create_embedding_plot(rs):
     pltsource = ColumnDataSource(data=dict(
         x=st.session_state.arxiv_corpus['umap_x'],
@@ -595,10 +592,14 @@ def create_embedding_plot(rs):
         title=st.session_state.titles,
         link=st.session_state.arxiv_corpus['bibcode'],
     ))
     rsflag = np.zeros((len(st.session_state.ids),))
     rsflag[np.array([k for k in rs])] = 1
     pltsource.data['colors'] = rsflag * 0.8 + 0.1
     pltsource.data['sizes'] = (rsflag + 1)**5 / 100
     TOOLTIPS = """
@@ -609,22 +610,21 @@ def create_embedding_plot(rs):
     @link <br> <br>
     </div>
     """
     mapper = linear_cmap(field_name="colors", palette=Spectral5, low=0., high=1.)
     p = figure(width=700, height=900, tooltips=TOOLTIPS, x_range=(0, 20), y_range=(-4.2,18),
             title="UMAP projection of embeddings for the astro-ph corpus")
     p.axis.visible=False
     p.grid.visible=False
     p.outline_line_alpha = 0.
     p.circle('x', 'y', radius='sizes', source=pltsource, alpha=0.3, fill_color=mapper, fill_alpha='colors', line_color="lightgrey",line_alpha=0.1)
     return p
-# Function to simulate keyword extraction (replace with actual implementation)
-def extract_keywords(question):
     # Simulated keyword extraction (replace with actual logic)
     return ['keyword1', 'keyword2', 'keyword3']
@@ -633,184 +633,401 @@ def estimate_consensus():
     # Simulated consensus estimation (replace with actual calculation)
     return 0.75
-def run_ret(query, top_k):
-    rs = ec.retrieve(query, top_k, return_scores=True)
-    output_str = ''
-    for i in rs:
-        if rs[i] > 0.5:
-            output_str = output_str + '---> ' + st.session_state.abstracts[i] + '(score: %.2f) \n' %rs[i]
-        else:
-            output_str = output_str + st.session_state.abstracts[i] + '(score: %.2f) \n' %rs[i]
-    return output_str, rs
-def Library(query, top_k=7):
-    rs = ec.retrieve(query, top_k, return_scores=True)
-    op_docs = ''
-    for paperno, i in enumerate(rs):
-        # op_docs.append(abstracts[i])
-        op_docs = op_docs + 'Paper %.0f:' %(paperno+1) +' (published in '+st.session_state.arxiv_corpus['bibcode'][i][0:4] + ') ' + st.session_state.titles[i]  + '\n' + st.session_state.abstracts[i] + '\n\n'
-    # st.write(op_docs)
-    return op_docs
-search = DuckDuckGoSearchAPIWrapper()
-tools = [
-    Tool(
-        name="Library",
-        func=Library,
-        description="A source of information pertinent to your question. Do not answer a question without consulting this!"
-    ),
-    Tool(
-        name="Search",
-        func=search.run,
-        description="useful for when you need to look up knowledge about common topics or current events",
-    )
-]
-if 'tools' not in st.session_state:
-    st.session_state.tools = tools
-# for another question type:
-# First, find the quotes from the document that are most relevant to answering the question, and then print them in numbered order.
-# Quotes should be relatively short. If there are no relevant quotes, write “No relevant quotes” instead.
-gen_llm = openai_llm(temperature=0,model_name='gpt-4o-mini', openai_api_key = openai_key)
-template = """You are an expert astronomer and cosmologist.
-Answer the following question as best you can using information from the library, but speaking in a concise and factual manner.
-If you can not come up with an answer, say you do not know.
-Try to break the question down into smaller steps and solve it in a logical manner.
-You have access to the following tools:
-{tools}
-Use the following format:
-Question: the input question you must answer
-Thought: you should always think about what to do
-Action: the action to take, should be one of [{tool_names}]
-Action Input: the input to the action
-Observation: the result of the action
-... (this Thought/Action/Action Input/Observation can repeat N times)
-Thought: I now know the final answer
-Final Answer: the final answer to the original input question. provide information about how you arrived at the answer, and any nuances or uncertainties the reader should be aware of
-Begin! Remember to speak in a pedagogical and factual manner."
-Question: {input}
-Thought:{agent_scratchpad}"""
-prompt = hub.pull("hwchase17/react")
-prompt.template=template
-from langchain.callbacks import FileCallbackHandler
-from langchain.callbacks.manager import CallbackManager
-# timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-# file_path = f"agent_trace_{timestamp}.txt"
-file_path = "agent_trace.txt"
-file_handler = FileCallbackHandler(file_path)
-callback_manager=CallbackManager([file_handler])
-tool_names = [tool.name for tool in st.session_state.tools]
-if 'agent' not in st.session_state:
-    # agent = ZeroShotAgent(llm_chain=llm_chain, allowed_tools=tool_names)
-    agent = create_react_agent(llm=gen_llm, tools=tools, prompt=prompt)
-    st.session_state.agent = agent
-if 'agent_executor' not in st.session_state:
-    agent_executor = AgentExecutor(agent=st.session_state.agent, tools=st.session_state.tools, verbose=True, handle_parsing_errors=True, callbacks=CallbackManager([file_handler]))
-    st.session_state.agent_executor = agent_executor
 # Streamlit app
 def main():
     # st.title("Question Answering App")
     # Sidebar (Inputs)
     st.sidebar.header("Fine-tune the search")
     top_k = st.sidebar.slider("Number of papers to retrieve:", 3, 30, 10)
     extra_keywords = st.sidebar.text_input("Enter extra keywords (comma-separated):")
     st.sidebar.subheader("Toggles")
-    toggle_a = st.sidebar.checkbox("Weight by keywords")
-    toggle_b = st.sidebar.checkbox("weight by time")
-    toggle_c = st.sidebar.checkbox("Weight by citations")
-    method = st.sidebar.radio("Choose a method:", ["Semantic search", "Semantic search + HyDE", "Semantic search + HyDE + CoHERE"])
-    question_type = st.sidebar.selectbox("Select question type:", ["Single paper", "Multi-paper", "Summary"])
-    # store_output = st.sidebar.checkbox("Store the output")
     store_output = st.sidebar.button("Save output")
     # Main page (Outputs)
     query = st.text_input("Ask me anything:")
     submit_button = st.button("Submit")
     if submit_button:
-        # Process inputs
-        keywords = [kw.strip() for kw in extra_keywords.split(',')] if extra_keywords else []
-        toggles = {'Keyword weighting': toggle_a, 'Time weighting': toggle_b, 'Citation weighting': toggle_c}
-        # Generate outputs
-        answer, rs = answer_question(query, top_k, keywords, toggles, method, question_type)
-        papers_df = get_papers(rs)
-        embedding_plot = create_embedding_plot(rs)
-        triggered_keywords = extract_keywords(query)
-        consensus = estimate_consensus()
-        # Display outputs
-        answer = st.session_state.agent_executor.invoke({"input": query,})
-        st.write(answer["output"])
-        with open(file_path, 'r') as file:
-            intermediate_steps = file.read()
-        st.expander('Intermediate steps', expanded=False).write(intermediate_steps)
-        # st.write(answer)
         with st.expander("Relevant papers", expanded=True):
             # st.dataframe(papers_df, hide_index=True)
             st.data_editor(papers_df,
-                           column_config = {'Link':st.column_config.LinkColumn(display_text= 'https://ui.adsabs.harvard.edu/abs/(.*?)/abstract')}
                            )
         with st.expander("Embedding map", expanded=False):
             st.bokeh_chart(embedding_plot)
         col1, col2 = st.columns(2)
         with col1:
-            st.subheader("Question Type")
-            st.write(question_type)
-            st.subheader("Triggered Keywords")
-            st.write(", ".join(triggered_keywords))
         with col2:
-            st.subheader("Consensus Estimate")
-            st.write(f"{consensus:.2%}")
-        # st.subheader("Papers Used")
-            # st.dataframe(papers_df)
     else:
-        st.info("Use the sidebar to input parameters and submit to see results.")
     if store_output:
         st.toast("Output stored successfully!")
 if __name__ == "__main__":
-    main()

 from typing import List, Dict, Any, Tuple
 from collections import defaultdict
 from tqdm import tqdm
+import pandas as pd
 from datetime import datetime, date
 from datasets import load_dataset, load_from_disk
 from collections import Counter
 from langchain import hub
 from langchain_openai import ChatOpenAI as openai_llm
+from langchain_openai import OpenAIEmbeddings
+from langchain_core.runnables import RunnableConfig, RunnablePassthrough, RunnableParallel
+from langchain_core.prompts import PromptTemplate
 from langchain_community.callbacks import StreamlitCallbackHandler
 from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
+from langchain_community.vectorstores import Chroma
+from langchain_community.document_loaders import TextLoader
+from langchain.agents import create_react_agent, Tool, AgentExecutor
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_core.output_parsers import StrOutputParser
+from langchain.callbacks import FileCallbackHandler
+from langchain.callbacks.manager import CallbackManager
+import instructor
+from pydantic import BaseModel, Field
+from typing import List, Literal
 from nltk.corpus import stopwords
 import nltk
 from openai import OpenAI
+# import anthropic
 import cohere
 import faiss
     nltk.download('stopwords')
     stopwords.words('english')
 from bokeh.plotting import figure
 from bokeh.models import ColumnDataSource
 from bokeh.io import output_notebook
 from bokeh.palettes import Spectral5
 from bokeh.transform import linear_cmap
+ts = time.time()
+# anthropic_key = st.secrets["anthropic_key"]
+openai_key = st.secrets["openai_key"]
+cohere_key = st.secrets['cohere_key']
+gen_llm = openai_llm(temperature=0,model_name='gpt-4o-mini', openai_api_key = openai_key)
+consensus_client = instructor.patch(OpenAI(api_key=openai_key))
+embed_client = OpenAI(api_key = openai_key)
+embed_model = "text-embedding-3-small"
+embeddings = OpenAIEmbeddings(model = embed_model, api_key = openai_key)
 st.image('local_files/pathfinder_logo.png')
         """
         Pathfinder v2.0 is a framework for searching and visualizing astronomy papers on the [arXiv](https://arxiv.org/) and [ADS](https://ui.adsabs.harvard.edu/) using the context
         sensitivity from modern large language models (LLMs) to better parse patterns in paper contexts.
         This tool was built during the [JSALT workshop](https://www.clsp.jhu.edu/2024-jelinek-summer-workshop-on-speech-and-language-technology/) to do awesome things.
+        **👈 Use the sidebar to tweak the search parameters to get better results**.
         ### Tool summary:
         - Please wait while the initial data loads and compiles, this takes about a minute initially.
         This is not meant to be a replacement to existing tools like the
         [ADS](https://ui.adsabs.harvard.edu/),
         that otherwise might be missed during a literature survey.
         It is trained on astro-ph (astrophysics of galaxies) papers up to last-year-ish mined from arxiv and supplemented with ADS metadata,
         if you are interested in extending it please reach out!
+        Also add: feedback form, socials, literature, contact us, copyright, collaboration, etc.
         The image below shows a representation of all the astro-ph.GA papers that can be explored in more detail
         using the `Arxiv embedding` page. The papers tend to cluster together by similarity, and result in an
         atlas that shows well studied (forests) and currently uncharted areas (water).
         """
     )
+# ---------------- get data and set up session state ---------------------------
 # Load the arXiv corpus once per Streamlit session: prefer the on-disk copy in
 # 'data/', otherwise download it from the Hub and save it for the next start-up.
 if 'arxiv_corpus' not in st.session_state:
     with st.spinner('loading data...'):
         try:
             arxiv_corpus = load_from_disk('data/')
         except Exception:
             # Best-effort fallback: any failure to read the local copy triggers a
             # fresh download. `Exception` (rather than a bare except) keeps
             # KeyboardInterrupt / SystemExit from being swallowed here.
             st.write('downloading data')
             # arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data',split='train')
             arxiv_corpus = load_dataset('kiyer/pathfinder_arxiv_data_galaxy',split='train')
             arxiv_corpus.save_to_disk('data/')
         # The FAISS index over the 'embed' column is what Dataset.search() uses later.
         arxiv_corpus.add_faiss_index('embed')
         st.session_state.arxiv_corpus = arxiv_corpus
         st.toast('loaded arxiv corpus')
 else:
     arxiv_corpus = st.session_state.arxiv_corpus
 if 'ids' not in st.session_state:
     st.session_state.ids = arxiv_corpus['ads_id']
     st.session_state.titles = arxiv_corpus['title']
     st.session_state.years = arxiv_corpus['date']
     st.session_state.kws = arxiv_corpus['keywords']
     st.toast('done caching. time taken: %.2f sec' %(time.time()-ts))
 #---------------------------------------------------------------
 # A hack to "clear" the previous result when submitting a new prompt. This avoids
         return True
     return False
+# ---------------- define embedding retrieval systems --------------------------
def get_keywords(text):
    """Return the keyword-like tokens of ``text``.

    The text is lower-cased and passed through the module-level ``nlp``
    pipeline. A token is kept when it is not a stop word, not found in
    ``punctuation``, and tagged as a proper noun, adjective or noun.
    Tokens come back in document order; duplicates are not removed.
    """
    wanted_pos = ['PROPN', 'ADJ', 'NOUN']
    parsed = nlp(text.lower())
    return [
        tok.text
        for tok in parsed
        if not (tok.text in nlp.Defaults.stop_words or tok.text in punctuation)
        and tok.pos_ in wanted_pos
    ]
def parse_doc(text, nret = 10):
    """Return the text of the first ``nret`` entries of ``doc._.phrases``.

    ``text`` is parsed with the module-level ``nlp`` pipeline; the phrases
    are taken in the order the pipeline exposes them (presumably ranked by
    a phrase-ranking component registered elsewhere -- confirm).
    """
    leading_phrases = nlp(text)._.phrases[:nret]
    return [phrase.text for phrase in leading_phrases]
 class EmbeddingRetrievalSystem():
     def __init__(self, weight_citation = False, weight_date = False, weight_keywords = False):
         self.ids = st.session_state.ids
         self.years = st.session_state.years
         self.abstract = st.session_state.abstracts
         self.embed_model = "text-embedding-3-small"
         self.dataset = arxiv_corpus
         self.kws = st.session_state.kws
+        self.cites = st.session_state.cites
         self.weight_citation = weight_citation
         self.weight_date = weight_date
         self.weight_keywords = weight_keywords
         # self.citation_filter = CitationFilter(self.dataset)
         # self.date_filter = DateFilter(self.dataset['date'])
+        # self.keyword_filter = KeywordFilter(corpus=self.dataset, remove_capitals=True)
     def parse_date(self, id):
         # indexval = np.where(self.ids == id)[0][0]
         embeddings = self.client.embeddings.create(input=texts, model=self.embed_model).data
         return [np.array(embedding.embedding, dtype=np.float32) for embedding in embeddings]
     def get_query_embedding(self, query):
         return self.make_embedding(query)
         # xq = query_embedding.reshape(-1,1).T.astype('float32')
         # D, I = self.index.search(xq, top_k)
         # return I[0], D[0]
+        tmp = self.dataset.search('embed', query_embedding, k=top_k)
         return [tmp.indices, tmp.scores]
     def rank_and_filter(self, query, query_embedding, query_date, top_k = 10, return_scores=False, time_result=None):
         """Rank the nearest corpus entries for a query and keep the best ``top_k``.

         The 1000 nearest neighbours of ``query_embedding`` are fetched through
         ``self.calc_faiss``; their distances are inverted into similarities and
         multiplied by up to three optional weights (keyword overlap, recency,
         citation count), each normalised so its maximum is 1.

         The weighting switches are read from ``self.toggles`` (keys
         "Keyword weighting", "Time weighting", "Citation weighting") when that
         attribute has been set by the caller; otherwise the values given to the
         constructor are kept. Extra user keywords are read from
         ``self.query_input_keywords`` when present.

         Args:
             query: the raw query text, used for keyword extraction.
             query_embedding: embedding vector of the query.
             query_date: accepted for interface compatibility; not used here.
             top_k: number of results to return.
             return_scores: when true, return ``{corpus_index: score}`` (ordered
                 best first) instead of a bare list of indices.
             time_result: accepted for interface compatibility; not used here.

         Returns:
             A list of corpus indices, best first, or a dict mapping corpus
             index to weighted score when ``return_scores`` is true.

         Side effects:
             Updates ``self.weight_keywords``, ``self.weight_date``,
             ``self.weight_citation`` and ``self.query_kws``.
         """
         # The UI attaches `toggles` / `query_input_keywords` after construction;
         # fall back to the constructor settings when they are absent.
         toggles = getattr(self, 'toggles', None) or {}
         self.weight_keywords = toggles.get("Keyword weighting", self.weight_keywords)
         self.weight_date = toggles.get("Time weighting", self.weight_date)
         self.weight_citation = toggles.get("Citation weighting", self.weight_citation)

         topk_indices, distances = self.calc_faiss(np.array(query_embedding), top_k = 1000)

         input_kws = list(getattr(self, 'query_input_keywords', None) or [])
         query_kws = get_keywords(query) + input_kws
         self.query_kws = query_kws

         n_candidates = len(topk_indices)
         if n_candidates == 0:
             # Nothing to rank; also avoids np.amax on empty arrays below.
             return {} if return_scores else []

         # Convert a distance (less is better) into a similarity (more is better).
         # Flooring the distance keeps an exact match from producing a division by zero.
         distances = np.asarray(distances, dtype=float)
         similarities = 1.0 / np.maximum(distances, 1e-12)

         if self.weight_keywords:
             # Every (query keyword, paper keyword) substring match adds 0.1 on top
             # of a 0.1 floor, so papers without matches are damped, not removed.
             needles = [kw.lower() for kw in query_kws]
             kw_weight = np.full(n_candidates, 0.1)
             for pos, idx in enumerate(topk_indices):
                 paper_kws = [kw.lower() for kw in self.kws[idx]]
                 hits = sum(1 for needle in needles for kw in paper_kws if needle in kw)
                 kw_weight[pos] += 0.1 * hits
             kw_weight = kw_weight / np.amax(kw_weight)
         else:
             kw_weight = np.ones((n_candidates,))

         if self.weight_date:
             # Logistic decay in paper age (years), with a 0.7-year scale.
             today = datetime.now().date()
             age_years = np.array([(today - self.years[idx]).days / 365. for idx in topk_indices])
             age_weight = (1 + np.exp(age_years / 0.7))**(-1)
             age_weight = age_weight / np.amax(age_weight)
         else:
             age_weight = np.ones((n_candidates,))

         if self.weight_citation:
             # Logistic in citation count, saturating at 300 citations.
             capped = np.minimum(np.array([self.cites[idx] for idx in topk_indices], dtype=float), 300.)
             cite_weight = (1 + np.exp((300 - capped) / 42.0))**(-1.)
             cite_weight = cite_weight / np.amax(cite_weight)
         else:
             cite_weight = np.ones((n_candidates,))

         scores = similarities * kw_weight * age_weight * cite_weight

         # Stable sort so that ties keep the FAISS order, as sorted(reverse=True) did.
         best = np.argsort(-scores, kind='stable')[:top_k]

         if return_scores:
             # Callers such as get_paper_df index the result by corpus index to
             # read the relevance score, so hand back a mapping.
             return {int(topk_indices[i]): float(scores[i]) for i in best}

         # Only keep the document IDs
         return [topk_indices[i] for i in best]
     def retrieve(self, query, top_k, time_result=None, query_date = None, return_scores = False):
         """Embed ``query`` and return the ranked results from ``rank_and_filter``.

         When ``time_result`` is not supplied it is either computed with
         ``self.analyze_temporal_query`` (if ``self.weight_date`` is set) or
         filled with a "no temporal aspect" placeholder.
         """
         embedded_query = self.get_query_embedding(query)

         # Judge time relevance
         if time_result is None and self.weight_date:
             # NOTE(review): relies on self.anthropic_client, which is not set in
             # the visible part of __init__ -- confirm it still exists.
             time_result, _elapsed = self.analyze_temporal_query(query, self.anthropic_client)
         elif time_result is None:
             time_result = {'has_temporal_aspect': False, 'expected_year_filter': None, 'expected_recency_weight': None}

         return self.rank_and_filter(
             query,
             embedded_query,
             query_date,
             top_k,
             return_scores = return_scores,
             time_result = time_result,
         )
 class HydeRetrievalSystem(EmbeddingRetrievalSystem):
+    def __init__(self, generation_model: str = "claude-3-haiku-20240307",
+                 embedding_model: str = "text-embedding-3-small",
+             temperature: float = 0.5,
+                 max_doclen: int = 500,
+                 generate_n: int = 1,
+                 embed_query = True,
                  conclusion = False, **kwargs):
         # Handle the kwargs for the superclass init -- filters/citation weighting
         super().__init__(**kwargs)
         if max_doclen * generate_n > 8191:
             raise ValueError("Too many tokens. Please reduce max_doclen or generate_n.")
         self.embedding_model = embedding_model
         self.generation_model = generation_model
         self.embed_query = embed_query # embed the query vector?
         self.conclusion = conclusion # generate conclusion as well?
+        # self.anthropic_key = anthropic_key
+        # self.generation_client = anthropic.Anthropic(api_key = self.anthropic_key)
+        self.generation_client = openai_llm(temperature=0,model_name='gpt-4o-mini', openai_api_key = openai_key)
     def retrieve(self, query: str, top_k: int = 10, return_scores = False, time_result = None) -> List[Tuple[str, str, float]]:
         """Retrieve with HyDE: rank against the mean embedding of generated abstracts.

         Hypothetical abstracts are generated for ``query``, shown in a collapsed
         Streamlit expander, and embedded. If ``self.embed_query`` is set the
         query's own embedding is added to the set before averaging. The
         averaged vector is passed to ``rank_and_filter``, whose result is
         returned unchanged.
         """
         if time_result is None:
             if self.weight_date:
                 time_result, _elapsed = analyze_temporal_query(query, self.anthropic_client)
             else:
                 time_result = {'has_temporal_aspect': False, 'expected_year_filter': None, 'expected_recency_weight': None}

         hypothetical_docs = self.generate_docs(query)
         st.expander('Abstract generated with hyde', expanded=False).write(hypothetical_docs)

         vectors = self.embed_docs(hypothetical_docs)
         if self.embed_query:
             vectors.append(self.embed_docs([query])[0])
         mean_vector = np.mean(np.array(vectors), axis = 0)

         return self.rank_and_filter(query, mean_vector, query_date=None, top_k = top_k, return_scores = return_scores, time_result = time_result)
     def generate_doc(self, query: str):
         """Ask ``self.generation_client`` for one hypothetical abstract answering ``query``.

         The system prompt embeds ``self.max_doclen`` as the requested maximum
         length; the reply's ``content`` is returned.
         """
         system_prompt = """You are an expert astronomer. Given a scientific query, generate the abstract of an expert-level research paper
                             that answers the question. Stick to a maximum length of {} tokens and return just the text of the abstract and conclusion.
                             Do not include labels for any section. Use research-specific jargon.""".format(self.max_doclen)
         chat = [("system", system_prompt), ("human", query)]
         reply = self.generation_client.invoke(chat)
         return reply.content
     def generate_docs(self, query: str):
         """Generate ``self.generate_n`` hypothetical abstracts for ``query``, one call at a time."""
         return [self.generate_doc(query) for _ in range(self.generate_n)]
     def embed_docs(self, docs: List[str]):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
+        self.cohere_key = cohere_key
         self.cohere_client = cohere.Client(self.cohere_key)
+    def retrieve(self, query: str,
+                 top_k: int = 10,
                  rerank_top_k: int = 250,
                  return_scores = False, time_result = None,
                  reweight = False) -> List[Tuple[str, str, float]]:
         if time_result is None:
             if self.weight_date: time_result, time_taken = analyze_temporal_query(query, self.anthropic_client)
             else: time_result = {'has_temporal_aspect': False, 'expected_year_filter': None, 'expected_recency_weight': None}
         top_results = super().retrieve(query, top_k = rerank_top_k, time_result = time_result)
         # doc_texts = self.get_document_texts(top_results)
         # docs_for_rerank = [f"Abstract: {doc['abstract']}\nConclusions: {doc['conclusions']}" for doc in doc_texts]
         docs_for_rerank = [self.abstract[i] for i in top_results]
         if len(docs_for_rerank) == 0:
             return []
         reranked_results = self.cohere_client.rerank(
             query=query,
             documents=docs_for_rerank,
             model='rerank-english-v3.0',
             top_n=top_k
         )
         final_results = []
         for result in reranked_results.results:
             doc_id = top_results[result.index]
         if reweight:
             if time_result['has_temporal_aspect']:
                 final_results = self.date_filter.filter(final_results, time_score = time_result['expected_recency_weight'])
             if self.weight_citation: self.citation_filter.filter(final_results)
         if return_scores:
             return {result[0]: result[2] for result in final_results}
         return self.embed_batch(docs)
 # ----------------------------------------------------------------
 # Reuse the retrieval system cached in the session; build it on first run.
 if 'ec' in st.session_state:
     ec = st.session_state.ec
 else:
     ec = HydeCohereRetrievalSystem(weight_keywords=True)
     st.session_state.ec = ec
     st.toast('loaded retrieval system')
def get_topk(query, top_k):
    """Run the session's retrieval system on ``query``, asking it for scores."""
    print('running retrieval')
    retrieval_system = st.session_state.ec
    return retrieval_system.retrieve(query, top_k, return_scores=True)
def Library(query, top_k = 7):
    """Return the top papers for ``query`` as one text block for the agent.

    Each retrieved paper contributes
    ``"Paper N: (published in YYYY) <title>\\n<abstract>\\n\\n"``, where YYYY is
    the first four characters of the paper's bibcode.
    """
    rs = get_topk(query, top_k = top_k)
    # Fetch the bibcode column once instead of once per paper.
    bibcodes = st.session_state.arxiv_corpus['bibcode']
    titles = st.session_state.titles
    abstracts = st.session_state.abstracts
    return ''.join(
        f"Paper {paperno + 1}: (published in {bibcodes[i][0:4]}) {titles[i]}\n{abstracts[i]}\n\n"
        for paperno, i in enumerate(rs)
    )
def Library2(query, top_k = 7):
    """Return ``(abstracts, bibcodes, rs)`` for the top papers matching ``query``.

    ``rs`` is the raw result of ``get_topk``; the two lists follow its
    iteration order.
    """
    rs = get_topk(query, top_k = top_k)
    # Fetch the bibcode column once instead of once per paper.
    bibcodes = st.session_state.arxiv_corpus['bibcode']
    abstracts = st.session_state.abstracts
    absts = [abstracts[i] for i in rs]
    fnames = [bibcodes[i] for i in rs]
    return absts, fnames, rs
def get_paper_df(ids):
    """Build a summary table for retrieved papers.

    Args:
        ids: mapping from corpus index to relevance score; it is iterated for
            the indices and indexed (``ids[i]``) for the score.

    Returns:
        A DataFrame with columns Title, Relevance, Year (first four characters
        of the bibcode), ADS Link, Citations and Keywords, one row per index.
    """
    corpus = st.session_state.arxiv_corpus
    # Read each dataset column once rather than on every loop iteration.
    bibcodes = corpus['bibcode']
    cite_counts = corpus['cites']
    ads_keywords = corpus['ads_keywords']
    titles = st.session_state.titles

    indices = list(ids)
    return pd.DataFrame({
        'Title': [titles[i] for i in indices],
        'Relevance': [ids[i] for i in indices],
        'Year': [bibcodes[i][0:4] for i in indices],
        'ADS Link': ['https://ui.adsabs.harvard.edu/abs/' + bibcodes[i] + '/abstract' for i in indices],
        'Citations': [cite_counts[i] for i in indices],
        'Keywords': [ads_keywords[i] for i in indices],
    })
+# def find_outliers(inp_simids, arxiv_cutoff_distance = 0.8):
+#
+#     inp_simids = np.array(inp_simids)
+#
+#     # Calculate the centroid for each point, excluding itself
+#     orange_black_points = st.session_state.embed[inp_simids]
+#
+#     topk_dists = []
+#     for i, point in enumerate(orange_black_points):
+#         # Exclude the current point
+#         other_points = np.delete(orange_black_points, i, axis=0)
+#         # Calculate centroid of other points
+#         centroid = np.mean(other_points, axis=0)
+#         # Calculate distance from the point to this centroid
+#         dist = np.sqrt(np.sum((point - centroid)**2))
+#         topk_dists.append(dist)
+#
+#     topk_dists = np.array(topk_dists)
+#
+#     # Separate distances for orange and black points
+#     orange_distances = topk_dists[:len(inp_simids)]
+#     black_distances = topk_dists[len(inp_simids):]
+#
+#     # Calculate the median of distances
+#     orange_black_distances = topk_dists
+#     median_topk_distance = np.median(orange_black_distances)
+#
+#     # def get_sims_and_dists(inp_data):
+#
+#     #     all_sims, all_dists = [], []
+#
+#     #     np.random.seed(12)
+#     #     rand_indices = np.random.choice(inp_data.shape[0], size=return_n, replace=False)
+#
+#     #     for j in tqdm(range(len(rand_indices))):
+#
+#     #         i = rand_indices[j]
+#     #         inferred_vector = inp_data[i,:]
+#     #         sims, dists = find_closest_dists(i, inp_data, return_n + 1)
+#     #         all_sims.append(sims[1:])
+#     #         all_dists.append(dists[1:])
+#
+#     #     return np.array(all_sims), np.array(all_dists)
+#
+#     # # Identify papers with distances greater than the 95th percentile
+#     # _, all_dists = get_sims_and_dists(arxiv_ada_embeddings)
+#     # arxiv_cutoff_distance = find_cutoff_dist(all_dists)
+#     # hardcoding for now
+#     outlier_indices = inp_simids[np.where(orange_black_distances > arxiv_cutoff_distance)[0]]
+#     # outlier_titles = [titles[i] for i in outlier_indices]
+#
+#     return outlier_indices #, outlier_titles
 def create_embedding_plot(rs):
+    """
+    function to create embedding plot
+    """
     pltsource = ColumnDataSource(data=dict(
         x=st.session_state.arxiv_corpus['umap_x'],
         title=st.session_state.titles,
         link=st.session_state.arxiv_corpus['bibcode'],
     ))
     rsflag = np.zeros((len(st.session_state.ids),))
     rsflag[np.array([k for k in rs])] = 1
+    # outflag = np.zeros((len(st.session_state.ids),))
+    # outflag[np.array([k for k in find_outliers(rs)])] = 1
     pltsource.data['colors'] = rsflag * 0.8 + 0.1
+    # pltsource.data['colors'][outflag] = 0.5
     pltsource.data['sizes'] = (rsflag + 1)**5 / 100
     TOOLTIPS = """
     @link <br> <br>
     </div>
     """
     mapper = linear_cmap(field_name="colors", palette=Spectral5, low=0., high=1.)
     p = figure(width=700, height=900, tooltips=TOOLTIPS, x_range=(0, 20), y_range=(-4.2,18),
             title="UMAP projection of embeddings for the astro-ph corpus")
     p.axis.visible=False
     p.grid.visible=False
     p.outline_line_alpha = 0.
     p.circle('x', 'y', radius='sizes', source=pltsource, alpha=0.3, fill_color=mapper, fill_alpha='colors', line_color="lightgrey",line_alpha=0.1)
     return p
def extract_keywords(question, ec):
    """Placeholder keyword extractor: ignores both arguments and returns fixed keywords."""
    # Simulated keyword extraction (replace with actual logic)
    placeholder_keywords = ['keyword1', 'keyword2', 'keyword3']
    return placeholder_keywords
     # Simulated consensus estimation (replace with actual calculation)
     return 0.75
def run_agent_qa(query, top_k):
    """Answer ``query`` with a ReAct agent that can use the paper library and web search.

    The agent has two tools: ``Library`` (the local retrieval function) and a
    DuckDuckGo search. Tools and the agent are cached in ``st.session_state``;
    the executor is rebuilt on every call so that its trace handler always
    points at the freshly created ``agent_trace.txt``.

    Args:
        query: the user's question.
        top_k: accepted for interface compatibility; it is not used here, so
            the ``Library`` tool runs with its own default.

    Returns:
        The dict returned by ``AgentExecutor.invoke``.
    """
    # define tools
    search = DuckDuckGoSearchAPIWrapper()
    tools = [
        Tool(
            name="Library",
            func=Library,
            description="A source of information pertinent to your question. Do not answer a question without consulting this!"
        ),
        Tool(
            name="Search",
            func=search.run,
            description="useful for when you need to look up knowledge about common topics or current events",
        )
    ]

    if 'tools' not in st.session_state:
        st.session_state.tools = tools

    # define prompt
    # for another question type:
    # First, find the quotes from the document that are most relevant to answering the question, and then print them in numbered order.
    # Quotes should be relatively short. If there are no relevant quotes, write “No relevant quotes” instead.
    template = """You are an expert astronomer and cosmologist.
    Answer the following question as best you can using information from the library, but speaking in a concise and factual manner.
    If you can not come up with an answer, say you do not know.
    Try to break the question down into smaller steps and solve it in a logical manner.
    You have access to the following tools:
    {tools}
    Use the following format:
    Question: the input question you must answer
    Thought: you should always think about what to do
    Action: the action to take, should be one of [{tool_names}]
    Action Input: the input to the action
    Observation: the result of the action
    ... (this Thought/Action/Action Input/Observation can repeat N times)
    Thought: I now know the final answer
    Final Answer: the final answer to the original input question. provide information about how you arrived at the answer, and any nuances or uncertainties the reader should be aware of
    Begin! Remember to speak in a pedagogical and factual manner."
    Question: {input}
    Thought:{agent_scratchpad}"""

    # Build the prompt locally: the hub prompt's template was fully overwritten
    # anyway, so pulling it only added a network round-trip per question.
    prompt = PromptTemplate.from_template(template)

    # path to write intermediate trace to; start each run from an empty file
    file_path = "agent_trace.txt"
    try:
        os.remove(file_path)
    except OSError:
        # Missing (or un-removable) trace file: carry on, the handler below
        # opens the path itself.
        pass

    file_handler = FileCallbackHandler(file_path)

    # define and execute agent
    if 'agent' not in st.session_state:
        # agent = ZeroShotAgent(llm_chain=llm_chain, allowed_tools=tool_names)
        st.session_state.agent = create_react_agent(llm=gen_llm, tools=tools, prompt=prompt)

    # Do not reuse a cached executor: it would keep writing to the handler that
    # was opened for an earlier (now deleted) trace file.
    agent_executor = AgentExecutor(
        agent=st.session_state.agent,
        tools=st.session_state.tools,
        verbose=True,
        handle_parsing_errors=True,
        callbacks=CallbackManager([file_handler]),
    )
    st.session_state.agent_executor = agent_executor

    answer = agent_executor.invoke({"input": query,})
    return answer
def make_rag_qa_answer(query, top_k = 10):
    """Answer ``query`` with a single-pass RAG chain over the retrieved abstracts.

    The ``top_k`` abstracts from ``Library2`` are staged as text files under
    ``absts/`` (named by bibcode) so that ``TextLoader`` records that path as
    each document's source, split into 150-character chunks, embedded into a
    temporary Chroma collection, and the six most similar chunks are given to
    the LLM together with the question.

    Returns:
        ``(rag_answer, rs)`` where ``rag_answer`` is the chain output (a dict
        with ``context``, ``question`` and ``answer``) and ``rs`` is the
        retrieval result from ``Library2``.
    """
    absts, fhdrs, rs = Library2(query, top_k = top_k)

    os.makedirs('absts', exist_ok=True)
    documents = []
    for abstract, bibcode in zip(absts, fhdrs):
        path = "absts/" + bibcode + ".txt"
        try:
            with open(path, "w", encoding="utf-8") as text_file:
                text_file.write(abstract)
            documents.append(TextLoader(path, encoding="utf-8").load()[0])
        finally:
            # The file is only a staging area; never leave it behind.
            if os.path.exists(path):
                os.remove(path)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=50, add_start_index=True)
    splits = text_splitter.split_documents(documents)
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings, collection_name='retdoc4')

    template = """You are an expert astronomer and cosmologist.
    Answer the following question as best you can using information from the library, but speaking in a concise and factual manner.
    If you can not come up with an answer, say you do not know.
    Try to break the question down into smaller steps and solve it in a logical manner.
    Provide information about how you arrived at the answer, and any nuances or uncertainties the reader should be aware of.
    Begin! Remember to speak in a pedagogical and factual manner."
    Relevant documents:{context}
    Question: {question}
    Answer:"""
    prompt = PromptTemplate.from_template(template)

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    try:
        # retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6,  "fetch_k": len(splits)})
        retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

        rag_chain_from_docs = (
            RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
            | prompt
            | gen_llm
            | StrOutputParser()
        )
        rag_chain_with_source = RunnableParallel(
            {"context": retriever, "question": RunnablePassthrough()}
        ).assign(answer=rag_chain_from_docs)

        rag_answer = rag_chain_with_source.invoke(query, )
    finally:
        # The collection name is fixed, so chunks left over from a failed run
        # would leak into the next question's context.
        vectorstore.delete_collection()

    return rag_answer, rs
def guess_question_type(query: str):
    """Ask the LLM to classify ``query`` into one of nine question categories.

    Returns the raw model reply, which the prompt asks to be formatted as a
    ``<categorization>`` block containing ``Category:`` and ``Explanation:``.

    The retrieval system's ``generation_client`` is used when it has one;
    otherwise the module-level ``gen_llm`` (configured the same way) is used,
    so the call also works when the plain semantic-search retriever is active.
    """
    categorization_prompt = """You are an expert astrophysicist and computer scientist specializing in linguistics and semantics. Your task is to categorize a given query into one of the following categories:
    1. Summarization
    2. Single-paper factual
    3. Multi-paper factual
    4. Named entity recognition
    5. Jargon-specific questions / overloaded words
    6. Time-sensitive
    7. Consensus evaluation
    8. What-ifs and counterfactuals
    9. Compositional
    Analyze the query carefully, considering its content, structure, and implications. Then, determine which of the above categories best fits the query.
    In your analysis, consider the following:
    - Does the query ask for a well-known datapoint or mechanism?
    - Can it be answered by a single paper or does it require multiple sources?
    - Does it involve proper nouns or specific scientific terms?
    - Is it time-dependent or likely to change in the near future?
    - Does it require evaluating consensus across multiple sources?
    - Is it a hypothetical or counterfactual question?
    - Does it need to be broken down into sub-queries (i.e. compositional)?
    After your analysis, categorize the query into one of the nine categories listed above.
    Provide a brief explanation for your categorization, highlighting the key aspects of the query that led to your decision.
    Present your final answer in the following format:
    <categorization>
    Category: [Selected category]
    Explanation: [Your explanation for the categorization]
    </categorization>"""

    messages = [("system",categorization_prompt,),("human", query),]
    # Only the HyDE-based retrievers are seen to set `generation_client`.
    llm = getattr(st.session_state.ec, 'generation_client', gen_llm)
    return llm.invoke(messages).content
# Allowed consensus labels, ordered from strongest agreement to strongest disagreement.
_ConsensusLevel = Literal[
    "Strong Agreement",
    "Moderate Agreement",
    "Weak Agreement",
    "No Clear Consensus",
    "Weak Disagreement",
    "Moderate Disagreement",
    "Strong Disagreement",
]


class OverallConsensusEvaluation(BaseModel):
    # Structured result requested from the LLM via `response_model`.
    # Deliberately documented with comments rather than a class docstring:
    # a docstring would become part of the schema sent to the model.
    consensus: _ConsensusLevel = Field(
        ...,
        description="The overall level of consensus between the query and the abstracts"
    )
    explanation: str = Field(
        ...,
        description="A detailed explanation of the consensus evaluation"
    )
    # Constrained to the closed interval [0, 1].
    relevance_score: float = Field(
        ...,
        description="A score from 0 to 1 indicating how relevant the abstracts are to the query overall",
        ge=0,
        le=1
    )
def evaluate_overall_consensus(query: str, abstracts: List[str]) -> OverallConsensusEvaluation:
    """
    Judge, in one LLM call, how strongly the given abstracts agree with the query.

    The abstracts are numbered from 1 and joined into a single prompt; the
    reply is parsed into an ``OverallConsensusEvaluation`` by ``consensus_client``.
    """
    numbered_abstracts = ' '.join(
        f"Abstract {position}: {text}" for position, text in enumerate(abstracts, start=1)
    )
    abstract_count = len(abstracts)
    prompt = f"""
    Query: {query}
    You will be provided with {abstract_count} scientific abstracts. Your task is to:
    1. Evaluate the overall consensus between the query and the abstracts.
    2. Provide a detailed explanation of your consensus evaluation.
    3. Assign an overall relevance score from 0 to 1, where 0 means completely irrelevant and 1 means highly relevant.
    For the consensus evaluation, use one of the following levels:
    Strong Agreement, Moderate Agreement, Weak Agreement, No Clear Consensus, Weak Disagreement, Moderate Disagreement, Strong Disagreement
    Here are the abstracts:
    {numbered_abstracts}
    Provide your evaluation in a structured format.
    """
    system_message = {"role": "system", "content": """You are an assistant with expertise in astrophysics for question-answering tasks.
            Evaluate the overall consensus of the retrieved scientific abstracts in relation to a given query.
            If you don't know the answer, just say that you don't know.
            Use six sentences maximum and keep the answer concise."""}
    user_message = {"role": "user", "content": prompt}

    return consensus_client.chat.completions.create(
        model="gpt-4",
        response_model=OverallConsensusEvaluation,
        messages=[system_message, user_message],
        temperature=0
    )
 # Streamlit app
 def _parse_keywords(raw):
     """Split a comma-separated string into stripped, non-empty keywords."""
     return [kw.strip() for kw in (raw or '').split(',') if kw.strip()]


 def _extract_categorization(text):
     """Strip the <categorization> wrapper tags from *text* when present.

     Either tag may be missing; whichever one is found is removed together
     with everything outside of it, and the remaining text is returned.
     """
     if '<categorization>' in text:
         text = text.split('<categorization>')[1]
     if '</categorization>' in text:
         text = text.split('</categorization>')[0]
     return text


 def main() -> None:
     """Render the Streamlit page: sidebar inputs, query box and results.

     The sidebar collects the number of papers, extra keywords, weighting
     toggles, the retrieval method, the generation method and the question
     type. When "Submit" is pressed with a non-empty query, the selected
     retrieval system stored in ``st.session_state.ec`` is configured, the
     answer is generated (agent or RAG), and the retrieved papers, embedding
     map, question-type suggestion and consensus estimate are displayed.
     """
     # ---- Sidebar (inputs) ----
     st.sidebar.header("Fine-tune the search")
     top_k = st.sidebar.slider("Number of papers to retrieve:", 3, 30, 10)
     extra_keywords = st.sidebar.text_input("Enter extra keywords (comma-separated):")
     st.sidebar.subheader("Toggles")
     toggle_a = st.sidebar.toggle("Weight by keywords", value=False)
     toggle_b = st.sidebar.toggle("Weight by date", value=False)
     toggle_c = st.sidebar.toggle("Weight by citations", value=False)

     # Radio label -> retrieval system class; insertion order is the display order.
     retrieval_systems = {
         "Semantic search": EmbeddingRetrievalSystem,
         "Semantic search + HyDE": HydeRetrievalSystem,
         "Semantic search + HyDE + CoHERE": HydeCohereRetrievalSystem,
     }
     method = st.sidebar.radio("Retrieval method:", list(retrieval_systems), index=2)
     # NOTE(review): this rebuilds the retrieval system on every run of main(),
     # i.e. on every widget interaction. If construction is expensive, consider
     # rebuilding only when `method` changes -- confirm the systems hold no
     # per-run state first.
     with st.spinner('set retrieval method to ' + method):
         st.session_state.ec = retrieval_systems[method](weight_keywords=True)

     # Radio label -> short code checked further down when generating the answer.
     gen_methods = {"Basic RAG": 'rag', "ReAct Agent": 'agent'}
     method2 = st.sidebar.radio("Generation complexity:", list(gen_methods))
     st.session_state.gen_method = gen_methods[method2]

     question_type = st.sidebar.selectbox("Select question type:", ["Single paper", "Multi-paper", "Summary"])
     store_output = st.sidebar.button("Save output")

     # ---- Main page (outputs) ----
     query = st.text_input("Ask me anything:")
     submit_button = st.button("Submit")

     if submit_button and not query.strip():
         # Do not spend retrieval / LLM calls on an empty question.
         st.warning("Please enter a question before submitting.")
     elif submit_button:
         search_text_list = ['rooting around in the paper pile...','looking for clarity...','scanning the event horizon...','peering into the abyss...','potatoes power this ongoing search...']
         with st.spinner(search_text_list[np.random.choice(len(search_text_list))]):
             # Hand the sidebar settings to the active retrieval system.
             ec = st.session_state.ec
             ec.query_input_keywords = _parse_keywords(extra_keywords)
             ec.toggles = {'Keyword weighting': toggle_a, 'Time weighting': toggle_b, 'Citation weighting': toggle_c}
             ec.question_type = question_type
             ec.rag_method = method
             ec.gen_method = method2

             if st.session_state.gen_method == 'agent':
                 answer = run_agent_qa(query, top_k)
                 rs = get_topk(query, top_k)
                 st.write(answer["output"])
                 # The agent trace is read back from disk; a missing or
                 # unreadable file should not take down an answer that has
                 # already been displayed.
                 try:
                     with open("agent_trace.txt", 'r') as file:
                         intermediate_steps = file.read()
                 except OSError:
                     intermediate_steps = None
                 if intermediate_steps is not None:
                     st.expander('Intermediate steps', expanded=False).write(intermediate_steps)
             else:
                 answer, rs = make_rag_qa_answer(query, top_k)
                 st.write(answer['answer'])

             papers_df = get_paper_df(rs)
             embedding_plot = create_embedding_plot(rs)
             triggered_keywords = st.session_state.ec.query_kws
             st.write('**Triggered keywords:** `'+ "`, `".join(triggered_keywords)+'`')

         with st.expander("Relevant papers", expanded=True):
             st.data_editor(papers_df,
                            column_config = {'ADS Link':st.column_config.LinkColumn(display_text= 'https://ui.adsabs.harvard.edu/abs/(.*?)/abstract')}
                            )
         with st.expander("Embedding map", expanded=False):
             st.bokeh_chart(embedding_plot)

         col1, col2 = st.columns(2)
         with col1:
             st.subheader("Question type suggestion")
             question_type_gen = _extract_categorization(guess_question_type(query))
             # Two trailing spaces before the newline force a markdown line break.
             st.markdown(question_type_gen.replace('\n','  \n'))
         with col2:
             consensus_answer = evaluate_overall_consensus(query, [st.session_state.abstracts[i] for i in rs])
             st.subheader("Consensus: "+consensus_answer.consensus)
             st.markdown(consensus_answer.explanation)
             st.markdown(f'Relevance of retrieved papers to answer: {consensus_answer.relevance_score:.1f}')
     else:
         st.info("Use the sidebar to tweak the search parameters to get better results.")

     if store_output:
         # NOTE(review): nothing is persisted here yet; the toast is the only effect.
         st.toast("Output stored successfully!")
 # Script entry point: build the page only when this module is run as __main__.
 if __name__ == "__main__":
+    main()

data/data-00000-of-00001.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2534048757c630a5a9addf362d3077da0427e55ae1cae0c93dd213363ddfbcc7
+size 498031096

data/dataset_info.json ADDED Viewed

	@@ -0,0 +1,134 @@

+{
+  "builder_name": "parquet",
+  "citation": "",
+  "config_name": "default",
+  "dataset_name": "pathfinder_arxiv_data_galaxy",
+  "dataset_size": 505886100,
+  "description": "",
+  "download_checksums": {
+    "hf://datasets/kiyer/pathfinder_arxiv_data_galaxy@29754b03f3cd82e4051ece1cf96605f8756bc197/data/train-00000-of-00001.parquet": {
+      "num_bytes": 379674094,
+      "checksum": null
+    }
+  },
+  "download_size": 379674094,
+  "features": {
+    "ads_id": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "arxiv_id": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "title": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "abstract": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "embed": {
+      "feature": {
+        "dtype": "float32",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "umap_x": {
+      "dtype": "float32",
+      "_type": "Value"
+    },
+    "umap_y": {
+      "dtype": "float32",
+      "_type": "Value"
+    },
+    "date": {
+      "dtype": "date32",
+      "_type": "Value"
+    },
+    "cites": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "bibcode": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "keywords": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "ads_keywords": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "read_count": {
+      "dtype": "int64",
+      "_type": "Value"
+    },
+    "doi": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "authors": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "aff": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "cite_bibcodes": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "ref_bibcodes": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    }
+  },
+  "homepage": "",
+  "license": "",
+  "size_in_bytes": 885560194,
+  "splits": {
+    "train": {
+      "name": "train",
+      "num_bytes": 505886100,
+      "num_examples": 41195,
+      "shard_lengths": [
+        41000,
+        195
+      ],
+      "dataset_name": "pathfinder_arxiv_data_galaxy"
+    }
+  },
+  "version": {
+    "version_str": "0.0.0",
+    "major": 0,
+    "minor": 0,
+    "patch": 0
+  }
+}

data/state.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "61bcd9aec14a17d4",
+  "_format_columns": null,
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": "train"
+}

requirements.txt CHANGED Viewed

@@ -10,7 +10,8 @@ langchain_community
 langchain_core
 langchainhub
 openai
-anthropic
 feedparser
 tiktoken
 chromadb

 langchain_core
 langchainhub
 openai
+instructor
+pydantic
 feedparser
 tiktoken
 chromadb