pathfinder_v3

Running

App Files Files Community

kiyer committed on Jan 15, 2024

Commit

00bd3f7

1 Parent(s): e75fcb6

added time window applet

Browse files

Files changed (5) hide show

.DS_Store +0 -0
.gitignore +2 -0
app.py +3 -1
pages/5_research_hotspots.py +129 -0
pages/{5_qa_sources_v1.py → 6_qa_sources_v1.py} +0 -0

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+
2	+ .DS_Store

app.py CHANGED Viewed

@@ -22,7 +22,9 @@ st.markdown(
     ### Tool summary:
     - `Paper search` looks for relevant papers given an arxiv id or a question.
     - `Arxiv embedding` shows the landscape of current galaxy evolution papers (astro-ph.GA)
-    - `QA sources` brings it all together to give concise answers to questions with primary sources and relevant papers.
     This is not meant to be a replacement to existing tools like the
     [ADS](https://ui.adsabs.harvard.edu/),

     ### Tool summary:
     - `Paper search` looks for relevant papers given an arxiv id or a question.
     - `Arxiv embedding` shows the landscape of current galaxy evolution papers (astro-ph.GA)
+    - `Answering questions` brings it all together using RAG to give concise answers to questions with primary sources and relevant papers.
+    - `Author search` uses a list of authors for the papers to visualize trajectories of individual researchers or groups over time.
+    - `Research hotspots` uses paper ages to visualize excess research at a particular time in the past in different parts of the embedding space.
     This is not meant to be a replacement to existing tools like the
     [ADS](https://ui.adsabs.harvard.edu/),

pages/5_research_hotspots.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import os
+import datetime
+import faiss
+import streamlit as st
+import feedparser
+import urllib
+import cloudpickle as cp
+import pickle
+from urllib.request import urlopen
+from summa import summarizer
+import numpy as np
+import matplotlib.pyplot as plt
+import requests
+import json
+from langchain_openai import AzureOpenAIEmbeddings
+from langchain.llms import OpenAI
+from langchain_openai import AzureChatOpenAI
+# Azure OpenAI configuration. Credentials and endpoints come from Streamlit
+# secrets; the API type, key and version are also exported as environment
+# variables.
+os.environ["OPENAI_API_TYPE"] = "azure"
+os.environ["AZURE_ENDPOINT"] = st.secrets["endpoint1"]
+os.environ["OPENAI_API_KEY"] = st.secrets["key1"]
+os.environ["OPENAI_API_VERSION"] = "2023-05-15"
+# Embedding client on the first endpoint. No key is passed explicitly here,
+# so it presumably picks one up from the environment set above -- TODO confirm.
+# NOTE(review): `embeddings` is not referenced again in the visible part of
+# this file.
+embeddings = AzureOpenAIEmbeddings(
+    deployment="embedding",
+    model="text-embedding-ada-002",
+    azure_endpoint=st.secrets["endpoint1"],
+)
+# Chat client on the second endpoint, with its own key and API version and
+# temperature 0.
+# NOTE(review): `llm` is not referenced again in the visible part of this file.
+llm = AzureChatOpenAI(
+        deployment_name="gpt4_small",
+        openai_api_version="2023-12-01-preview",
+        azure_endpoint=st.secrets["endpoint2"],
+        openai_api_key=st.secrets["key2"],
+        openai_api_type="azure",
+        temperature=0.
+    )
+@st.cache_data
+def get_feeds_data(url):
+    # data = cp.load(urlopen(url))
+    with open(url, "rb") as fp:
+        data = pickle.load(fp)
+    st.sidebar.success("Loaded data")
+    return data
+# Earlier remote download locations, kept for reference; the data is now read
+# from local pickle files whose names embed the snapshot date below.
+# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
+# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"
+# Snapshot date; it is part of every data filename and is also shown on the page.
+dateval = "27-Jun-2023"
+feeds_link = "local_files/astro_ph_ga_feeds_upto_"+dateval+".pkl"
+embed_link = "local_files/astro_ph_ga_feeds_ada_embedding_"+dateval+".pkl"
+# Both pickles are loaded through the same cached loader.
+gal_feeds = get_feeds_data(feeds_link)
+# NOTE(review): `arxiv_ada_embeddings` is not referenced again in the visible
+# part of this file -- confirm whether this load is needed on this page.
+arxiv_ada_embeddings = get_feeds_data(embed_link)
+@st.cache_data
+def get_embedding_data(url):
+    # data = cp.load(urlopen(url))
+    with open(url, "rb") as fp:
+        data = pickle.load(fp)
+    st.sidebar.success("Fetched data from API!")
+    return data
+# Earlier remote location of the embedding file, kept for reference.
+# url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
+url = "local_files/astro_ph_ga_embedding_"+dateval+".pkl"
+# `e2d` is later indexed as e2d[0:,0] / e2d[0:,1], i.e. used as a 2-D array
+# whose first two columns are the plot coordinates (presumably one row per
+# paper, in the same order as the feed entries -- TODO confirm).
+e2d = get_embedding_data(url)
+# Older form, from when the pickle apparently held a 5-tuple.
+# e2d, _, _, _, _ = get_embedding_data(url)
+ctr = -1
+num_chunks = len(gal_feeds)
+ctr = -1
+num_chunks = len(gal_feeds)
+all_text, all_titles, all_arxivid, all_links, all_authors, all_pubdates, all_old = [], [], [], [], [], [], []
+for nc in range(num_chunks):
+    for i in range(len(gal_feeds[nc].entries)):
+        text = gal_feeds[nc].entries[i].summary
+        text = text.replace('\n', ' ')
+        text = text.replace('\\', '')
+        all_text.append(text)
+        all_titles.append(gal_feeds[nc].entries[i].title)
+        all_arxivid.append(gal_feeds[nc].entries[i].id.split('/')[-1][0:-2])
+        all_links.append(gal_feeds[nc].entries[i].links[1].href)
+        all_authors.append(gal_feeds[nc].entries[i].authors)
+        temp = gal_feeds[nc].entries[i].published
+        datetime_object = datetime.datetime.strptime(temp[0:10]+' '+temp[11:-1], '%Y-%m-%d %H:%M:%S')
+        all_pubdates.append(datetime_object)
+        all_old.append((datetime.datetime.now() - datetime_object).days)
+def make_time_excess_plot(midage = 0, tolage = 1, onlyolder = False):
+    bw = 0.05
+    sigma = 4.0
+    mask = (np.abs(np.array(all_old) - midage*365) < tolage*365)
+    if onlyolder == True:
+        mask2 = (np.array(all_old) > midage*365 + tolage*365/2)
+        a = np.histogram2d(e2d[0:,0][mask2], e2d[0:,1][mask2], bins=(np.arange(0,17,bw)), density=True)
+    else:
+        a = np.histogram2d(e2d[0:,0], e2d[0:,1], bins=(np.arange(0,17,bw)), density=True)
+    b = np.histogram2d(e2d[0:,0][mask], e2d[0:,1][mask], bins=(np.arange(0,17,bw)), density=True)
+    temp = b[0].T - a[0].T
+    temp = ndimage.gaussian_filter(temp, sigma, mode='nearest')
+    vscale = (np.nanpercentile(temp,99.5) - np.nanpercentile(temp,0.5))/2
+    plt.figure(figsize=(11,9))
+    plt.pcolor(a[1][0:-1] + (a[1][1]-a[1][0])/2, a[2][0:-1] + (a[2][1]-a[2][0])/2,
+               temp,cmap='bwr',
+               vmin=-vscale,vmax=vscale); plt.colorbar()
+    # plt.scatter(e2d[0:,0], e2d[0:,1],s=2,color='k',alpha=0.1)
+    plt.title('excess research over the last %.1f yrs centered at %.1f yrs' %(tolage, midage))
+    plt.axis([0,14,1,15])
+    plt.axis('off')
+    st.pyplot(fig)
+    return
+# Page body: a title, the data snapshot date, two sliders, and two plots.
+st.title('Research hotspots compared to full prior')
+st.markdown('[Includes papers up to: `'+dateval+'`]')
+# Both sliders are in years and are passed unchanged to the plotting function,
+# which converts them to days. Each ranges over 0-10; defaults are 0 and 1.
+midage = st.slider('Age', 0., 10., 0.)
+tolage = st.slider('Period width', 0., 10., 1.)
+# First plot: selected window versus all papers.
+st.markdown('Compare the research in a given time period to the full manifold.')
+make_time_excess_plot(midage, tolage, onlyolder = False)
+# Second plot: selected window versus only the older papers.
+st.markdown('Compare the research in a given time period to research older than that.')
+make_time_excess_plot(midage, tolage, onlyolder = True)

pages/{5_qa_sources_v1.py → 6_qa_sources_v1.py} RENAMED Viewed

File without changes