kiyer committed on
Commit
00bd3f7
1 Parent(s): e75fcb6

added time window applet

Browse files
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+
2
+ .DS_Store
app.py CHANGED
@@ -22,7 +22,9 @@ st.markdown(
22
  ### Tool summary:
23
  - `Paper search` looks for relevant papers given an arxiv id or a question.
24
  - `Arxiv embedding` shows the landscape of current galaxy evolution papers (astro-ph.GA)
25
- - `QA sources` brings it all together to give concise answers to questions with primary sources and relevant papers.
 
 
26
 
27
  This is not meant to be a replacement to existing tools like the
28
  [ADS](https://ui.adsabs.harvard.edu/),
 
22
  ### Tool summary:
23
  - `Paper search` looks for relevant papers given an arxiv id or a question.
24
  - `Arxiv embedding` shows the landscape of current galaxy evolution papers (astro-ph.GA)
25
+ - `Answering questions` brings it all together using RAG to give concise answers to questions with primary sources and relevant papers.
26
+ - `Author search` uses a list of authors for the papers to visualize trajectories of individual researchers or groups over time.
27
+ - `Research hotspots` uses paper ages to visualize excess research at a particular time in the past in different parts of the embedding space.
28
 
29
  This is not meant to be a replacement to existing tools like the
30
  [ADS](https://ui.adsabs.harvard.edu/),
pages/5_research_hotspots.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import datetime
3
+ import faiss
4
+ import streamlit as st
5
+ import feedparser
6
+ import urllib
7
+ import cloudpickle as cp
8
+ import pickle
9
+ from urllib.request import urlopen
10
+ from summa import summarizer
11
+ import numpy as np
12
+ import matplotlib.pyplot as plt
13
+ import requests
14
+ import json
15
+
16
+ from langchain_openai import AzureOpenAIEmbeddings
17
+ from langchain.llms import OpenAI
18
+ from langchain_openai import AzureChatOpenAI
19
+
20
# Azure OpenAI configuration. Endpoints and keys are read from Streamlit
# secrets; API type and version are fixed strings.
os.environ.update(
    {
        "OPENAI_API_TYPE": "azure",
        "AZURE_ENDPOINT": st.secrets["endpoint1"],
        "OPENAI_API_KEY": st.secrets["key1"],
        "OPENAI_API_VERSION": "2023-05-15",
    }
)

# Embedding client, pointed at the first endpoint.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=st.secrets["endpoint1"],
    model="text-embedding-ada-002",
    deployment="embedding",
)

# Chat-model client, using the second endpoint/key pair with temperature 0.
llm = AzureChatOpenAI(
    temperature=0.0,
    openai_api_type="azure",
    openai_api_key=st.secrets["key2"],
    azure_endpoint=st.secrets["endpoint2"],
    openai_api_version="2023-12-01-preview",
    deployment_name="gpt4_small",
)
39
+
40
+
41
@st.cache_data
def get_feeds_data(url):
    """Unpickle and return the object stored in the file at ``url``.

    Despite the parameter name, ``url`` is passed straight to ``open`` and is
    therefore treated as a filesystem path. After the file has been read, a
    "Loaded data" success message is posted to the sidebar.
    """
    # Previous remote variant: data = cp.load(urlopen(url))
    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # trusted files.
    with open(url, mode="rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    st.sidebar.success("Loaded data")
    return loaded
48
+
49
# Earlier remote locations of the pickles, kept for reference:
# feeds_link = "https://drive.google.com/uc?export=download&id=1-IPk1voyUM9VqnghwyVrM1dY6rFnn1S_"
# embed_link = "https://dl.dropboxusercontent.com/s/ob2betm29qrtb8v/astro_ph_ga_feeds_ada_embedding_18-Apr-2023.pkl?dl=0"

# Date tag embedded in the local pickle file names (also shown on the page).
dateval = "27-Jun-2023"
feeds_link = f"local_files/astro_ph_ga_feeds_upto_{dateval}.pkl"
embed_link = f"local_files/astro_ph_ga_feeds_ada_embedding_{dateval}.pkl"

# Both pickles are read through the same cached loader.
gal_feeds = get_feeds_data(feeds_link)
arxiv_ada_embeddings = get_feeds_data(embed_link)
56
+
57
@st.cache_data
def get_embedding_data(url):
    """Unpickle and return the object stored in the file at ``url``.

    ``url`` is handed directly to ``open``, so it is used as a filesystem
    path. Once the file has been read, a success message is posted to the
    sidebar.
    """
    # Previous remote variant: data = cp.load(urlopen(url))
    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # trusted files.
    with open(url, mode="rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    st.sidebar.success("Fetched data from API!")
    return loaded
64
+
65
# Earlier remote location, kept for reference:
# url = "https://drive.google.com/uc?export=download&id=1133tynMwsfdR1wxbkFLhbES3FwDWTPjP"
url = f"local_files/astro_ph_ga_embedding_{dateval}.pkl"

# The pickle is used as-is; an earlier variant unpacked a 5-tuple instead:
# e2d, _, _, _, _ = get_embedding_data(url)
e2d = get_embedding_data(url)
69
+
70
# Flatten the chunked feeds into parallel per-paper lists.
ctr = -1
num_chunks = len(gal_feeds)
all_text, all_titles, all_arxivid, all_links, all_authors, all_pubdates, all_old = [], [], [], [], [], [], []

# Take one reference time up front so every paper's age is measured against
# the same instant instead of a slightly later "now" per entry.
reference_time = datetime.datetime.now()

for nc in range(num_chunks):
    # gal_feeds is indexed by chunk number (its container type is not visible
    # here), so keep integer indexing rather than iterating it directly.
    for entry in gal_feeds[nc].entries:
        # Abstract text with newlines collapsed to spaces and backslashes removed.
        text = entry.summary.replace('\n', ' ').replace('\\', '')
        all_text.append(text)
        all_titles.append(entry.title)

        # The last path component of the entry id is "<arxiv id>v<version>".
        # Split on the final "v" so multi-digit versions (v10, v11, ...) are
        # stripped completely; a fixed two-character slice would leave "v".
        all_arxivid.append(entry.id.split('/')[-1].rsplit('v', 1)[0])

        all_links.append(entry.links[1].href)
        all_authors.append(entry.authors)

        # Parse the publication stamp: first 10 chars are the date, the time
        # follows a one-character separator, and the final character is dropped.
        # NOTE(review): the stamp looks like a UTC "...Z" string while
        # reference_time is naive local time - confirm whether the
        # few-hour offset matters for the day counts.
        temp = entry.published
        datetime_object = datetime.datetime.strptime(temp[0:10]+' '+temp[11:-1], '%Y-%m-%d %H:%M:%S')
        all_pubdates.append(datetime_object)

        # Paper age in whole days relative to the shared reference time.
        all_old.append((reference_time - datetime_object).days)
91
+
92
def make_time_excess_plot(midage=0, tolage=1, onlyolder=False):
    """Draw a smoothed map of where a time window is over/under-represented.

    Papers whose age (``all_old``, in days) lies within ``tolage`` years of
    ``midage`` years form the selected sample. Its normalised 2-D histogram
    over the embedding ``e2d`` is compared with a reference histogram, the
    difference is Gaussian-smoothed, and the result is shown in Streamlit
    with a diverging colour map.

    Parameters
    ----------
    midage : float
        Centre of the time window, in years before now.
    tolage : float
        Half-width of the time window, in years.
    onlyolder : bool
        If true, the reference sample is restricted to papers older than
        ``midage + tolage / 2`` years; otherwise all papers are used.

    Returns
    -------
    None
        The figure is rendered through ``st.pyplot``. If the selected window
        (or the older reference sample) contains no papers, a warning is
        shown instead of a plot.
    """
    # scipy.ndimage is not among the module-level imports, so import it here.
    from scipy import ndimage

    bw = 0.05       # histogram bin width in embedding units
    sigma = 4.0     # Gaussian smoothing width, in bins
    bin_edges = np.arange(0, 17, bw)

    ages = np.array(all_old)
    xs, ys = e2d[:, 0], e2d[:, 1]

    mask = np.abs(ages - midage*365) < tolage*365
    if onlyolder:
        ref_mask = ages > midage*365 + tolage*365/2
    else:
        ref_mask = np.ones(ages.shape, dtype=bool)

    # An empty sample would make the density-normalised histogram all-NaN;
    # tell the user instead of drawing a blank map.
    if not mask.any() or not ref_mask.any():
        st.warning('No papers fall in the selected time window.')
        return

    a = np.histogram2d(xs[ref_mask], ys[ref_mask], bins=bin_edges, density=True)
    b = np.histogram2d(xs[mask], ys[mask], bins=bin_edges, density=True)

    # Excess of the selected window over the reference, transposed so the
    # first histogram axis runs along the plot's x direction.
    temp = b[0].T - a[0].T
    temp = ndimage.gaussian_filter(temp, sigma, mode='nearest')

    # Symmetric colour range taken from the central 99% of smoothed values.
    vscale = (np.nanpercentile(temp, 99.5) - np.nanpercentile(temp, 0.5))/2

    # Bin centres = left edges + half a bin width.
    x_centers = a[1][0:-1] + (a[1][1]-a[1][0])/2
    y_centers = a[2][0:-1] + (a[2][1]-a[2][0])/2

    # Keep a handle on the figure so it can be passed to Streamlit and closed.
    fig = plt.figure(figsize=(11,9))
    plt.pcolor(x_centers, y_centers, temp, cmap='bwr', vmin=-vscale, vmax=vscale)
    plt.colorbar()
    # plt.scatter(e2d[0:,0], e2d[0:,1],s=2,color='k',alpha=0.1)
    plt.title('excess research over the last %.1f yrs centered at %.1f yrs' %(tolage, midage))
    plt.axis([0,14,1,15])
    plt.axis('off')
    st.pyplot(fig)

    # Release the figure so reruns of the page do not accumulate open figures.
    plt.close(fig)
    return
118
+
119
# Page header, including the date tag of the loaded data set.
st.title('Research hotspots compared to full prior')
st.markdown(f'[Includes papers up to: `{dateval}`]')

# Time-window controls; both values are passed to the plot as years.
midage = st.slider('Age', min_value=0., max_value=10., value=0.)
tolage = st.slider('Period width', min_value=0., max_value=10., value=1.)

# Two views of the same window: against every paper, then against only
# older papers.
comparisons = (
    ('Compare the research in a given time period to the full manifold.', False),
    ('Compare the research in a given time period to research older than that.', True),
)
for caption, older_only in comparisons:
    st.markdown(caption)
    make_time_excess_plot(midage, tolage, onlyolder=older_only)
pages/{5_qa_sources_v1.py → 6_qa_sources_v1.py} RENAMED
File without changes