cmulgy commited on
Commit
52d2ab4
·
1 Parent(s): 495fe69

Add application file

Browse files
Files changed (4) hide show
  1. app.py +202 -0
  2. arxiv_agent.py +530 -0
  3. requirements.txt +7 -0
  4. utils.py +740 -0
app.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import time
3
+ from arxiv_agent import *
4
+ agent = ArxivAgent()
5
+
6
def set_profile(name):
    """Validate the researcher name and return the generated profile.

    The name is split on single spaces; every part must be non-empty and
    must not start with a lowercase letter. On a malformed name an info
    toast is shown and ``None`` is returned; otherwise the result of
    ``agent.get_profile(name)`` is returned.
    """
    for part in name.split(" "):
        # An empty part (e.g. from an empty name or a doubled space) and a
        # lowercase initial are rejected with the same message.
        if len(part) == 0 or part[0].islower():
            gr.Info("Please input standard name format.")
            return None

    return agent.get_profile(name)
22
+
23
+
24
def edit_profile(profile, author_name):
    """Persist an edited profile for *author_name* and echo it back.

    The profile text is forwarded to ``agent.edit_profile``; the agent's
    status message is not used. A confirmation toast is shown and the same
    profile text is returned.
    """
    agent.edit_profile(profile, author_name)
    gr.Info("Edit profile successfully!")
    return profile
37
+
38
def confirm_date(date, profile_input):
    """Fetch the topic trend, trend papers and ideas for a time range.

    Args:
        date: the selected time range, passed through to
            ``agent.select_date``.
        profile_input: profile text. An empty or missing (``None``)
            profile is forwarded as ``None``.

    Returns:
        A tuple ``(topic[0], papers, idea[0])`` built from the values
        returned by ``agent.select_date``.
    """
    # `len(None)` used to raise TypeError when the profile box held no value;
    # treat None the same way as the empty string.
    profile = profile_input if profile_input else None
    topic, papers, idea = agent.select_date(date, profile)
    return topic[0], papers, idea[0]
46
+
47
def send_text(query, profile_input):
    """Send a query to the agent and return the first item of each answer.

    Args:
        query: the user's question.
        profile_input: profile text. A profile that is ``None`` or at most
            one character long is forwarded to the agent as ``None``.

    Returns:
        A tuple ``(Ans1[0], Ans2[0])`` from the pair returned by
        ``agent.response``.
    """
    # Guard against None before calling len(); the original raised TypeError
    # when the profile box held no value.
    has_profile = profile_input is not None and len(profile_input) > 1
    Ans1, Ans2 = agent.response(query, profile_input if has_profile else None)

    return Ans1[0], Ans2[0]
55
+
56
+
57
+
58
def send_comment(comment):
    """Store a user comment through the agent and thank the user.

    Returns the status message produced by ``agent.update_comment``.
    """
    message = agent.update_comment(comment)
    gr.Info("Thank you for your comment!")

    # `update_comment` returns a plain string; indexing it with [0] (as the
    # list-returning agent calls are) yielded only its first character.
    return message
64
+
65
+
66
+
67
def respond(message, chat_history, profile):
    """Append the user message and the two candidate answers to the chat.

    Two rows are added to *chat_history*: ``(message, None)`` followed by
    ``(first_answer, second_answer)``. Returns an empty string together
    with the updated history.
    """
    first_answer, second_answer = send_text(message, profile)

    chat_history.extend([
        (message, None),
        (first_answer, second_answer),
    ])

    time.sleep(2)

    return "", chat_history
79
+
80
+
81
+
82
+ with gr.Blocks(css="""#chat_container {height: 820px; width: 1000px; margin-left: auto; margin-right: auto;}
83
+ #chatbot {height: 600px; overflow: auto;}
84
+ #create_container {height: 750px; margin-left: 0px; margin-right: 0px;}
85
+ #tokenizer_renderer span {white-space: pre-wrap}
86
+ """,
87
+ theme="bethecloud/storj_theme",title="Arxiv Copilot") as app:
88
+ with gr.Row():
89
+ with gr.Column(scale=2):
90
+ gr.Image(
91
+ "images/arxiv_copilot.PNG", elem_id="banner-image", show_label=False
92
+ )
93
+ with gr.Column(scale=5):
94
+ gr.Markdown(
95
+ """# Arxiv Copilot
96
+ ➡️️ **Goals**: Arxiv Copilot aims to provide personalized academic service!
97
+
98
+ ✨ **Guidance**:
99
+
100
+ Step (1) Enter researcher name and generate research profile in "Set your profile!"🧑‍💼
101
+
102
+ Step (2) Select time range and get relevant topic trend and ideas in "Get research trend and ideas!"💡
103
+
104
+ Step (3) Chat with Arxiv Copilot and choose the better response from two answers in "Chat with Arxiv Copilot!"; Here we appreciate any further feedback 🎉
105
+
106
+ ⚠️ **Limitations**: We mainly provide research service related to machine learning field now, other fields will be added in the future.
107
+
108
+ 🗄️ **Disclaimer**: User behavior data will be collected for the pure research purpose. If you use this demo, you may implicitly agree to these terms.
109
+ """
110
+ )
111
+
112
+
113
+ # gr.Markdown("Provide research service using this demo.")
114
+ with gr.Accordion("Set your profile!", open=True):
115
+ gr.Markdown(
116
+ """
117
+ You can input your name in standard format to get your profile from arxiv here. Standard examples: Yoshua Bengio. Wrong examples: yoshua bengio, Yoshua bengio, yoshua Bengio.
118
+ """
119
+ )
120
+ with gr.Row():
121
+ with gr.Column(scale=2, min_width=300):
122
+ name_input = gr.Textbox(label="Input Your Name")
123
+ set_button = gr.Button("Set Profile")
124
+ profile_text = gr.Textbox(label="Generated Profile", interactive=True, scale=7, lines=5, max_lines=5)
125
+ edit_button = gr.Button("Edit Profile", scale=1)
126
+ set_button.click(set_profile, inputs=name_input, outputs=[profile_text])
127
+ edit_button.click(edit_profile, inputs=[profile_text, name_input], outputs=[profile_text])
128
+
129
+ with gr.Accordion("Get research trend and ideas!", open=True):
130
+ gr.Markdown(
131
+ """
132
+ We will give you personalized research trend and ideas if you have set your profile. Otherwise, general research trend will be provided.
133
+ """
134
+ )
135
+ with gr.Column():
136
+ with gr.Row():
137
+ with gr.Column(scale=2, min_width=300):
138
+ # gr.Dropdown(
139
+ # ["day", "week", "bird"], label="Select time range", info="Will add more animals later!"
140
+ # ),
141
+ date_choice = gr.Radio(["day", "week", "all"], label="Select Time Range", value="day")
142
+ date_button = gr.Button("Confirm")
143
+ papers_text = gr.Textbox(label="Trend Papers", interactive=False, scale=8, lines=5, max_lines=5)
144
+
145
+ with gr.Row():
146
+ topic_text = gr.Textbox(label="Topic Trend", interactive=False, scale=5, lines=7, max_lines=10)
147
+
148
+ ideas_text = gr.Textbox(label="Ideas Related to Topic Trend", interactive=False, scale=5, lines=7, max_lines=10)
149
+
150
+ date_button.click(confirm_date, inputs=[date_choice, profile_text], outputs=[topic_text, papers_text, ideas_text])
151
+
152
+ with gr.Accordion("Chat with Arxiv Copilot!", open=True):
153
+ gr.Markdown(
154
+ """
155
+ Each time we will give you two answers. If you prefer the second answer, you can click 👍 below the second answer and the first answer will be removed. If you click 👎, the second answer will be removed.
156
+ """
157
+ )
158
+ with gr.Column():
159
+ chatbot = gr.Chatbot()
160
+ with gr.Row():
161
+ msg = gr.Textbox(placeholder="Message Arxiv Copilot here...", scale=9, show_label=False)
162
+ send_button = gr.Button("Send",scale=1) # Adding a Send button
163
+ clear = gr.ClearButton([msg, chatbot],scale=1)
164
+
165
+
166
+
167
+
168
def print_like_dislike(x: gr.LikeData, chat_history):
    """Resolve a like/dislike click on an answer pair and record feedback.

    The clicked row is expected to hold the two candidate answers and the
    row before it the question with an empty (``None``) answer slot. On a
    like the second answer is kept, on a dislike the first one; the kept
    answer is written into the question row, the feedback is passed to
    ``agent.update_feedback_thought`` and the answer-pair row is removed.
    If the preceding row already has an answer, only an info toast is shown.

    Returns the (possibly modified) chat history.
    """
    cur_index = x.index[0]

    if cur_index < 1 or chat_history[cur_index - 1][1] is not None:
        gr.Info("You have gave your feedback. You can ask more questions.")
        return chat_history

    question_row = chat_history[cur_index - 1]
    answer_a = chat_history[cur_index][0]
    answer_b = chat_history[cur_index][1]

    if x.liked:
        question_row[1] = answer_b
        agent.update_feedback_thought(question_row[0], answer_a, answer_b, 0, 1)
    else:
        agent.update_feedback_thought(question_row[0], answer_a, answer_b, 1, 0)
        question_row[1] = answer_a

    # Delete by position: list.remove() drops the first row that compares
    # equal, which is not necessarily the row that was clicked.
    del chat_history[cur_index]
    return chat_history
184
+
185
+
186
+
187
+
188
+ msg.submit(respond, [msg, chatbot, profile_text], [msg, chatbot]) # Set up the action for the Send button
189
+ send_button.click(respond, inputs=[msg, chatbot, profile_text], outputs=[msg, chatbot])
190
+ chatbot.like(print_like_dislike, [chatbot], [chatbot])
191
+
192
+
193
+ with gr.Row():
194
+ comment_input = gr.Textbox(label="With Arxiv Copilot, how much time do you save to obtain the same amount of information?", scale=9, lines=3)
195
+ comment_button = gr.Button(value="Comment", scale=1)
196
+
197
+
198
+ comment_button.click(send_comment, inputs=comment_input, outputs=None)
199
+
200
+
201
+
202
+ app.launch()
arxiv_agent.py ADDED
@@ -0,0 +1,530 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import json
4
+ import time
5
+ import datetime
6
+ from xml.etree import ElementTree
7
+
8
+ import requests
9
+ import warnings
10
+ warnings.filterwarnings("ignore")
11
+ os.environ['KMP_DUPLICATE_LIB_OK']='True'
12
+ from utils import *
13
+ import thread6
14
+ MAX_DAILY_PAPER = 200
15
+ DAY_TIME = 60 * 60 * 24
16
+
17
def feedback_thought(input_ls): # preload
    """Record answer feedback and, if novel, store the preferred answer as a thought.

    Args:
        input_ls: sequence ``(agent, query, ansA, ansB, feedbackA, feedbackB)``.
            The answer whose feedback flag equals 1 for A (otherwise B) is
            treated as the preferred one.

    Side effects:
        * Updates ``agent.feedback`` for ``agent.today`` and rewrites
          ``agent.feedback_path``.
        * If ``response_verify`` does not answer with 'idk' and every stored
          thought embedding has similarity below 0.85 to the new knowledge,
          appends ``query + preferred answer`` to ``agent.thought`` and its
          embedding to ``agent.thought_embedding``, then rewrites both the
          thought JSON file and the thought-embedding pickle.
    """
    agent, query, ansA, ansB, feedbackA, feedbackB = input_ls
    date = agent.today

    feedback_log = agent.feedback
    thought_log = agent.thought

    # Reuse an existing entry for this date/query, creating levels as needed.
    entry = feedback_log.setdefault(date, {}).setdefault(query, {})
    thought_log.setdefault(date, [])

    entry["answerA"] = ansA
    entry["feedbackA"] = feedbackA
    entry["answerB"] = ansB
    entry["feedbackB"] = feedbackB
    with open(agent.feedback_path, "w") as f:
        json.dump(feedback_log, f)

    preferred_ans = ansA if feedbackA == 1 else ansB
    new_knowledge = response_verify([query], [preferred_ans], verify=False)

    if 'idk' in new_knowledge[0]:
        return

    new_knowledge_embedding = get_bert_embedding(new_knowledge)
    thought_embedding_all = []
    for stored in agent.thought_embedding.values():
        thought_embedding_all.extend(stored)

    similarity = calculate_similarity(thought_embedding_all, new_knowledge_embedding[0])
    similarity_values = [s.item() for s in similarity]  # Convert each tensor to a scalar

    if all(s < 0.85 for s in similarity_values):
        tem_thought = query + preferred_ans
        thought_log[date].append(tem_thought)

        # Add the date key instead of replacing the whole mapping: the
        # original reset `agent.thought_embedding` to {} for a new date,
        # discarding the embeddings of every earlier day.
        agent.thought_embedding.setdefault(date, []).append(
            get_bert_embedding([tem_thought])[0]
        )

        with open(agent.thought_path, "w") as f:
            json.dump(thought_log, f)

        with open(agent.thought_embedding_path, "wb") as f:
            pickle.dump(agent.thought_embedding, f)
82
+
83
def dailyDownload(agent_ls):
    """Background loop that refreshes the agent's paper cache once per DAY_TIME.

    Args:
        agent_ls: a sequence whose first element is the agent to refresh.

    Each cycle sleeps ``DAY_TIME`` seconds, fetches the newest papers for the
    "Machine Learning" keyword, merges them into the agent's dataset file and
    embedding pickle, and replaces ``agent.paper`` / ``agent.paper_embedding``
    with the merged results. The loop never returns.
    """
    agent = agent_ls[0]
    while True:
        time.sleep(DAY_TIME)

        keywords = {"Machine Learning": "Machine Learning"}
        fetched = []
        for topic, keyword in keywords.items():
            data, agent.newest_day = get_daily_papers(topic, query = keyword, max_results = MAX_DAILY_PAPER)
            fetched.append(data)

        merged_papers = update_json_file(agent.dataset_path, fetched)

        # Embed the abstracts of every fetched day, keyed by formatted date.
        embeddings_by_day = {}
        for batch in fetched:
            for day, day_entry in batch.items():
                embeddings_by_day[day.strftime("%m/%d/%Y")] = get_bert_embedding(day_entry['abstract'])

        merged_embeddings = update_pickle_file(agent.embedding_path, embeddings_by_day)
        agent.paper = merged_papers
        agent.paper_embedding = merged_embeddings
        print("Today is " + agent.newest_day.strftime("%m/%d/%Y"))
112
+
113
+
114
+
115
+
116
+
117
class ArxivAgent:
    """Research assistant state: cached papers, thoughts, feedback and profiles.

    All persistent data lives in files under ``dataset/`` relative to the
    current working directory. Embedding, retrieval and LLM helpers
    (``get_bert_embedding``, ``summarize_research_field`` and others) are
    module-level names imported elsewhere in this file.
    """

    def __init__(self):
        """Set file paths, load persisted state, download papers and start the daily refresh."""
        self.dataset_path = "./dataset/paper.json"
        self.thought_path = "./dataset/thought.json"

        self.embedding_path = "./dataset/paper_embedding.pkl"
        self.thought_embedding_path = './dataset/thought_embedding.pkl'

        self.feedback_path = 'dataset/feedback.json'
        self.today = datetime.datetime.now().strftime("%m/%d/%Y")

        self.newest_day = ""
        self.load_thought()
        self.load_feedback()
        self.download()
        try:
            thread6.run_threaded(dailyDownload, [self])
        except Exception:
            print("Error: unable to start thread")

    @staticmethod
    def _load_json_or_empty(filename):
        """Return the parsed JSON content of *filename*, or ``{}`` if it is missing or empty.

        A missing file is created empty, matching what each loader did
        individually before this helper was factored out.
        """
        if os.path.exists(filename):
            with open(filename, "rb") as f:
                content = f.read()
            return json.loads(content) if content else {}
        with open(filename, mode='w', encoding='utf-8'):
            pass
        return {}

    def edit_profile(self, profile, author_name):
        """Store *profile* under *author_name* in ``dataset/profile.json``.

        Returns a fixed confirmation string.
        """
        filename = 'dataset/profile.json'
        # Tolerate a missing or empty profile file; the original `json.load`
        # raised in both cases.
        data = self._load_json_or_empty(filename)
        data[author_name] = profile
        with open(filename, "w") as f:
            json.dump(data, f)
        return "Successfully edit profile!"

    def get_profile(self, author_name):
        """Return the profile for *author_name*, or ``None`` for an empty name."""
        if author_name == "":
            return None
        return self.get_arxiv_data_by_author(author_name)

    def select_date(self, method, profile_input):
        """Summarize the research trend and generate ideas for a time range.

        Args:
            method: ``"day"`` (newest day only), ``"week"`` (newest day and the
                six days before it); any other value selects all stored papers.
            profile_input: profile passed to ``summarize_research_field``.

        Returns:
            ``(trend, reference, idea)`` where ``reference`` is the formatted
            list produced by ``papertitleAndLink``.

        Side effects:
            Appends ``trend[0]`` and ``idea[0]`` (with embeddings) to the
            thoughts stored under the last key of ``self.paper`` and rewrites
            the thought JSON file and thought-embedding pickle.
        """
        today = self.newest_day

        if method == "day":
            days = [today]
        elif method == "week":
            days = [today - datetime.timedelta(days=i) for i in range(7)]
        else:
            days = None

        if days is None:
            paper_by_date = self.paper
            chunk_embedding_date = self.paper_embedding
        else:
            paper_by_date = {}
            chunk_embedding_date = {}
            for day in days:
                str_day = day.strftime("%m/%d/%Y")
                if str_day in self.paper:
                    paper_by_date[str_day] = self.paper[str_day]
                    chunk_embedding_date[str_day] = self.paper_embedding[str_day]

        trend, paper_link = summarize_research_field(
            profile_input, "Machine Learning", paper_by_date, chunk_embedding_date
        )  # trend
        reference = papertitleAndLink(paper_link)
        idea = generate_ideas(trend)  # idea

        # New thoughts are filed under the last key of the paper store.
        key_update = list(self.paper.keys())[-1]
        if key_update not in self.thought:
            self.thought[key_update] = []
        if key_update not in self.thought_embedding:
            self.thought_embedding[key_update] = []

        self.thought[key_update].append(trend[0])
        self.thought_embedding[key_update].append(get_bert_embedding([trend])[0])
        self.thought[key_update].append(idea[0])
        self.thought_embedding[key_update].append(get_bert_embedding([idea])[0])

        with open(self.thought_path, "w") as f_:
            json.dump(self.thought, f_)

        with open(self.thought_embedding_path, "wb") as f:
            pickle.dump(self.thought_embedding, f)

        return trend, reference, idea

    def response(self, data, profile_input):
        """Answer *data* twice: once with papers plus thoughts as context, once with papers only.

        Returns ``(answer_l, answer_l_org)`` as produced by
        ``get_response_through_LLM_answer`` for the two contexts.
        """
        query = [data]
        query_embedding = get_bert_embedding(query)

        retrieve_text, retrieve_text_org = self.generate_pair_retrieve_text(query_embedding)

        answer_l = get_response_through_LLM_answer(query, [retrieve_text], profile_input)
        answer_l_org = get_response_through_LLM_answer(query, [retrieve_text_org], profile_input)

        return answer_l, answer_l_org

    def generate_pair_retrieve_text(self, query_embedding):
        """Retrieve context text for a query from two candidate pools.

        The first pool holds all paper abstracts plus every stored thought
        that has embeddings; the second holds the paper abstracts only. For
        each pool ``neiborhood_search`` is asked for 10 neighbours and the
        matching texts are concatenated.

        Returns ``(retrieve_text, retrieve_text_org)``.
        """
        dataset = self.paper
        thought = self.thought

        text_org_chunk_l = []
        chunks_org_embedding_text_all = []
        for k in dataset.keys():
            text_org_chunk_l.extend(dataset[k]['abstract'])
            chunks_org_embedding_text_all.extend(self.paper_embedding[k])

        # The augmented pool starts as a copy of the paper-only pool.
        text_chunk_l = list(text_org_chunk_l)
        chunks_embedding_text_all = list(chunks_org_embedding_text_all)
        for k in thought.keys():
            if k in self.thought_embedding.keys():
                text_chunk_l.extend(thought[k])
                chunks_embedding_text_all.extend(self.thought_embedding[k])

        neib_all = neiborhood_search(chunks_embedding_text_all, query_embedding, num=10)
        neib_all = neib_all.reshape(-1)
        retrieve_text = ''.join([text_chunk_l[i] for i in neib_all])

        neib_all = neiborhood_search(chunks_org_embedding_text_all, query_embedding, num=10)
        neib_all = neib_all.reshape(-1)
        retrieve_text_org = ''.join([text_org_chunk_l[i] for i in neib_all])

        return retrieve_text, retrieve_text_org

    def download(self):
        """Fetch the newest papers and merge them into the local paper and embedding stores.

        Sets ``self.newest_day``, ``self.paper`` and ``self.paper_embedding``.
        Missing store files are created empty before merging.
        """
        data_collector = []
        keywords = dict()
        keywords["Machine Learning"] = "Machine Learning"

        for topic, keyword in keywords.items():
            data, self.newest_day = get_daily_papers(topic, query = keyword, max_results = MAX_DAILY_PAPER)
            data_collector.append(data)

        json_file = self.dataset_path
        if not os.path.exists(json_file):
            with open(json_file, 'w'):
                print("create " + json_file)

        update_file = update_json_file(json_file, data_collector)

        if not os.path.exists(self.embedding_path):
            with open(self.embedding_path, 'wb'):
                print("create " + self.embedding_path)

        time_chunks_embed = {}
        for data in data_collector:
            for date in data.keys():
                papers = data[date]['abstract']
                time_chunks_embed[date.strftime("%m/%d/%Y")] = get_bert_embedding(papers)

        update_paper_file = update_pickle_file(self.embedding_path, time_chunks_embed)
        self.paper = update_file
        self.paper_embedding = update_paper_file

    def load_feedback(self):
        """Load ``self.feedback`` from ``self.feedback_path`` (``{}`` if missing or empty)."""
        self.feedback = self._load_json_or_empty(self.feedback_path)

    def load_thought(self):
        """Load ``self.thought`` and ``self.thought_embedding`` from disk.

        Either store defaults to ``{}`` when its file is missing or empty; a
        missing file is created empty.
        """
        filename_emb = self.thought_embedding_path

        thoughts = self._load_json_or_empty(self.thought_path)

        if os.path.exists(filename_emb):
            with open(filename_emb, "rb") as f:
                content = f.read()
            embeddings = pickle.loads(content) if content else {}
        else:
            with open(filename_emb, mode='w', encoding='utf-8'):
                pass
            embeddings = {}

        self.thought = thoughts
        self.thought_embedding = embeddings

    def update_feedback_thought(self, query, ansA, ansB, feedbackA, feedbackB):
        """Hand the feedback to ``feedback_thought`` via ``thread6.run_threaded``."""
        try:
            thread6.run_threaded(feedback_thought, [self, query, ansA, ansB, feedbackA, feedbackB])
        except Exception:
            print("Error: unable to start thread")

    def update_comment(self, comment):
        """Append *comment* to today's list in ``dataset/comment.json``.

        Returns a fixed thank-you string.
        """
        date = datetime.datetime.now().strftime("%m/%d/%Y")

        filename = 'dataset/comment.json'
        json_data = self._load_json_or_empty(filename)

        if date not in json_data:
            json_data[date] = [comment]
        else:
            json_data[date].append(comment)

        with open(filename, "w") as f:
            json.dump(json_data, f)
        return "Thanks for your comment!"

    def get_arxiv_data_by_author(self, author_name):
        """Return a research profile for *author_name*, generating it on first use.

        A profile cached in ``dataset/profile.json`` is returned directly.
        Otherwise the arXiv export API is queried for the author, the entries
        whose author list contains *author_name* exactly are collected, the
        first 10 of them are summarized with ``summarize_research_direction``,
        and the result is cached and returned.

        Returns ``None`` when the arXiv request does not answer with HTTP 200.
        """
        filename = 'dataset/profile.json'
        json_data = self._load_json_or_empty(filename)

        if author_name in json_data:
            return json_data[author_name]

        author_query = author_name.replace(" ", "+")
        url = f"http://export.arxiv.org/api/query?search_query=au:{author_query}&start=0&max_results=300"  # Adjust max_results if needed

        response = requests.get(url)
        if response.status_code != 200:
            return None

        atom = '{http://www.w3.org/2005/Atom}'
        root = ElementTree.fromstring(response.content)

        papers_list = []
        for entry in root.findall(atom + 'entry'):
            title = entry.find(atom + 'title').text.strip()
            published = entry.find(atom + 'published').text.strip()
            abstract = entry.find(atom + 'summary').text.strip()
            authors = [author.find(atom + 'name').text for author in entry.findall(atom + 'author')]
            link = entry.find(atom + 'id').text.strip()  # Get the paper link

            # Keep only entries where the requested name matches an author exactly.
            if author_name in authors:
                coauthors = [author for author in authors if author != author_name]
                papers_list.append({
                    "date": published,
                    "Title & Abstract": f"{title}; {abstract}",
                    "coauthors": ", ".join(coauthors),
                    "link": link,  # Add the paper link to the dictionary
                })

        # NOTE(review): the original also sampled up to two papers per year
        # when more than 40 matched, but appended them after the entries
        # above, so the [:10] trim below always discarded them. That dead
        # branch was removed; confirm whether year-balanced sampling was
        # the intent.
        papers_list = papers_list[:10]

        personal_info = "; ".join([f"{details['Title & Abstract']}" for details in papers_list])
        info = summarize_research_direction(personal_info)
        json_data[author_name] = info
        with open(filename, "w") as f:
            json.dump(json_data, f)
        return json_data[author_name]
525
+
526
+
527
+
528
+
529
+
530
+
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ faiss-cpu
3
+ transformers
4
+ arxiv
5
+ requests
6
+ openai==0.28
7
+ thread6
utils.py ADDED
@@ -0,0 +1,740 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import openai
4
+ import faiss
5
+ from transformers import BertTokenizer, BertModel
6
+ import torch
7
+ import json
8
+ import time
9
+ import warnings
10
+ import copy
11
+ import pickle
12
+ import random
13
+ import torch.nn.functional as F
14
+
15
+ seed_value = 42
16
+ random.seed(seed_value)
17
+ np.random.seed(seed_value)
18
+ torch.manual_seed(seed_value)
19
+
20
+ warnings.filterwarnings("ignore")
21
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
22
+
23
+
24
+ KEY = os.environ['API_KEY']
25
+ openai.api_base = 'https://api.together.xyz'
26
+ llm_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
27
+
28
+ tokenizer = BertTokenizer.from_pretrained('facebook/contriever')
29
+ model = BertModel.from_pretrained('facebook/contriever').to(torch.device("cpu"))
30
+
31
+ import datetime
32
+ import json
33
+ import arxiv
34
+
35
def summarize_research_direction(personal_info):
    """Ask the LLM to condense a researcher's personas into one first-person persona.

    Args:
        personal_info: text inserted into the prompt as the list of personas.

    Returns:
        The content of the first choice of the chat completion.

    If the first request raises, the function waits 20 seconds and retries
    once; an error from the retry propagates to the caller.
    """
    prompt_qa = (
        "Based on the list of the researcher's first person persona from different times, please write a comprehensive first person persona. Focus more on more rescent personas. Be concise and clear (around 300 words)."
        "Here are the personas from different times: {peronalinfo}"
    )

    openai.api_key = KEY
    prompt = prompt_qa.format_map({'peronalinfo': personal_info})

    def _request():
        return openai.ChatCompletion.create(
            model=llm_model,
            messages=[
                {"role": "user", "content": prompt}], temperature=0.6, seed=42, top_p=0)

    try:
        completion = _request()
    except Exception:
        # A bare `except:` would also trap KeyboardInterrupt/SystemExit and
        # then sleep; only ordinary errors should trigger the retry.
        time.sleep(20)
        completion = _request()
    return completion.choices[0].message["content"]
58
+
59
def get_authors(authors, first_author = False):
    """Format an author list.

    Returns all authors joined by ", " when *first_author* compares equal to
    ``False``; otherwise returns the first element of *authors* unchanged.
    """
    if first_author != False:
        return authors[0]
    return ", ".join(str(person) for person in authors)
66
def sort_papers(papers):
    """Return a new dict with the entries of *papers* ordered by key, largest first."""
    return {key: papers[key] for key in sorted(papers.keys(), reverse=True)}
73
+
74
def get_daily_papers(topic,query="slam", max_results=300):
    """Collect the papers published on the newest day returned by an arXiv search.

    @param topic: str (not used by the search itself)
    @param query: str, the arXiv search query
    @param max_results: int, upper bound passed to the search
    @return (data, newest_day): ``data`` maps a publication date to
        ``{'abstract': [...], 'info': [...]}`` with "title: abstract" and
        "title: url" strings; ``newest_day`` is the publication date of the
        first result, or None when the search yields nothing.
    """
    search_engine = arxiv.Search(
        query = query,
        max_results = max_results,
        sort_by = arxiv.SortCriterion.SubmittedDate
    )

    collected = {}
    newest_day = None
    for result in search_engine.results():
        paper_title = result.title
        paper_url = result.entry_id
        paper_abstract = result.summary.replace("\n"," ")
        publish_time = result.published.date()

        if newest_day is None:
            newest_day = publish_time
        elif newest_day != publish_time:
            # Stop at the first result from a different day.
            break

        day_entry = collected.setdefault(publish_time, {'abstract': [], 'info': []})
        day_entry['abstract'].append(paper_title+ ": "+paper_abstract)
        day_entry['info'].append(paper_title+": "+paper_url)

    return collected, newest_day
126
def papertitleAndLink(dataset):
    """Number the strings in *dataset* from 1 and join them with ";\\n".

    Each entry becomes ``"[<n>] <entry>"``.  An empty *dataset* yields an
    empty string.
    """
    numbered = [
        "[%d] " % position + title
        for position, title in enumerate(dataset, start=1)
    ]
    return ';\n'.join(numbered)
140
+
141
def paperinfo(dataset):
    """Join every entry of every ``dataset[key]['abstract']`` list with "; ".

    Keys are visited in the mapping's iteration order.
    """
    pieces = []
    for key in dataset.keys():
        for paper in dataset[key]['abstract']:
            pieces.append(format(paper))
    return '; '.join(pieces)
145
+
146
def generate_ideas(trend):
    """Ask the chat model for 3-5 research ideas about a summarized trend.

    @param trend: text substituted for ``{trend}`` in the prompt
    @return list containing one element: the model's reply text

    The request is attempted once; if it raises, the function sleeps 20
    seconds and retries a single time.  A failure of the retry propagates.
    Relies on the module-level ``KEY`` and ``llm_model`` defined outside
    this block.
    """
    prompt_template = (
        "Here is a high-level summarized trend of a research field: {trend}."
        "How do you view this field? Do you have any novel ideas or insights?"
        "Please give me 3 to 5 novel ideas and insights in bullet points. Each bullet points should be concise, containing 2 or 3 sentences."
    )

    openai.api_key = KEY
    prompt = prompt_template.format_map({'trend': trend})
    request = {
        "model": llm_model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.6,
        "seed": 42,
        "top_p": 0,
    }
    try:
        completion = openai.ChatCompletion.create(**request)
    except Exception:
        # Narrower than a bare ``except`` so Ctrl-C / SystemExit are not
        # swallowed and delayed by the back-off sleep.
        time.sleep(20)
        completion = openai.ChatCompletion.create(**request)
    return [completion.choices[0].message["content"]]
178
+
179
def summarize_research_field(profile, keywords, dataset,data_embedding):
    """Retrieve the papers closest to a profile/keyword query and summarize them.

    @param profile: profile text, or ``None`` to drive the prompt from
        *keywords* instead
    @param keywords: keyword text; only used when *profile* is ``None``
    @param dataset: mapping ``key -> {'info': [...], 'abstract': [...]}``
    @param data_embedding: mapping with the same keys as *dataset*; each
        value is a sequence of embeddings — presumably aligned one-to-one
        with ``dataset[key]['abstract']`` (TODO confirm against the caller)
    @return (content_l, retrieve_paper): a one-element list with the model's
        reply, and the ``'info'`` strings of the retrieved papers

    The chat request is retried once after a 20 second sleep if the first
    attempt raises; a failure of the retry propagates.
    """
    if profile is None:
        prompt_template = (
            "Given the keywords, some recent paper titles and abstracts. Could you summarize no more than 10 top keywords of high level research backgounds and trends in this field."
            "Here are the keywords: {keywords}"
            "Here are the retrieved paper abstracts: {papers}"
        )
        query_template = (
            "Given the keywords, retrieve some recent paper titles and abstracts can represent research trends in this field."
            "Here are the keywords: {keywords}"
        )
        fields = {'keywords': keywords}
    else:
        prompt_template = (
            "Given the profile of me, some recent paper titles and abstracts. Could you summarize no more than 10 top keywords of high level research backgounds and trends in this field (related to my profile)."
            "Here is my profile: {profile}"
            "Here are the retrieved paper abstracts: {papers}"
        )
        query_template = (
            "Given the profile of me, retrieve some recent paper titles and abstracts can represent research trends related to my profile."
            "Here is my profile: {profile}"
        )
        fields = {'profile': profile}

    openai.api_key = KEY

    query = query_template.format_map(fields)
    query_embedding = get_bert_embedding([query])

    # Flatten the per-key lists into three parallel sequences.
    titles = []
    abstracts = []
    embeddings = []
    for key in dataset.keys():
        titles.extend(dataset[key]['info'])
        abstracts.extend(dataset[key]['abstract'])
        embeddings.extend(data_embedding[key])

    num_chunk = 10
    neighbours = neiborhood_search(embeddings, query_embedding, num_chunk)
    neighbours = neighbours.reshape(-1)

    context = []
    retrieve_paper = []
    for idx in neighbours:
        # FAISS fills missing neighbours with -1 when the corpus holds fewer
        # than ``num_chunk`` vectors; indexing a list with -1 would silently
        # pick the last paper, so those slots are skipped.
        if idx < 0:
            continue
        context.append(abstracts[idx])
        retrieve_paper.append(titles[idx])

    fields['papers'] = '; '.join(context)
    prompt = prompt_template.format_map(fields)

    request = {
        "model": llm_model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 512,
    }
    try:
        completion = openai.ChatCompletion.create(**request)
    except Exception:
        # Narrower than a bare ``except`` so Ctrl-C / SystemExit are not
        # swallowed and delayed by the back-off sleep.
        time.sleep(20)
        completion = openai.ChatCompletion.create(**request)
    content_l = [completion.choices[0].message["content"]]
    return content_l, retrieve_paper
279
def update_json_file(filename,data_all):
    """Merge freshly fetched papers into the JSON store at *filename*.

    @param filename: path of an existing JSON file (an empty file is treated
        as an empty mapping)
    @param data_all: iterable of mappings ``date -> papers``; every date key
        must provide ``strftime`` and is stored under its "%m/%d/%Y" string
    @return the merged mapping that was written back to *filename*

    After merging, every entry (old and new) gets ``'ch_abs'`` set to a deep
    copy of its ``'abstract'`` value; the ``papers`` dicts passed in through
    *data_all* are modified in place by this step.
    """
    with open(filename, "r") as src:
        raw = src.read()
    json_data = json.loads(raw) if raw else {}

    # Newly fetched days replace any stored entry with the same date string.
    for batch in data_all:
        for day, papers in batch.items():
            json_data[day.strftime("%m/%d/%Y")] = papers

    for papers in json_data.values():
        papers['ch_abs'] = copy.deepcopy(papers['abstract'])

    with open(filename, "w") as dst:
        json.dump(json_data, dst)
    return json_data
303
+
304
def update_pickle_file(filename, data_all):
    """Merge *data_all* into the pickled mapping stored at *filename*.

    @param filename: path of an existing pickle file holding a mapping
    @param data_all: mapping whose entries overwrite or extend the stored one
    @return the merged mapping that was written back to *filename*

    NOTE: the file is read with ``pickle.load``, so it must come from a
    trusted source.
    """
    with open(filename, "rb") as src:
        stored = pickle.load(src)

    merged = stored.copy()
    for key in data_all.keys():
        merged[key] = data_all[key]

    with open(filename, "wb") as dst:
        pickle.dump(merged, dst)

    return merged
333
def json_to_md(filename):
    """Render the JSON store at *filename* into ``README.md``.

    @param filename: str, path of the JSON file (an empty file is treated as
        an empty mapping)
    @return None

    ``README.md`` in the current working directory is emptied and rewritten:
    a dated header, then one table per non-empty top-level key whose rows
    are that key's non-``None`` values written verbatim in descending key
    order.  Prints "finished" when done.
    """
    stamp = str(datetime.date.today()).replace('-', '.')

    with open(filename, "r") as src:
        raw = src.read()
    data = json.loads(raw) if raw else {}

    md_filename = "README.md"

    # Empty the target (creating it if needed) before appending to it.
    with open(md_filename, "w+"):
        pass

    with open(md_filename, "a+") as out:
        out.write("## Updated on " + stamp + "\n\n")

        for keyword, day_content in data.items():
            if not day_content:
                continue
            # Section heading and table header for this key.
            out.write(f"## {keyword}\n\n")
            out.write("|Publish Date|Title|Authors|PDF|\n" + "|---|---|---|---|\n")

            for row in sort_papers(day_content).values():
                if row is not None:
                    out.write(row)

            out.write("\n")
    print("finished")
377
+
378
+
379
+
380
def neiborhood_search(corpus_data, query_data, num=8):
    """Find the nearest corpus vectors for each query by cosine similarity.

    @param corpus_data: sequence of torch tensors, concatenated along dim 0
        to form the corpus matrix
    @param query_data: sequence of torch tensors, concatenated along dim 0
        to form the query matrix
    @param num: number of neighbours requested per query
    @return the index array produced by ``faiss`` ``search`` (one row per
        query, ``num`` columns); per the FAISS documentation, slots without
        a neighbour are reported as -1

    Both matrices are L2-normalised before an inner-product search, which
    makes the ranking equivalent to cosine similarity.  The index dimension
    is taken from the corpus matrix instead of being fixed at 768, so
    embeddings of any width work; 768-dimensional input behaves as before.
    """
    xq = torch.cat(query_data, 0).cpu().numpy().astype('float32')
    xb = torch.cat(corpus_data, 0).cpu().numpy().astype('float32')

    index = faiss.IndexFlatIP(int(xb.shape[1]))
    faiss.normalize_L2(xq)
    faiss.normalize_L2(xb)
    index.add(xb)  # add vectors to the index
    _, neighbour_ids = index.search(xq, num)

    return neighbour_ids
394
+
395
+
396
+
397
+
398
def get_passage_conclusion_through_LLM(text, question):
    """Ask the chat model to summarize *text* into a passage.

    @param text: text substituted for ``{context}`` in the prompt
    @param question: accepted for interface compatibility; the current
        prompt has no ``{question}`` placeholder, so it does not reach the
        model
    @return the model's reply text

    The request is retried once after a 20 second sleep if the first attempt
    raises; a failure of the retry propagates.
    """
    prompt_template = (
        "Given text:{context},based on this text, summarize the above text into a passage that cannot change its original meaning.")
    openai.api_key = KEY

    prompt = prompt_template.format_map({'context': text, 'question': question})
    request = {
        "model": llm_model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.6,
        "seed": 42,
    }
    try:
        completion = openai.ChatCompletion.create(**request)
    except Exception:
        # Narrower than a bare ``except`` so Ctrl-C / SystemExit are not
        # swallowed and delayed by the back-off sleep.
        time.sleep(20)
        completion = openai.ChatCompletion.create(**request)
    return completion.choices[0].message["content"]
422
+
423
+
424
def retain_useful_info(text, question):
    """Ask the chat model for the one sentence of *text* most useful for *question*.

    @param text: text substituted for ``{context}`` in the prompt
    @param question: text substituted for ``{question}`` in the prompt
    @return the model's reply text

    The request is retried once after a 20 second sleep if the first attempt
    raises; a failure of the retry propagates.
    """
    prompt_template = (
        "Given text:{context},given question:{question},based on this text and question, summarize the text into a sentence that is most useful in answering this question.")
    openai.api_key = KEY

    prompt = prompt_template.format_map({'context': text, 'question': question})
    request = {
        "model": llm_model,
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        completion = openai.ChatCompletion.create(**request)
    except Exception:
        # Narrower than a bare ``except`` so Ctrl-C / SystemExit are not
        # swallowed and delayed by the back-off sleep.
        time.sleep(20)
        completion = openai.ChatCompletion.create(**request)
    return completion.choices[0].message["content"]
447
+
448
+
449
def llm_summary(text_l):
    """Concatenate the strings in *text_l* and ask the chat model to summarize them.

    @param text_l: iterable of strings, joined without a separator
    @return the model's reply text

    The request is retried once after a 20 second sleep if the first attempt
    raises; a failure of the retry propagates.
    """
    text = ''.join(text_l)
    prompt_template = (
        "Given text:{context},based on this text, summarize the above text into a fluent passage that cannot change its original meaning.")
    openai.api_key = KEY

    prompt = prompt_template.format_map({'context': text})
    request = {
        "model": llm_model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.6,
        "seed": 42,
    }
    try:
        completion = openai.ChatCompletion.create(**request)
    except Exception:
        # Narrower than a bare ``except`` so Ctrl-C / SystemExit are not
        # swallowed and delayed by the back-off sleep.
        time.sleep(20)
        completion = openai.ChatCompletion.create(**request)
    return completion.choices[0].message["content"]
475
+
476
+
477
def get_multi_query_through_LLM(question_data, generated_answers=None, support_material=None):
    """Ask the chat model for five alternative supporting materials for a text.

    @param question_data: the original text, substituted for ``{question}``
    @param generated_answers: optional earlier response; when not ``None``
        the "with_answer" prompt is used and this fills ``{answer}``
    @param support_material: earlier support material, filling
        ``{support_material}`` in the "with_answer" prompt
    @return the model's reply text, unmodified

    The request is retried once after a 20 second sleep if the first attempt
    raises; a failure of the retry propagates.
    """
    PROMPT_DICT = {
        "without_answer": (
            "The input will be a paragraph of text."
            "Your task is to generate five as diverse, informative, and relevant, as possible versions of supporting materials, perspectives, fact. Provide these alternative materials, perspectives, fact. Each of them occupies a line."
            "Original text: {question}"
            "Answer:,Please output a list to split these five answers."),
        "with_answer": (
            "The input will be a paragraph of original text, a previously generated support material and a response for the text based on reviously generated support material by a naive agent, who may make mistakes."
            "Your task is to generate five as diverse, informative, and relevant, as possible versions of supporting materials,perspectives, fact based on the the above information. Each of them occupies a line."
            "Provide these alternative materials, perspectives, fact."
            "Original text:{question}. "
            "Previously generated support material (the text below are naive, and could be wrong, use with caution): {support_material} "
            "Response:{answer}."
            "Answer:,Please output a list to split these five answers."),
    }
    openai.api_key = KEY

    fields = {'question': question_data}
    if generated_answers is not None:
        fields['answer'] = generated_answers
        fields['support_material'] = support_material
        prompt = PROMPT_DICT["with_answer"].format_map(fields)
    else:
        prompt = PROMPT_DICT["without_answer"].format_map(fields)

    request = {
        "model": llm_model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.6,
        "seed": 42,
    }
    try:
        completion = openai.ChatCompletion.create(**request)
    except Exception:
        # Narrower than a bare ``except`` so Ctrl-C / SystemExit are not
        # swallowed and delayed by the back-off sleep.
        time.sleep(20)
        completion = openai.ChatCompletion.create(**request)

    # NOTE: an earlier per-character loop that tried to strip "1." .. "5."
    # prefixes only rebound its loop variable and never changed the result;
    # it was removed.  The reply is returned exactly as received.
    return completion.choices[0].message["content"]
521
+
522
+
523
def get_question_through_LLM(question, context):
    """Send ``question[0]`` followed by numbered documents to the chat model.

    @param question: sequence whose first element is the prompt prefix
    @param context: sequence of document strings; each is appended as
        ``"Documents <n>: <doc>\\n"`` with ``n`` starting at 1
    @return list containing one element: the model's reply text

    The request is retried once after a 20 second sleep if the first attempt
    raises; a failure of the retry propagates.
    """
    prompt = question[0] + "".join(
        "Documents %d: " % number + document + '\n'
        for number, document in enumerate(context, start=1)
    )

    openai.api_key = KEY
    request = {
        "model": llm_model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.6,
        "seed": 42,
    }
    try:
        completion = openai.ChatCompletion.create(**request)
    except Exception:
        # Narrower than a bare ``except`` so Ctrl-C / SystemExit are not
        # swallowed and delayed by the back-off sleep.
        time.sleep(20)
        completion = openai.ChatCompletion.create(**request)
    return [completion.choices[0].message["content"]]
557
+
558
+
559
def get_response_through_LLM(question, context):
    """Answer each question from its matching context via the chat model.

    @param question: sequence of question strings
    @param context: sequence indexed in parallel with *question*; it must be
        at least as long (a shorter one raises ``IndexError``)
    @return list of reply texts, one per question, in order

    Each request is retried once after a 20 second sleep if the first
    attempt raises; a failure of the retry propagates.
    """
    prompt_template = ("Given text: {context}, based on this text, answer the question: {question}")
    openai.api_key = KEY
    content_l = []

    for position, question_i in enumerate(question):
        fields = {'question': question_i, 'context': context[position]}
        prompt = prompt_template.format_map(fields)
        request = {
            "model": llm_model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.6,
            "seed": 42,
        }
        try:
            completion = openai.ChatCompletion.create(**request)
        except Exception:
            # Narrower than a bare ``except`` so Ctrl-C / SystemExit are not
            # swallowed and delayed by the back-off sleep.
            time.sleep(20)
            completion = openai.ChatCompletion.create(**request)
        content_l.append(completion.choices[0].message["content"])
    return content_l
592
+
593
def get_response_through_LLM_answer(question, context, profile):
    """Answer each question from its materials, optionally using a profile.

    @param question: sequence of question strings
    @param context: sequence indexed in parallel with *question*; it must be
        at least as long (a shorter one raises ``IndexError``)
    @param profile: profile text added to every prompt, or ``None`` to use
        the prompt without a profile
    @return list of reply texts, one per question, in order

    Each request is retried once after a 20 second sleep if the first
    attempt raises; a failure of the retry propagates.
    """
    if profile is None:
        prompt_template = (
            "Answer the: {question}, based on materials: {context}"
        )
    else:
        prompt_template = (
            "Answer the: {question}, based on materials: {context} and my profile: {profile}"
        )
    openai.api_key = KEY
    content_l = []

    for position, question_i in enumerate(question):
        fields = {'question': question_i, 'context': context[position]}
        if profile is not None:
            fields['profile'] = profile
        prompt = prompt_template.format_map(fields)
        request = {
            "model": llm_model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.6,
            "seed": 42,
        }
        try:
            completion = openai.ChatCompletion.create(**request)
        except Exception:
            # Narrower than a bare ``except`` so Ctrl-C / SystemExit are not
            # swallowed and delayed by the back-off sleep.
            time.sleep(20)
            completion = openai.ChatCompletion.create(**request)
        content_l.append(completion.choices[0].message["content"])
    return content_l
645
+
646
def get_response_through_LLM_cross(question, context):
    """Answer each question briefly from one shared block of documents.

    @param question: sequence of question strings
    @param context: string holding the documents; it is placed verbatim at
        the start of every prompt
    @return list of reply texts, one per question, in order

    Only the instruction sentence is run through ``str.format``.  The
    documents are concatenated as plain text, so literal ``{`` / ``}`` in
    them (common in LaTeX-heavy abstracts) can no longer raise or be
    mistaken for placeholders.

    Each request is retried once after a 20 second sleep if the first
    attempt raises; a failure of the retry propagates.
    """
    documents = context + '\n'
    instruction = "Based on the above documents, answer the question: {question} in short."

    openai.api_key = KEY
    content_l = []
    for question_i in question:
        prompt = documents + instruction.format_map({'question': question_i})
        request = {
            "model": llm_model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.6,
            "seed": 42,
        }
        try:
            completion = openai.ChatCompletion.create(**request)
        except Exception:
            # Narrower than a bare ``except`` so Ctrl-C / SystemExit are not
            # swallowed and delayed by the back-off sleep.
            time.sleep(20)
            completion = openai.ChatCompletion.create(**request)
        content_l.append(completion.choices[0].message["content"])
    return content_l
676
+
677
+
678
def get_bert_embedding(instructions):
    """Embed each text with the module-level ``tokenizer`` and ``model``.

    @param instructions: iterable of texts
    @return list with one tensor per text: the model's
        ``last_hidden_state`` averaged over dimension 1

    Every text is tokenized with truncation at 512 tokens and moved to the
    CPU; the forward passes run under ``torch.no_grad()``.
    """
    cpu = torch.device("cpu")
    encoded_batches = []
    for text in instructions:
        encoded = tokenizer(text, return_tensors='pt', truncation=True,
                            max_length=512)
        encoded_batches.append(encoded.to(cpu))

    with torch.no_grad():
        return [
            model(**batch)['last_hidden_state'].mean(1)
            for batch in encoded_batches
        ]
692
+
693
def calculate_similarity(tensor_list, input_tensor):
    """Cosine similarity between *input_tensor* and each tensor in *tensor_list*.

    Every tensor is flattened and compared as a single row vector, so each
    entry of the returned list is a one-element tensor.
    """
    reference = input_tensor.flatten().unsqueeze(0)
    similarities = []
    for candidate in tensor_list:
        row = candidate.flatten().unsqueeze(0)
        similarities.append(F.cosine_similarity(reference, row))
    return similarities
699
+
700
def response_verify(question, context, verify = False):
    """Turn each question/answer pair into a knowledge-point passage via the chat model.

    @param question: sequence of question strings
    @param context: sequence of answers indexed in parallel with *question*;
        it must be at least as long (a shorter one raises ``IndexError``)
    @param verify: when true, the prompt first asks the model to output
        'idk' for answers that merely say the question cannot be answered
    @return list of reply texts, one per question, in order

    Each request is retried once after a 20 second sleep if the first
    attempt raises; a failure of the retry propagates.
    """
    if verify:
        prompt_template = (
            "Input: Given question:{question}, given answer:{context}. Based on the provided question and its corresponding answer, perform the following steps:"
            "Step 1: Determine if the answer is an actual answer or if it merely indicates that the question cannot be answered due to insufficient information. If the latter is true, just output 'idk' without any extra words "
            "Step 2: If it is a valid answer, succinctly summarize both the question and answer into a coherent knowledge point, forming a fluent passage."
        )
    else:
        prompt_template = (
            "Given question:{question},given answer:{context},based on the given question and corresponding answer, "
            "summarize them into a knowledge point like a fluent passage.")

    openai.api_key = KEY
    content_l = []

    for position, question_i in enumerate(question):
        fields = {'question': question_i, 'context': context[position]}
        prompt = prompt_template.format_map(fields)
        request = {
            "model": llm_model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.6,
            "seed": 42,
        }
        try:
            completion = openai.ChatCompletion.create(**request)
        except Exception:
            # Narrower than a bare ``except`` so Ctrl-C / SystemExit are not
            # swallowed and delayed by the back-off sleep.
            time.sleep(20)
            completion = openai.ChatCompletion.create(**request)
        content_l.append(completion.choices[0].message["content"])
    return content_l
739
+
740
+