bpHigh committed on
Commit 3fff7cd • 1 Parent(s): c91a05c

Upload 3 files

Files changed (3)
  1. packages.txt +0 -0
  2. requirements.txt +6 -0
  3. research_buddy_app.py +296 -0
packages.txt ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ llama-index==0.8.27
+ modal
+ python-rapidjson==1.10
+ clarifai==9.8.0
+ clarifai-grpc==9.8.0
+ streamlit
research_buddy_app.py ADDED
@@ -0,0 +1,296 @@
+ from llama_index import Document
+ from llama_index.chat_engine import CondenseQuestionChatEngine
+ from llama_index.indices.vector_store import VectorIndexRetriever
+ from llama_index.node_parser import SimpleNodeParser
+ from llama_index import LangchainEmbedding, ServiceContext
+ from llama_index import VectorStoreIndex
+ from llama_index import StorageContext, load_index_from_storage
+ from llama_index.query_engine import RetrieverQueryEngine
+ from llama_index.response_synthesizers import TreeSummarize, get_response_synthesizer
+ from llama_index.llms import ChatMessage
+
+ from langchain.llms import Clarifai
+ from langchain.embeddings import ClarifaiEmbeddings
+
+
+ from clarifai_grpc.channel.clarifai_channel import ClarifaiChannel
+ from clarifai_grpc.grpc.api import resources_pb2, service_pb2, service_pb2_grpc
+ from clarifai_grpc.grpc.api.status import status_code_pb2
+
+
+ import uuid
+
+ import streamlit as st
+
+ import modal
+
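+ # Credentials come from Streamlit secrets. A minimal `.streamlit/secrets.toml`
+ # for this app might look like the sketch below (placeholder values, not real
+ # credentials; the threshold should be numeric):
+ #
+ #     CLARIFAI_PAT = "your-clarifai-personal-access-token"
+ #     MODERATION_THRESHOLD = 0.5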
+ CLARIFAI_PAT = st.secrets.CLARIFAI_PAT
+ MODERATION_THRESHOLD = float(st.secrets.MODERATION_THRESHOLD)  # cast in case the secret is stored as a string
+ st.set_page_config(page_title="Research Buddy: Insights and Q&A on AI Research Papers using GPT and Nougat", page_icon="🧐", layout="centered", initial_sidebar_state="auto", menu_items=None)
+ st.title(body="AI Research Buddy: Nougat + GPT Powered Paper Insights 📚🤖")
+ st.info("""This application currently only works with arXiv and ACL Anthology links of the form:
+ 1) arXiv: https://arxiv.org/abs/paper_unique_identifier
+ 2) ACL Anthology: https://aclanthology.org/paper_unique_identifier/
+
+ This application uses the recently released Meta Nougat visual transformer to process papers.""", icon="ℹ️")
+ user_input = st.text_input("Enter the arXiv or ACL Anthology URL of the paper", "https://aclanthology.org/2023.semeval-1.266/")
+
+
+ def initialize_session_state():
+     if "vector_store" not in st.session_state:
+         st.session_state.vector_store = None
+
+     if "messages" not in st.session_state:
+         st.session_state.messages = [
+             {"role": "assistant", "content": "Ask me a question about the research paper"}
+         ]
+
+     if "paper_content" not in st.session_state:
+         st.session_state.paper_content = None
+
+     if "paper_insights" not in st.session_state:
+         st.session_state.paper_insights = None
+
+
+ initialize_session_state()
+
+
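+ # The heavy Nougat OCR step runs remotely: modal.Function.lookup retrieves a
+ # function named "main" from a separately deployed Modal app called
+ # "streamlit-hack", and .call executes it with the paper URL. The Modal
+ # deployment itself is not part of this file.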
+ def get_paper_content(url: str) -> str:
+     with st.spinner(text="Using Nougat (https://facebookresearch.github.io/nougat/) to read the paper and produce a markdown representation of its contents"):
+         f = modal.Function.lookup("streamlit-hack", "main")
+         output = f.call(url)
+         st.session_state.paper_content = output
+         return output
+
+
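+ # Indexing pipeline: split the markdown into nodes, embed each node with
+ # Clarifai's hosted text-embedding-ada model, build a vector index over the
+ # nodes, and persist it to a random per-session directory so later Streamlit
+ # reruns can reload it from st.session_state.vector_store.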
+ def index_paper_content(content: str):
+     with st.spinner(text="Indexing the paper – hang tight! This should take 3-5 minutes"):
+         try:
+             LLM_USER_ID = 'openai'
+             LLM_APP_ID = 'chat-completion'
+             # Change these to whatever model you want to use
+             LLM_MODEL_ID = 'GPT-3_5-turbo'
+             llm = Clarifai(pat=CLARIFAI_PAT, user_id=LLM_USER_ID, app_id=LLM_APP_ID, model_id=LLM_MODEL_ID)
+
+             documents = [Document(text=content)]
+             parser = SimpleNodeParser.from_defaults()
+
+             nodes = parser.get_nodes_from_documents(documents)
+             USER_ID = 'openai'
+             APP_ID = 'embed'
+             # Change these to whatever model you want to use
+             MODEL_ID = 'text-embedding-ada'
+             embeddings = ClarifaiEmbeddings(pat=CLARIFAI_PAT, user_id=USER_ID, app_id=APP_ID, model_id=MODEL_ID)
+             embed_model = LangchainEmbedding(embeddings)
+             service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
+             index = VectorStoreIndex(nodes, service_context=service_context)
+             persist_dir = uuid.uuid4().hex
+             st.session_state.vector_store = persist_dir
+             index.storage_context.persist(persist_dir=persist_dir)
+             return "Paper has been Indexed"
+
+         except Exception as e:
+             print(str(e))
+             return "Unable to Index the Research Paper"
+
+
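+ # Insights are generated with a one-shot query: reload the persisted index,
+ # retrieve the top-4 most similar nodes, and summarize them with the
+ # "tree_summarize" response mode (hierarchical summarization over the
+ # retrieved chunks).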
+ def generate_insights():
+     with st.spinner(text="Generating insights on the paper and preparing the chatbot"):
+         try:
+             LLM_USER_ID = 'openai'
+             LLM_APP_ID = 'chat-completion'
+             # Change these to whatever model you want to use
+             LLM_MODEL_ID = 'GPT-3_5-turbo'
+             llm = Clarifai(pat=CLARIFAI_PAT, user_id=LLM_USER_ID, app_id=LLM_APP_ID, model_id=LLM_MODEL_ID)
+
+             USER_ID = 'openai'
+             APP_ID = 'embed'
+             # Change these to whatever model you want to use
+             MODEL_ID = 'text-embedding-ada'
+             embeddings = ClarifaiEmbeddings(pat=CLARIFAI_PAT, user_id=USER_ID, app_id=APP_ID, model_id=MODEL_ID)
+             embed_model = LangchainEmbedding(embeddings)
+
+             service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
+
+             index = load_index_from_storage(
+                 StorageContext.from_defaults(persist_dir=st.session_state.vector_store),
+                 service_context=service_context
+             )
+
+             retriever = VectorIndexRetriever(
+                 index=index,
+                 similarity_top_k=4,
+             )
+             # configure response synthesizer
+             response_synthesizer = get_response_synthesizer(
+                 response_mode="tree_summarize", service_context=service_context
+             )
+
+             # assemble query engine
+             query_engine = RetrieverQueryEngine(
+                 retriever=retriever,
+                 response_synthesizer=response_synthesizer,
+             )
+
+             response_key_insights = query_engine.query("Generate the core insights, contributions and results of the paper as Key Topics and their content in markdown format, where each Key Topic is in bold followed by its content")
+
+         except Exception as e:
+             print(str(e))
+             st.session_state.paper_insights = "Error While Generating Insights"
+             return
+
+         st.session_state.paper_insights = response_key_insights.response
+
+
+ if st.button("Read and Index Paper"):
+     paper_content = get_paper_content(url=user_input)
+
+     if st.session_state.paper_content is not None:
+         with st.expander("See Paper Contents"):
+             st.markdown(paper_content)
+
+         result = index_paper_content(content=paper_content)
+         st.write(result)
+         generate_insights()
+
+
+ if st.session_state.paper_content is not None:
+     with st.expander("See Paper Contents"):
+         st.markdown(st.session_state.paper_content)
+
+ if st.session_state.paper_insights is not None:
+     st.sidebar.title("🚀 Illuminating Research Insights 📜💡")
+     st.sidebar.write(st.session_state.paper_insights)
+
+
+ def reset_conversation():
+     st.session_state.messages = [
+         {"role": "assistant", "content": "Ask me a question about the research paper"}
+     ]
+
+
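+ # Moderation goes through Clarifai's gRPC API directly: the user's message is
+ # posted to the multilingual text moderation model, and any predicted concept
+ # whose confidence exceeds MODERATION_THRESHOLD flags the message for
+ # intervention.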
+ def moderate_text(text: str) -> tuple:
+     MODERATION_USER_ID = 'clarifai'
+     MODERATION_APP_ID = 'main'
+     # Change these to whatever model you want to use
+     MODERATION_MODEL_ID = 'moderation-multilingual-text-classification'
+     MODERATION_MODEL_VERSION_ID = '79c2248564b0465bb96265e0c239352b'
+
+     channel = ClarifaiChannel.get_grpc_channel()
+     stub = service_pb2_grpc.V2Stub(channel)
+
+     metadata = (('authorization', 'Key ' + CLARIFAI_PAT),)
+
+     userDataObject = resources_pb2.UserAppIDSet(user_id=MODERATION_USER_ID, app_id=MODERATION_APP_ID)
+
+     post_model_outputs_response = stub.PostModelOutputs(
+         service_pb2.PostModelOutputsRequest(
+             user_app_id=userDataObject,  # The userDataObject is created above and is required when using a PAT
+             model_id=MODERATION_MODEL_ID,
+             version_id=MODERATION_MODEL_VERSION_ID,  # This is optional. Defaults to the latest model version
+             inputs=[
+                 resources_pb2.Input(
+                     data=resources_pb2.Data(
+                         text=resources_pb2.Text(
+                             raw=text
+                         )
+                     )
+                 )
+             ]
+         ),
+         metadata=metadata
+     )
+     if post_model_outputs_response.status.code != status_code_pb2.SUCCESS:
+         print(post_model_outputs_response.status)
+         raise Exception("Post model outputs failed, status: " + post_model_outputs_response.status.description)
+
+     # Since we sent one input, exactly one output is returned
+     output = post_model_outputs_response.outputs[0]
+     moderation_reasons = []
+     intervention_required = False
+     for concept in output.data.concepts:
+         if concept.value > MODERATION_THRESHOLD:
+             moderation_reasons.append(concept.name)
+             intervention_required = True
+
+     return ", ".join(moderation_reasons), intervention_required
+
+
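+ # Chat over the indexed paper: rebuild the query engine from the persisted
+ # index, then wrap it in a CondenseQuestionChatEngine, which condenses the
+ # running chat history plus the new question into a single standalone query
+ # before retrieval.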
+ if st.session_state.vector_store is not None:
+     LLM_USER_ID = 'openai'
+     LLM_APP_ID = 'chat-completion'
+     # Change these to whatever model you want to use
+     LLM_MODEL_ID = 'GPT-3_5-turbo'
+     llm = Clarifai(pat=CLARIFAI_PAT, user_id=LLM_USER_ID, app_id=LLM_APP_ID, model_id=LLM_MODEL_ID)
+
+     USER_ID = 'openai'
+     APP_ID = 'embed'
+     # Change these to whatever model you want to use
+     MODEL_ID = 'text-embedding-ada'
+     embeddings = ClarifaiEmbeddings(pat=CLARIFAI_PAT, user_id=USER_ID, app_id=APP_ID, model_id=MODEL_ID)
+     embed_model = LangchainEmbedding(embeddings)
+
+     service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
+
+     index = load_index_from_storage(
+         StorageContext.from_defaults(persist_dir=st.session_state.vector_store),
+         service_context=service_context
+     )
+
+     retriever = VectorIndexRetriever(
+         index=index,
+         similarity_top_k=2,
+     )
+     # configure response synthesizer
+     response_synthesizer = get_response_synthesizer(
+         response_mode="tree_summarize", service_context=service_context
+     )
+
+     # assemble query engine
+     query_engine = RetrieverQueryEngine(
+         retriever=retriever,
+         response_synthesizer=response_synthesizer,
+     )
+
+     custom_chat_history = []
+     for message in st.session_state.messages:
+         custom_message = ChatMessage(role=message["role"], content=message["content"])
+         custom_chat_history.append(custom_message)
+
+     chat_engine = CondenseQuestionChatEngine.from_defaults(service_context=service_context,
+                                                            query_engine=query_engine,
+                                                            verbose=True,
+                                                            chat_history=custom_chat_history)
+
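+     # st.chat_input returns None until the user submits a message, so the
+     # walrus assignment below only appends to the history on an actual
+     # submission.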
+     if prompt := st.chat_input("Your question"):  # Prompt for user input and save to chat history
+         st.session_state.messages.append({"role": "user", "content": prompt})
+
+     st.button('Reset Chat', on_click=reset_conversation)
+
+     for message in st.session_state.messages:  # Display the prior chat messages
+         with st.chat_message(message["role"]):
+             st.write(message["content"])
+
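+     # Moderate the new user message first; if the moderation call itself
+     # fails, the app fails open (intervene=False) and answers the question
+     # anyway.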
+     # If the last message is not from the assistant, generate a new response
+     if st.session_state.messages[-1]["role"] != "assistant":
+         with st.chat_message("assistant"):
+             with st.spinner("Thinking..."):
+                 try:
+                     reason, intervene = moderate_text(prompt)
+                 except Exception as e:
+                     print(str(e))
+                     reason = ''
+                     intervene = False
+                 if not intervene:
+                     response = chat_engine.chat(prompt)
+                     st.write(response.response)
+                     message = {"role": "assistant", "content": response.response}
+                     st.session_state.messages.append(message)  # Add response to message history
+                 else:
+                     response = f"This query cannot be processed as it has been flagged as: {reason}"
+                     st.write(response)
+                     message = {"role": "assistant", "content": response}
+                     st.session_state.messages.append(message)