Mishra committed on
Commit
e2f6427
1 Parent(s): 2f1e2eb

add streamlit app

Files changed (1)
  1. streamlit_app.py +245 -0
streamlit_app.py ADDED
@@ -0,0 +1,245 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 15:33:43 2024

@author: anubhuti.mishra
"""

# !pip install llama-index-vector-stores-pinecone
# pip install pinecone-client
# pip install llama-index-embeddings-azure-openai
# pip install llama-index-llms-azure-openai
# guide: https://docs.pinecone.io/docs/llamaindex
# azure openai + llama-index guide: https://docs.llamaindex.ai/en/stable/examples/customization/llms/AzureOpenAI/
import logging
import sys
import os
import openai
import nest_asyncio
nest_asyncio.apply()
import pandas as pd
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

# Credentials are read from the environment; never hardcode API keys in source.
api_key = os.environ["AZURE_OPENAI_API_KEY"]
azure_endpoint = "https://datafigpt.openai.azure.com/"
api_version = "2023-07-01-preview"

llm = AzureOpenAI(
    model="gpt-35-turbo",
    deployment_name="gpt35turbo",
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
)

# You need to deploy your own embedding model as well as your own chat completion model
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="text-embedding-ada-002",
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
)
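
# Optional sanity check (a minimal sketch, not part of the pipeline): confirm both
# Azure deployments respond before building the index. complete() and
# get_text_embedding() are standard llama-index methods; the prompt is arbitrary.
# print(llm.complete("Say hello.").text)                # chat deployment
# print(len(embed_model.get_text_embedding("hello")))   # 1536 dims for ada-002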

# PINECONE_API_KEY and OPENAI_API_KEY must be set in the environment before running the app.

# Build a Pinecone index and connect to it

from pinecone import Pinecone
from pinecone import ServerlessSpec
from llama_index.vector_stores.pinecone import PineconeVectorStore

api_key = os.environ["PINECONE_API_KEY"]
pc = Pinecone(api_key=api_key)
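
# The "clm" index was created online in the Pinecone console. A sketch of creating
# it in code instead, using the ServerlessSpec imported above (the dimension matches
# text-embedding-ada-002; cloud and region here are assumptions):
# if "clm" not in pc.list_indexes().names():
#     pc.create_index(
#         name="clm",
#         dimension=1536,
#         metric="cosine",
#         spec=ServerlessSpec(cloud="aws", region="us-east-1"),
#     )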

# dependencies
from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.core.base import llms

from llama_index.core.base.llms.generic_utils import get_from_param_or_env

# workaround: generic_utils is no longer in core.llms, so base.py and utils.py in
# the embeddings library were manually updated to import from core.base.llms
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# alternative non-Azure setup (the key comes from the environment):
# openai.api_key = os.getenv("OPENAI_API_KEY")
# llm = OpenAI(temperature=0, model="gpt-4o")
# embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embed_model

# generate metadata with llama-index extractors
# guide: https://docs.llamaindex.ai/en/stable/examples/metadata_extraction/MetadataExtractionSEC.html
from llama_index.core.schema import MetadataMode
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    BaseExtractor,
)

from llama_index.legacy.extractors.metadata_extractors import EntityExtractor  # moved to legacy modules
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import TransformComponent
from llama_index.core.ingestion import IngestionPipeline
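
# Sketch of how the splitter/extractor imports above fit together at ingestion time.
# Ingestion was run separately, so this stays commented out; the "./docs" path and
# the chunking parameters are assumptions.
# pipeline = IngestionPipeline(
#     transformations=[
#         TokenTextSplitter(chunk_size=512, chunk_overlap=64),
#         TitleExtractor(llm=llm),
#         QuestionsAnsweredExtractor(questions=3, llm=llm),
#         embed_model,
#     ],
#     vector_store=vector_store,  # defined below
# )
# pipeline.run(documents=SimpleDirectoryReader("./docs").load_data())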

# query data from Pinecone
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
    FilterCondition,
)

import numpy as np

# initialize the index - it was created online in the Pinecone console
pinecone_index = pc.Index("clm")

# initialize the vector store
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

# instantiate a VectorStoreIndex from the vector_store object
vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
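
# Quick retrieval smoke test (a sketch; the query string is an arbitrary assumption):
# nodes = vector_index.as_retriever(similarity_top_k=1).retrieve("What is CLM?")
# print(nodes[0].metadata if nodes else "index returned no nodes")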

# RAG pipeline
from llama_index.core.query_engine import RetrieverQueryEngine

import streamlit as st
import base64

from llama_index.core import PromptTemplate

# Build a custom query engine prompt
template = (
    "We have provided context information below. \n"
    """
    You are an algorithm designed to extract information in structured formats from a vector database to answer questions.
    All information provided must be drawn solely from the information contained in the vector database.
    Do not answer questions such as "Which agency is better?" or "Who is doing better at CLM, CDC or USAID?". For such questions, respond with "I am not programmed to answer such questions."
    """
    "--------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Given this information, please answer the question: {query_str}\n"
)

qa_template = PromptTemplate(template)  # somewhat redundant with the kg template, but still necessary to a degree

# Build the retriever and query engine; the retriever returns the top 5 results
# filters = MetadataFilters(filters=[MetadataFilter(key="country", operator=FilterOperator.EQ, value=metadata_filters)])
retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)  # , filters=filters
query_engine = RetrieverQueryEngine(retriever=retriever)

# Replace the default query engine prompt with the custom one
query_engine.update_prompts(
    {"response_synthesizer:summary_template": qa_template}
)
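
# Sketch of wiring the sidebar country filter (metadata_filters, defined in the
# Chatbot tab below) into retrieval, completing the commented-out filter above;
# "all" skips filtering, and the "country" key must match the metadata written at
# ingestion time.
# if metadata_filters != "all":
#     country_filter = MetadataFilters(
#         filters=[MetadataFilter(key="country", operator=FilterOperator.EQ, value=metadata_filters)]
#     )
#     retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5, filters=country_filter)
#     query_engine = RetrieverQueryEngine(retriever=retriever)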

# import ragas for evaluation
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas.metrics.critique import harmfulness
from ragas.evaluation import evaluate

metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    harmfulness,
]

tab1, tab2 = st.tabs(["Chatbot", "Evaluation Dashboard"])

# app = st.container()
# st.set_page_config(page_title="Data.FI CLM Chatbot")
with tab1:
    # page = st.sidebar.selectbox("Choose a page", ["Chatbot", "Evaluation Dashboard"])
    # if page == "Chatbot":

    st.header("Data.FI CLM Chatbot")

    # Initialize chat history
    if 'chat_history' not in st.session_state:
        st.session_state['chat_history'] = []

    # Sidebar for metadata filters
    st.sidebar.title('Metadata Filters')
    options = ['all', 'nigeria', 'uganda', 'lesotho', 'mozambique', 'india', 'indonesia', 'ivory coast', 'nepal', 'south africa', 'philippines']
    metadata_filters = st.sidebar.selectbox('Select a filter', options)

    # User message input and send button
    user_input = st.text_input('Enter your message')
    if st.button('Send'):
        # Concatenate the chat history with the user input so the query carries context
        # (see the commented chat-engine sketch at the end of this tab for an alternative)
        chat_string = ' '.join([message['content'] for message in st.session_state['chat_history']])
        print(chat_string)

        # "You are a helpful assistant and will answer questions as specifically as possible based on the information given to you. If you do not have the information, you will let the user know."
        answer = query_engine.query(user_input + " The user previously asked and received the following: " + chat_string)
        response = answer.response
        sources = answer.source_nodes

        # Update chat history
        st.session_state['chat_history'].append({'role': 'user', 'content': user_input})
        # st.session_state['chat_history'].append({'role': 'metadata_filter', 'content': metadata_filters})
        st.session_state['chat_history'].append({'role': 'chatbot', 'content': response})

        # Display source nodes in the sidebar (at most the top 5 retrieved)
        st.sidebar.write('Source Nodes:')
        for i, node in enumerate(sources[:5]):
            st.sidebar.write(f"**Document {i+1}**: {node.metadata['document_title']}")

    # for message in st.session_state['chat_history']:
    #     if message['role'] == 'user':
    #         eval_questions = [message['content']]
    #     else:
    #         eval_answers = [message['content']]
    # print(eval_questions)
    # print(eval_answers)
    # result = evaluate(eval_questions, metrics, eval_answers, [])
    # data = result.to_pandas()
    # tab1.write(data)

    # Display chat history
    st.write('Chat history:')
    for message in st.session_state['chat_history']:
        st.write(f"**{message['role']}**: {message['content']}")

    # Download chat history button
    if st.button('Download Chat History'):
        df = pd.DataFrame(st.session_state['chat_history'])
        csv = df.to_csv(index=False)
        b64 = base64.b64encode(csv.encode()).decode()
        href = f'<a href="data:file/csv;base64,{b64}" download="chat_history.csv">Download Chat History</a>'
        st.markdown(href, unsafe_allow_html=True)
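
    # Alternative to manual history concatenation (a sketch, untested here):
    # llama-index's condense-question chat engine rewrites each turn into a
    # standalone query over the same index.
    # chat_engine = vector_index.as_chat_engine(chat_mode="condense_question", similarity_top_k=5)
    # answer = chat_engine.chat(user_input)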


with tab2:
    st.header("Evaluation Dashboard")
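    # Sketch of populating this tab with the RAGAS metrics imported above. The
    # Dataset-based evaluate() call and column names follow the ragas docs, but the
    # exact API varies across ragas versions, so treat this as an outline.
    # from datasets import Dataset
    # eval_questions = [m['content'] for m in st.session_state['chat_history'] if m['role'] == 'user']
    # eval_answers = [m['content'] for m in st.session_state['chat_history'] if m['role'] == 'chatbot']
    # ds = Dataset.from_dict({
    #     "question": eval_questions,
    #     "answer": eval_answers,
    #     "contexts": [[] for _ in eval_questions],
    # })
    # result = evaluate(ds, metrics=metrics)
    # st.dataframe(result.to_pandas())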