|
|
|
""" |
|
Created on Tue Apr 30 15:33:43 2024 |
|
|
|
@author: anubhuti.mishra |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging |
|
import sys |
|
import os |
|
import openai |
|
import nest_asyncio |
|
nest_asyncio.apply() |
|
import pandas as pd |
|
from pinecone import Pinecone |
|
from pinecone import ServerlessSpec |
|
from llama_index.vector_stores.pinecone import PineconeVectorStore |
|
from llama_index.vector_stores.pinecone import PineconeVectorStore |
|
from llama_parse import LlamaParse |
|
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, settings, StorageContext |
|
from llama_index.core.base import llms |
|
from llama_index.core.base.llms.generic_utils import get_from_param_or_env |
|
from llama_index.embeddings.openai import OpenAIEmbedding |
|
from llama_index.llms.openai import OpenAI |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- API keys and global model configuration -------------------------------

# Fail fast with a clear message if the Pinecone key is missing (the bare
# os.environ[...] lookup would raise an opaque KeyError instead).
pinecone_key = os.environ.get("pinecone_key")
if not pinecone_key:
    raise RuntimeError("Environment variable 'pinecone_key' is not set.")

pc = Pinecone(api_key=pinecone_key)

# os.getenv returns None when the variable is absent; assigning None into
# os.environ raises a confusing TypeError, so validate explicitly first.
openai_key = os.getenv("openai_key1")
if not openai_key:
    raise RuntimeError("Environment variable 'openai_key1' is not set.")

os.environ["OPENAI_API_KEY"] = openai_key

# LLM used for answer synthesis; temperature=0 for deterministic answers.
llm = OpenAI(temperature=0, model="gpt-4o")
# Embedding model — must match the model used when the Pinecone index was
# populated, or query embeddings will not be comparable to stored vectors.
embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

from llama_index.core import Settings

# Register the models globally so all llama-index components pick them up.
Settings.llm = llm
Settings.embed_model = embed_model
|
|
|
|
|
|
|
from llama_index.core.schema import MetadataMode |
|
from llama_index.core.extractors import ( |
|
SummaryExtractor, |
|
QuestionsAnsweredExtractor, |
|
TitleExtractor, |
|
KeywordExtractor, |
|
BaseExtractor) |
|
|
|
from llama_index.legacy.extractors.metadata_extractors import EntityExtractor |
|
from llama_index.core.node_parser import TokenTextSplitter |
|
from llama_index.core.schema import TransformComponent |
|
from llama_index.core.ingestion import IngestionPipeline |
|
|
|
|
|
from llama_index.core.retrievers import VectorIndexRetriever |
|
from llama_index.core.vector_stores import(MetadataFilter, MetadataFilters, FilterOperator, FilterCondition) |
|
|
|
import numpy as np |
|
|
|
|
|
# Handle to the Pinecone index named "clm" — assumed to already exist and be
# populated; nothing here creates or ingests into it. TODO confirm.
pinecone_index = pc.Index("clm")


# Adapt the Pinecone index to llama-index's vector-store interface.
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)


# Build the index object over the existing store; queries embed the question
# (using the globally configured embed model) and search the store.
vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)


from llama_index.core.query_engine import RetrieverQueryEngine
|
|
|
|
|
|
|
|
|
import streamlit as st

import pandas as pd

import base64



from llama_index.core import PromptTemplate


# QA prompt: constrains the bot to answer only from the vector store and to
# refuse comparative "which agency is better" questions. Adjacent string
# literals (including the triple-quoted policy section) are concatenated by
# Python into a single template with {context_str} / {query_str} placeholders.
template = (

    "We have provided context information below. \n"

    """

You are an algorithm designed for extracting information in structured formats from a vector database to answer questions.

All information provided must be drawn solely from the information contained in the vector database.

Do not answer questions such as "Which agency is better?" or "Who is doing better at CLM CDC or USAID?". For such questions, answer that "I am not programmed to answer such questions."

"""

    "--------------------\n"

    "{context_str}"

    "\n---------------------\n"

    "Given this information, please answer the question: {query_str}\n"

)


# Wrap the raw string so llama-index can substitute the placeholders.
qa_template = PromptTemplate(template)
|
|
|
|
|
|
|
|
|
# Retrieve the 5 most similar chunks from the vector store for each query.
retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)

query_engine = RetrieverQueryEngine(retriever = retriever)


# Install the custom prompt on the engine's response synthesizer.
# NOTE(review): this overrides "summary_template", but the default response
# mode ("compact") typically synthesizes answers via "text_qa_template" — the
# custom prompt may never actually be used. Confirm the correct prompt key.
query_engine.update_prompts(

    {"response_synthesizer:summary_template": qa_template}

)
|
|
|
|
|
tab1, tab2 = st.tabs(["Chatbot", "Evaluation Dashboard"])


with tab1:

    st.header("Data.FI CLM Chatbot")

    # Persist the conversation across Streamlit reruns.
    if 'chat_history' not in st.session_state:
        st.session_state['chat_history'] = []

    # Sidebar country filter. Duplicate "lesotho" entry removed.
    # NOTE(review): the selected value is never turned into a MetadataFilter
    # on the retriever, so it currently has no effect on results — wire it up.
    st.sidebar.title('Metadata Filters')
    options = ['all', 'nigeria', 'uganda', 'lesotho', 'mozambique', 'india',
               'indonesia', 'ivory coast', 'nepal', 'south africa', 'philippines']
    metadata_filters = st.sidebar.selectbox('Select a filter', options)

    user_input = st.text_input('Enter your message')
    if st.button('Send'):

        # Flatten prior turns into one string so the model sees the context.
        chat_string = ' '.join(
            message['content'] for message in st.session_state['chat_history']
        )
        print(chat_string)  # debug: server-side log of the running transcript

        answer = query_engine.query(
            user_input
            + "The user previously asked and received the following: "
            + chat_string
        )
        response = answer.response
        sources = answer.source_nodes

        st.session_state['chat_history'].append({'role': 'user', 'content': user_input})
        st.session_state['chat_history'].append({'role': 'chatbot', 'content': response})

        # Show the retrieved source documents. Iterate over what was actually
        # returned instead of assuming exactly 5 nodes — the original
        # range(5) raised IndexError whenever fewer than 5 matched.
        st.sidebar.write('Source Nodes:')
        for i, node in enumerate(sources):
            st.sidebar.write(f"**Document {i+1}**: {node.metadata['document_title']}")

    st.write('Chat history:')
    for message in st.session_state['chat_history']:
        st.write(f"**{message['role']}**: {message['content']}")

    # Offer the transcript as a downloadable file via a base64 data: URL.
    if st.button('Download Chat History'):
        df = pd.DataFrame(st.session_state['chat_history'])
        csv = df.to_csv(index=False)
        b64 = base64.b64encode(csv.encode()).decode()
        # The payload is CSV, so name the file .csv (was misleadingly .txt).
        href = f'<a href="data:file/csv;base64,{b64}" download="chat_history.csv">Download Chat History</a>'
        st.markdown(href, unsafe_allow_html=True)
|
|
|
|
|
|
|
# Placeholder — the evaluation dashboard is not implemented yet.
with tab2:

    # NOTE(review): print goes to the server console, not the web page;
    # presumably st.write("Evaluation Dashboard") was intended — confirm.
    print("Evaluation Dashboard")
|
|
|
|