Mishra committed on
Commit
e2f6427
1 Parent(s): 2f1e2eb

add streamlit app

Files changed (1)
  1. streamlit_app.py +245 -0
streamlit_app.py ADDED
@@ -0,0 +1,245 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 30 15:33:43 2024

@author: anubhuti.mishra
"""

# !pip install llama-index-vector-stores-pinecone
# pip install pinecone-client
# pip install llama-index-embeddings-azure-openai
# pip install llama-index-llms-azure-openai
# guide: https://docs.pinecone.io/docs/llamaindex
# azure openai + llama-index guide: https://docs.llamaindex.ai/en/stable/examples/customization/llms/AzureOpenAI/
import logging
import sys
import os
import openai
import nest_asyncio
nest_asyncio.apply()
import pandas as pd
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

# Credentials are read from the environment; never hardcode API keys in source.
api_key = os.environ["AZURE_OPENAI_API_KEY"]
azure_endpoint = "https://datafigpt.openai.azure.com/"
api_version = "2023-07-01-preview"

llm = AzureOpenAI(
    model="gpt-35-turbo",
    deployment_name="gpt35turbo",
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
)

# You need to deploy your own embedding model as well as your own chat completion model
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name="text-embedding-ada-002",
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
)
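
# Optional sanity check (a minimal sketch, not part of the pipeline): confirm both
# Azure deployments respond before building the index. complete() and
# get_text_embedding() are standard llama-index methods; the prompt is arbitrary.
# print(llm.complete("Say hello.").text)                # chat deployment
# print(len(embed_model.get_text_embedding("hello")))   # 1536 dims for ada-002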

# PINECONE_API_KEY and OPENAI_API_KEY must be set in the environment before running the app.

# Build a Pinecone index and connect to it

from pinecone import Pinecone
from pinecone import ServerlessSpec
from llama_index.vector_stores.pinecone import PineconeVectorStore

api_key = os.environ["PINECONE_API_KEY"]
pc = Pinecone(api_key=api_key)
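
# The "clm" index was created online in the Pinecone console. A sketch of creating
# it in code instead, using the ServerlessSpec imported above (the dimension matches
# text-embedding-ada-002; cloud and region here are assumptions):
# if "clm" not in pc.list_indexes().names():
#     pc.create_index(
#         name="clm",
#         dimension=1536,
#         metric="cosine",
#         spec=ServerlessSpec(cloud="aws", region="us-east-1"),
#     )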

# dependencies
from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.core.base import llms

from llama_index.core.base.llms.generic_utils import get_from_param_or_env

# workaround: generic_utils is no longer in core.llms, so base.py and utils.py in
# the embeddings library were manually updated to import from core.base.llms
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# alternative non-Azure setup (the key comes from the environment):
# openai.api_key = os.getenv("OPENAI_API_KEY")
# llm = OpenAI(temperature=0, model="gpt-4o")
# embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embed_model

# generate metadata with llama-index extractors
# guide: https://docs.llamaindex.ai/en/stable/examples/metadata_extraction/MetadataExtractionSEC.html
from llama_index.core.schema import MetadataMode
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    BaseExtractor,
)

from llama_index.legacy.extractors.metadata_extractors import EntityExtractor  # moved to legacy modules
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import TransformComponent
from llama_index.core.ingestion import IngestionPipeline
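
# Sketch of how the splitter/extractor imports above fit together at ingestion time.
# Ingestion was run separately, so this stays commented out; the "./docs" path and
# the chunking parameters are assumptions.
# pipeline = IngestionPipeline(
#     transformations=[
#         TokenTextSplitter(chunk_size=512, chunk_overlap=64),
#         TitleExtractor(llm=llm),
#         QuestionsAnsweredExtractor(questions=3, llm=llm),
#         embed_model,
#     ],
#     vector_store=vector_store,  # defined below
# )
# pipeline.run(documents=SimpleDirectoryReader("./docs").load_data())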

# query data from Pinecone
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
    FilterCondition,
)

import numpy as np

# initialize the index - it was created online in the Pinecone console
pinecone_index = pc.Index("clm")

# initialize the vector store
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

# instantiate a VectorStoreIndex from the vector_store object
vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
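
# Quick retrieval smoke test (a sketch; the query string is an arbitrary assumption):
# nodes = vector_index.as_retriever(similarity_top_k=1).retrieve("What is CLM?")
# print(nodes[0].metadata if nodes else "index returned no nodes")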

# RAG pipeline
from llama_index.core.query_engine import RetrieverQueryEngine

import streamlit as st
import base64

from llama_index.core import PromptTemplate

# Build a custom query engine prompt
template = (
    "We have provided context information below. \n"
    """
    You are an algorithm designed to extract information in structured formats from a vector database to answer questions.
    All information provided must be drawn solely from the information contained in the vector database.
    Do not answer questions such as "Which agency is better?" or "Who is doing better at CLM, CDC or USAID?". For such questions, respond with "I am not programmed to answer such questions."
    """
    "--------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Given this information, please answer the question: {query_str}\n"
)

qa_template = PromptTemplate(template)  # somewhat redundant with the kg template, but still necessary to a degree

# Build the retriever and query engine; the retriever returns the top 5 results
# filters = MetadataFilters(filters=[MetadataFilter(key="country", operator=FilterOperator.EQ, value=metadata_filters)])
retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)  # , filters=filters
query_engine = RetrieverQueryEngine(retriever=retriever)

# Replace the default query engine prompt with the custom one
query_engine.update_prompts(
    {"response_synthesizer:summary_template": qa_template}
)
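
# Sketch of wiring the sidebar country filter (metadata_filters, defined in the
# Chatbot tab below) into retrieval, completing the commented-out filter above;
# "all" skips filtering, and the "country" key must match the metadata written at
# ingestion time.
# if metadata_filters != "all":
#     country_filter = MetadataFilters(
#         filters=[MetadataFilter(key="country", operator=FilterOperator.EQ, value=metadata_filters)]
#     )
#     retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5, filters=country_filter)
#     query_engine = RetrieverQueryEngine(retriever=retriever)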

# import ragas for evaluation
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas.metrics.critique import harmfulness
from ragas.evaluation import evaluate

metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    harmfulness,
]

tab1, tab2 = st.tabs(["Chatbot", "Evaluation Dashboard"])

# app = st.container()
# st.set_page_config(page_title="Data.FI CLM Chatbot")
with tab1:
    # page = st.sidebar.selectbox("Choose a page", ["Chatbot", "Evaluation Dashboard"])
    # if page == "Chatbot":

    st.header("Data.FI CLM Chatbot")

    # Initialize chat history
    if 'chat_history' not in st.session_state:
        st.session_state['chat_history'] = []

    # Sidebar for metadata filters
    st.sidebar.title('Metadata Filters')
    options = ['all', 'nigeria', 'uganda', 'lesotho', 'mozambique', 'india', 'indonesia', 'ivory coast', 'nepal', 'south africa', 'philippines']
    metadata_filters = st.sidebar.selectbox('Select a filter', options)

    # User message input and send button
    user_input = st.text_input('Enter your message')
    if st.button('Send'):
        # Concatenate the chat history with the user input so the query carries context
        # (see the commented chat-engine sketch at the end of this tab for an alternative)
        chat_string = ' '.join([message['content'] for message in st.session_state['chat_history']])
        print(chat_string)

        # "You are a helpful assistant and will answer questions as specifically as possible based on the information given to you. If you do not have the information, you will let the user know."
        answer = query_engine.query(user_input + " The user previously asked and received the following: " + chat_string)
        response = answer.response
        sources = answer.source_nodes

        # Update chat history
        st.session_state['chat_history'].append({'role': 'user', 'content': user_input})
        # st.session_state['chat_history'].append({'role': 'metadata_filter', 'content': metadata_filters})
        st.session_state['chat_history'].append({'role': 'chatbot', 'content': response})

        # Display source nodes in the sidebar (at most the top 5 retrieved)
        st.sidebar.write('Source Nodes:')
        for i, node in enumerate(sources[:5]):
            st.sidebar.write(f"**Document {i+1}**: {node.metadata['document_title']}")

    # for message in st.session_state['chat_history']:
    #     if message['role'] == 'user':
    #         eval_questions = [message['content']]
    #     else:
    #         eval_answers = [message['content']]
    # print(eval_questions)
    # print(eval_answers)
    # result = evaluate(eval_questions, metrics, eval_answers, [])
    # data = result.to_pandas()
    # tab1.write(data)

    # Display chat history
    st.write('Chat history:')
    for message in st.session_state['chat_history']:
        st.write(f"**{message['role']}**: {message['content']}")

    # Download chat history button
    if st.button('Download Chat History'):
        df = pd.DataFrame(st.session_state['chat_history'])
        csv = df.to_csv(index=False)
        b64 = base64.b64encode(csv.encode()).decode()
        href = f'<a href="data:file/csv;base64,{b64}" download="chat_history.csv">Download Chat History</a>'
        st.markdown(href, unsafe_allow_html=True)
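
    # Alternative to manual history concatenation (a sketch, untested here):
    # llama-index's condense-question chat engine rewrites each turn into a
    # standalone query over the same index.
    # chat_engine = vector_index.as_chat_engine(chat_mode="condense_question", similarity_top_k=5)
    # answer = chat_engine.chat(user_input)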


with tab2:
    st.header("Evaluation Dashboard")
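    # Sketch of populating this tab with the RAGAS metrics imported above. The
    # Dataset-based evaluate() call and column names follow the ragas docs, but the
    # exact API varies across ragas versions, so treat this as an outline.
    # from datasets import Dataset
    # eval_questions = [m['content'] for m in st.session_state['chat_history'] if m['role'] == 'user']
    # eval_answers = [m['content'] for m in st.session_state['chat_history'] if m['role'] == 'chatbot']
    # ds = Dataset.from_dict({
    #     "question": eval_questions,
    #     "answer": eval_answers,
    #     "contexts": [[] for _ in eval_questions],
    # })
    # result = evaluate(ds, metrics=metrics)
    # st.dataframe(result.to_pandas())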