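"""DocuChat with Evaluation.

A Streamlit app that lets the user upload PDF/TXT documents or paste raw text,
builds a vector index over the content via brain.get_index_for_documents,
answers questions with streamed OpenAI chat completions grounded in the
retrieved context, and reports evaluation metrics (BLEU, ROUGE-1, BERTScore,
perplexity, diversity) for the latest answer via evaluation_module.RAGEvaluator.
"""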
import streamlit as st
import openai
from openai import OpenAI
from brain import get_index_for_documents
from langchain.chains import RetrievalQA
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv
import os
from evaluation_module import RAGEvaluator

# Set the title for the Streamlit app
st.title("DocuChat with Evaluation")

# Load environment variables before creating the OpenAI client
load_dotenv()  # Load variables from .env (including OPENAI_API_KEY)
openai.api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI()  # Picks up OPENAI_API_KEY from the environment

# Initialize evaluator
evaluator = RAGEvaluator()
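# RAGEvaluator (from evaluation_module) is expected to expose evaluate_all(answer, context)
# returning a dict with the keys read in the display code below:
# 'BLEU', 'ROUGE-1', 'BERT P', 'BERT R', 'BERT F1', 'Perplexity', 'Diversity'.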

# Function to create the vector database from uploaded PDFs and pasted raw text
@st.cache_resource
def create_vectordb(files, filenames, raw_texts):
    with st.spinner("Creating vector database..."):
        # Only PDF uploads are indexed here; `filenames` should correspond to them
        vectordb = get_index_for_documents(
            [file.getvalue() for file in files if file.type == "application/pdf"],
            filenames,
            [line for line in raw_texts.splitlines() if line.strip()],
            openai.api_key,
        )
    return vectordb

# Upload files using Streamlit's file uploader
uploaded_files = st.file_uploader("Upload your documents (PDF or TXT)", type=["pdf", "txt"], accept_multiple_files=True, label_visibility="hidden")

# Text area for raw text input
raw_text = st.text_area("Or enter your raw text here:", height=150)

# If files are uploaded or raw text is provided, create the vectordb and store it in the session state
if uploaded_files or raw_text:
    # Keep the filename list aligned with the PDF files actually passed to the index builder
    pdf_names = [file.name for file in uploaded_files if file.type == "application/pdf"] if uploaded_files else []
    st.session_state["vectordb"] = create_vectordb(uploaded_files or [], pdf_names, raw_text)

# Define the template for the chatbot prompt
prompt_template = """
    You are a helpful assistant who answers users' questions based on multiple contexts given to you.

    Keep your answer short and to the point.

    The evidence is the content of the document extract together with its metadata.

    Pay careful attention to the metadata, especially 'filename' and 'page', whenever you answer.

    Add the filename and page number at the end of every sentence you cite.

    You may also summarise the document extract, but do not hallucinate.

    Reply "Not applicable" if the text is irrelevant.

    The document content is:
    {doc_extract}
"""

# Get the current prompt from the session state or set a default value
prompt = st.session_state.get("prompt", [{"role": "system", "content": "none"}])

# Display previous chat messages
for message in prompt:
    if message["role"] != "system":
        with st.chat_message(message["role"]):
            st.write(message["content"])

# Get the user's question using Streamlit's chat input
question = st.chat_input("Ask anything")

# Handle the user's question
if question:
    vectordb = st.session_state.get("vectordb", None)
    if not vectordb:
        with st.chat_message("assistant"):
            st.write("You need to provide a PDF, TXT file, or raw text.")
            st.stop()

    # Search the vectordb for similar content to the user's question
    search_results = vectordb.similarity_search(question, k=3)
    doc_extract = "\n".join([result.page_content for result in search_results])

    # Update the prompt with the document extract
    prompt[0] = {
        "role": "system",
        "content": prompt_template.format(doc_extract=doc_extract),
    }

    # Add the user's question to the prompt and display it
    prompt.append({"role": "user", "content": question})
    with st.chat_message("user"):
        st.write(question)

    # Display an empty assistant message while waiting for the response
    with st.chat_message("assistant"):
        botmsg = st.empty()

    # Call ChatGPT with streaming and display the response as it comes
    response = []
    result = ""
    for chunk in client.chat.completions.create(
        model="gpt-3.5-turbo", messages=prompt, stream=True
    ):
        text = chunk.choices[0].delta.content
        if text is not None:
            response.append(text)
            result = "".join(response).strip()
            botmsg.write(result)

    # Add the assistant's response to the prompt
    prompt.append({"role": "assistant", "content": result})

    # Store the updated prompt, plus the latest context and answer for evaluation, in the session state
    st.session_state["prompt"] = prompt
    st.session_state["last_doc_extract"] = doc_extract
    st.session_state["last_result"] = result

# Evaluation Section
# This runs at the top level rather than inside `if question:`: clicking the button
# triggers a rerun in which st.chat_input returns None, so code nested under
# `if question:` would never execute on that rerun.
st.write("## Evaluation Results")
if st.button("Evaluate Response"):
    doc_extract = st.session_state.get("last_doc_extract", "")
    result = st.session_state.get("last_result", "")
    if doc_extract and result:
        # Perform evaluation
        metrics = evaluator.evaluate_all(result, doc_extract)

        # Display metrics with explanations
        st.write(f"**BLEU Score**: {metrics['BLEU']:.2f}")
        st.write("BLEU measures n-gram overlap between the generated output and the reference text. Range: 0-100. Higher scores indicate a closer match.")

        st.write(f"**ROUGE-1 Score**: {metrics['ROUGE-1']:.2f}")
        st.write("ROUGE-1 measures unigram overlap between the generated output and the reference text. Range: 0-1. Higher scores indicate a closer match.")

        st.write(f"**BERT Precision**: {metrics['BERT P']:.2f}")
        st.write(f"**BERT Recall**: {metrics['BERT R']:.2f}")
        st.write(f"**BERT F1 Score**: {metrics['BERT F1']:.2f}")
        st.write("BERTScore evaluates semantic similarity between the generated output and the reference text using BERT embeddings. Range: 0-1. Higher scores indicate greater semantic similarity.")

        st.write(f"**Perplexity**: {metrics['Perplexity']:.2f}")
        st.write("Perplexity measures how well a language model predicts the text. Range: 1 to ∞. Lower values indicate better fluency and coherence.")

        st.write(f"**Diversity**: {metrics['Diversity']:.2f}")
        st.write("Diversity measures the uniqueness of bigrams in the generated output. Range: 0-1. Higher values indicate more varied output.")
    else:
        st.info("Ask a question first so there is a response to evaluate.")