eagle0504 commited on
Commit
f560388
1 Parent(s): 02148f0
Files changed (3) hide show
  1. app.py +116 -0
  2. helper/utils.py +198 -0
  3. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Document Search — Streamlit chatbot that answers questions about
uploaded text/PDF documents via embedding-based retrieval (helper.utils)."""

import os

import openai
import PyPDF2
import streamlit as st
from openai import OpenAI

from helper.utils import *

st.set_page_config(layout="centered", page_title="Document Search🤖📖")
st.header("Document Search🤖📖")
st.write("---")


# Streamlit sidebar setup for user interface
with st.sidebar:
    # Create an expandable instruction manual section in the sidebar
    with st.expander("Instruction Manual 📖"):
        # Display the instruction manual for the Document Data Chatbot
        st.markdown(
            """
# Document Data Chatbot User Manual 🤖💊

Welcome to the Document Data Chatbot, your interactive assistant for information on the textual "Document Data". This chatbot offers quick and accurate responses to your queries. Follow these steps to interact with the chatbot:

## Getting Started 🚀
1. **Access the Chatbot**: Launch the Document Data Chatbot on your device.
2. **Start Chatting**: Type your Document Data-related questions in the chat window. Questions can range from dosage to side effects.
3. **Send Your Question**: Submit your query by clicking 'Send' or pressing 'Enter'.

## Chatting with Document Data Chatbot 🤔💬
- **Ask Anything**: Inquiries about textual composition, usage, storage, or safety are all welcome.
- **Use Simple Language**: Clear and concise questions yield the best results.
- **Wait for the Response**: The chatbot will promptly process and answer your query.
- **Follow-Up Questions**: Feel free to ask additional or new questions anytime.

## Tips for a Better Experience ✨
- **Be Specific**: Specific questions help in getting precise answers.
- **Check for Typing Errors**: Correct spelling ensures better understanding by the chatbot.
- **Emoji Use**: Emojis are welcome in your questions!
- **Patience is Key**: Responses may take a moment as the chatbot processes your query.

## Support and Feedback 🤝
- **Need Help?**: Contact our support team for any issues.
- **Share Your Feedback**: Your input is valuable and helps us improve.

## The Team Behind the App 🧑‍💻👩‍💻
- **Founders**: Learn about [Peter Yin](https://www.linkedin.com/in/peter-yin-7914ba25/) and [Yiqiao Yin](https://www.linkedin.com/in/yiqiaoyin/), the founders, on LinkedIn.

Thank you for choosing the Document Data Chatbot. We're here to provide all the information you need about Document Data efficiently. Happy chatting! 🎉💬
"""
        )

    # File uploader widget allowing users to upload text and PDF documents
    uploaded_files = st.file_uploader(
        "Upload documents", accept_multiple_files=True, type=["txt", "pdf"]
    )

    # Clear button
    clear_button = st.sidebar.button("Clear Conversation", key="clear")


# Initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []


# Reset everything
if clear_button:
    st.session_state.messages = []


# Display chat messages from history on app rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])


# BUG FIX: with accept_multiple_files=True, st.file_uploader returns a
# (possibly empty) list, never None, so the original `uploaded_files is None`
# branch was unreachable and the upload hint never appeared. Falsiness
# covers both None and the empty list.
if not uploaded_files:
    # Display a message prompting the user to upload files
    st.info("Upload files to analyze")

else:
    # Inform the user how many documents have been loaded
    st.sidebar.write(f"{len(uploaded_files)} document(s) loaded..")

    # Process the uploaded files into per-page texts and source labels
    documents, sources = read_and_textify(uploaded_files)

    # Embed every page; this acts as the search index for this rerun
    query_database = list_to_nums(documents)

    # React to user input
    if prompt := st.chat_input("What is up?"):
        # Display user message in chat message container
        st.chat_message("user").markdown(prompt)
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})

        # BUG FIX: build the reference table from the user's actual question.
        # The original scored documents against the hard-coded placeholder
        # "pful for understanding federal income" and ignored the prompt.
        result = query_search(prompt, documents, query_database, sources)

        # Display assistant response in chat message container
        with st.chat_message("assistant"):
            st.table(result)
        # Add assistant response to chat history
        st.session_state.messages.append({"role": "assistant", "content": result})
helper/utils.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, Dict, List, Tuple, Union
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+ import PyPDF2
7
+ from openai import OpenAI
8
+
9
+
10
def read_and_textify(
    files: List[Any],
) -> Tuple[List[str], List[str]]:
    """
    Reads PDF files and extracts text from each page.

    This function iterates over a list of uploaded PDF files, extracts text
    from each page, and compiles a list of texts and corresponding source
    information.

    Args:
        files (List[Any]): Uploaded PDF files (Streamlit UploadedFile objects
            or any file-like objects PyPDF2 can read).

    Returns:
        Tuple[List[str], List[str]]: A tuple containing two lists:
            1. The text extracted from each PDF page.
            2. The source of each text ("<file name>_page_<page number>").
    """

    # Initialize lists to store extracted texts and their sources
    text_list: List[str] = []  # List to store extracted text
    sources_list: List[str] = []  # List to store source information

    # Iterate over each file
    for file in files:
        pdf_reader = PyPDF2.PdfReader(file)  # Create a PDF reader object
        # Iterate over each page in the PDF
        for i, page in enumerate(pdf_reader.pages):
            text = page.extract_text()  # Extract text from the page
            page.clear()  # Clear the parsed page content (memory management)
            text_list.append(text)  # Add extracted text to the list
            # Create a source identifier and add it to the list
            sources_list.append(file.name + "_page_" + str(i))

    # BUG FIX: return an actual tuple as the annotation promises (the
    # original returned a two-element list). Unpacking callers see no change.
    return text_list, sources_list
46
+
47
+
48
# Shared OpenAI client used by the embedding helpers in this module.
# BUG FIX: ``os.environ`` is a mapping, not a callable — the original
# ``os.environ("OPENAI_API_KEY")`` raised TypeError at import time.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
49
+
50
+
51
def list_to_nums(sentences: List[str]) -> List[List[float]]:
    """
    Embed each sentence with OpenAI's ``text-embedding-3-small`` model.

    Args:
        - sentences (List[str]): A list of sentences (strings).

    Returns:
        - List[List[float]]: One numerical embedding vector per input
          sentence, in input order.
    """
    # One API call per sentence; the comprehension preserves input order.
    return [
        client.embeddings.create(input=sentence, model="text-embedding-3-small")
        .data[0]
        .embedding
        for sentence in sentences
    ]
76
+
77
+
78
def quantize_to_kbit(arr: Union[np.ndarray, Any], k: int = 16) -> np.ndarray:
    """Converts an array to a k-bit representation by normalizing and scaling its values.

    Args:
        arr (Union[np.ndarray, Any]): The input array (or anything
            convertible to a numpy array) to be quantized.
        k (int): The number of levels to quantize to. Defaults to 16 for
            4-bit quantization.
    Returns:
        np.ndarray: The quantized integer array with values scaled to 0 to k-1.
    """
    if not isinstance(arr, np.ndarray):  # Check if input is not a numpy array
        arr = np.array(arr)  # Convert input to a numpy array
    arr_min = arr.min()  # Calculate the minimum value in the array
    arr_max = arr.max()  # Calculate the maximum value in the array
    span = arr_max - arr_min
    # BUG FIX: a constant array made the original divide by zero and emit
    # NaNs (then cast them to int); map every element to level 0 instead.
    if span == 0:
        return np.zeros(arr.shape, dtype=int)
    normalized_arr = (arr - arr_min) / span  # Normalize array values to [0, 1]
    # Scale normalized values to 0-(k-1) and convert to integer
    return np.round(normalized_arr * (k - 1)).astype(int)
97
+
98
+
99
def quantized_influence(
    arr1: np.ndarray, arr2: np.ndarray, k: int = 16, use_dagger: bool = False
) -> Union[float, Tuple[float, List[float]]]:
    """
    Calculates a weighted measure of influence based on quantized versions of
    the input arrays and optionally applies a transformation.

    Both inputs are quantized to ``k`` levels; for every level present in
    ``arr1``, the squared deviation of the matching ``arr2`` group mean from
    the global mean is weighted by the squared group size and accumulated.

    Args:
        arr1 (np.ndarray): First input array to be quantized and analyzed.
        arr2 (np.ndarray): Second input array to be quantized and used for influence measurement.
        k (int): The quantization level, defaults to 16 for 4-bit quantization.
        use_dagger (bool): Flag to apply a transformation based on local averages, defaults to False.
    Returns:
        Union[float, Tuple[float, List[float]]]: The quantized influence
        measure alone when ``use_dagger`` is False, or the measure plus the
        per-element local estimates when ``use_dagger`` is True.
        NOTE(review): the original annotation claimed a tuple in both cases;
        the bare-float return for ``use_dagger=False`` is preserved because
        callers (e.g. query_search) consume it as a scalar score.
    """
    # Quantize both arrays to k levels
    arr1_quantized = quantize_to_kbit(arr1, k)
    arr2_quantized = quantize_to_kbit(arr2, k)

    # Find unique quantized values in arr1 (the grouping keys)
    unique_values = np.unique(arr1_quantized)

    # Compute the global average of quantized arr2
    total_samples = len(arr2_quantized)
    y_bar_global = np.mean(arr2_quantized)

    # Squared deviation of each group mean, weighted by squared group size
    weighted_local_averages = [
        (np.mean(arr2_quantized[arr1_quantized == val]) - y_bar_global) ** 2
        * len(arr2_quantized[arr1_quantized == val]) ** 2
        for val in unique_values
    ]
    qim = np.sum(weighted_local_averages) / (
        total_samples * np.std(arr2_quantized)
    )  # Calculate the quantized influence measure

    if not use_dagger:
        # CLEANUP: the original also built an unused ``daggered_values``
        # list on this path; return the plain score callers rely on.
        return qim

    # Compute local estimates and map them to unique quantized values
    local_estimates = [
        np.mean(arr2_quantized[arr1_quantized == val]) for val in unique_values
    ]
    daggers = {
        unique_values[i]: v for i, v in enumerate(local_estimates)
    }  # Map unique values to local estimates

    # Transform every arr1 element into its group's local estimate
    daggered_values = [daggers[val] for val in arr1_quantized]
    return qim, daggered_values
154
+
155
+
156
def query_search(
    prompt: str,
    sentences: list[str],
    query_database: list[list[float]],
    sources: list[str],
) -> pd.DataFrame:
    """
    Takes a text prompt and searches a predefined database by converting the
    prompt and database entries to embeddings, and then calculating a
    quantized influence metric.

    Args:
        - prompt (str): A text prompt to search for in the database.
        - sentences (list[str]): The original sentences in the database.
        - query_database (list[list[float]]): Embedding of each sentence,
          parallel to ``sentences``.
        - sources (list[str]): Source label of each sentence, parallel to
          ``sentences``.

    Returns:
        - pd.DataFrame: A pandas DataFrame sorted by the quantized influence
          metric in descending order. The DataFrame contains the original
          sentences, their embeddings, their sources, and the computed scores.
    """
    # Embed the prompt once; [0] unwraps the single-row embedding result
    prompt_embedding = list_to_nums([prompt])[0]

    # Score every stored sentence against the prompt embedding
    rows = []
    for sentence, embedding, source in zip(sentences, query_database, sources):
        score = quantized_influence(
            prompt_embedding, embedding, k=3, use_dagger=False
        )
        rows.append([sentence, embedding, source, score])

    # Assemble the reference table and rank it best match first
    refs = pd.DataFrame(
        rows, columns=["sentences", "query_embeddings", "page no", "qim"]
    )
    return refs.sort_values(by="qim", ascending=False)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ streamlit
2
+ PyPDF2
3
+ openai