Anne31415 committed on
Commit
d0ba0ce
1 Parent(s): bc8fa07

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -83
app.py CHANGED
@@ -1,13 +1,8 @@
1
- import os
2
- from huggingface_hub import Repository
3
- import streamlit.components.v1 as components
4
- from datasets import load_dataset
5
- import random
6
  import pickle
7
- from nltk.tokenize import sent_tokenize
8
- import nltk
9
  from PyPDF2 import PdfReader
10
- import streamlit as st
11
  from streamlit_extras.add_vertical_space import add_vertical_space
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
13
  from langchain.embeddings.openai import OpenAIEmbeddings
@@ -15,9 +10,7 @@ from langchain.vectorstores import FAISS
15
  from langchain.llms import OpenAI
16
  from langchain.chains.question_answering import load_qa_chain
17
  from langchain.callbacks import get_openai_callback
18
- from my_component import my_component
19
-
20
- nltk.download('punkt')
21
 
22
  # Step 1: Clone the Dataset Repository
23
  repo = Repository(
@@ -34,39 +27,49 @@ repo.git_pull() # Pull the latest changes (if any)
34
  pdf_file_path = "Private_Book/Glossar_PDF_webscraping.pdf" # Replace with your PDF file path
35
 
36
 
 
37
  # Sidebar contents
38
  with st.sidebar:
39
- st.title(':orange_book: BinDoc GmbH')
40
-
41
-
42
- api_key = os.getenv("OPENAI_API_KEY")
43
- # Retrieve the API key from st.secrets
44
-
45
-
46
- if not api_key:
47
- st.warning('API key is required to proceed.')
48
- st.stop() # Stop the app if the API key is not provided
49
 
50
- st.markdown("Experience the future of document interaction with the revolutionary")
51
  st.markdown("**BinDocs Chat App**.")
 
 
52
  st.markdown("Harnessing the power of a Large Language Model and AI technology,")
 
 
 
53
  st.markdown("this innovative platform redefines PDF engagement,")
 
54
  st.markdown("enabling dynamic conversations that bridge the gap between")
55
  st.markdown("human and machine intelligence.")
56
 
 
 
57
  add_vertical_space(3) # Add more vertical space between text blocks
58
- st.write('Made with ❤️ by BinDoc GmbH')
 
 
 
59
 
60
  def load_pdf(file_path):
61
  pdf_reader = PdfReader(file_path)
62
- chunks = []
63
  for page in pdf_reader.pages:
64
- text = page.extract_text()
65
- if text:
66
- chunks.append(text)
67
-
68
- store_name = os.path.basename(file_path)[:-4]
69
-
 
 
 
 
 
70
  if os.path.exists(f"{store_name}.pkl"):
71
  with open(f"{store_name}.pkl", "rb") as f:
72
  VectorStore = pickle.load(f)
@@ -79,86 +82,78 @@ def load_pdf(file_path):
79
  return VectorStore
80
 
81
 
82
- def load_chatbot(max_tokens=300):
83
- return load_qa_chain(llm=OpenAI(temperature=0.1, max_tokens=max_tokens), chain_type="stuff")
84
-
85
-
86
- def display_chat_history(chat_history):
87
- for chat in chat_history:
88
- background_color = "#FFA07A" if chat[2] == "new" else "#acf" if chat[0] == "User" else "#caf"
89
- st.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{chat[0]}: {chat[1]}</div>", unsafe_allow_html=True)
90
-
91
- def remove_incomplete_sentences(text):
92
- sentences = sent_tokenize(text)
93
- complete_sentences = [sent for sent in sentences if sent.endswith(('.', '!', '?'))]
94
- return ' '.join(complete_sentences)
95
-
96
- def remove_redundant_information(text):
97
- sentences = sent_tokenize(text)
98
- unique_sentences = list(set(sentences))
99
- return ' '.join(unique_sentences)
100
-
101
- # Define a maximum token limit to avoid infinite loops
102
- MAX_TOKEN_LIMIT = 400
103
-
104
- import random
105
 
 
 
106
 
107
  def main():
108
  st.title("BinDocs Chat App")
109
 
 
 
 
 
 
 
 
110
  if "chat_history" not in st.session_state:
111
  st.session_state['chat_history'] = []
112
 
113
  display_chat_history(st.session_state['chat_history'])
114
 
 
 
 
 
115
  new_messages_placeholder = st.empty()
116
 
117
- query = st.text_input("Ask questions about your PDF file (in any preferred language):")
 
118
 
119
- if st.button("Was genau ist ein Belegarzt?"):
120
- query = "Was genau ist ein Belegarzt?"
121
- if st.button("Wofür wird die Alpha-ID verwendet?"):
122
- query = "Wofür wird die Alpha-ID verwendet?"
123
- if st.button("Was sind die Vorteile des ambulanten operierens?"):
124
- query = "Was sind die Vorteile des ambulanten operierens?"
 
 
 
125
 
126
- if query:
127
- st.session_state['last_input'] = query
128
- st.session_state['chat_history'].append(("User", query, "new"))
129
 
130
- loading_message = st.empty()
131
- loading_message.text('Bot is thinking...')
132
-
133
- VectorStore = load_pdf(pdf_file_path)
134
- max_tokens = 120
135
- chain = load_chatbot(max_tokens=max_tokens)
136
- docs = VectorStore.similarity_search(query=query, k=2)
137
-
138
- with get_openai_callback() as cb:
139
- response = chain.run(input_documents=docs, question=query)
140
 
141
- # Post-processing to remove incomplete sentences and redundant information
142
- filtered_response = remove_incomplete_sentences(response)
143
- filtered_response = remove_redundant_information(filtered_response)
144
-
145
- st.session_state['chat_history'].append(("Bot", filtered_response, "new"))
146
 
 
147
  new_messages = st.session_state['chat_history'][-2:]
148
  for chat in new_messages:
149
  background_color = "#FFA07A" if chat[2] == "new" else "#acf" if chat[0] == "User" else "#caf"
150
  new_messages_placeholder.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{chat[0]}: {chat[1]}</div>", unsafe_allow_html=True)
151
 
 
152
  st.write("<script>document.getElementById('response').scrollIntoView();</script>", unsafe_allow_html=True)
153
 
154
  loading_message.empty()
155
 
 
156
  query = ""
157
- else:
158
- st.warning("Please enter a query before asking questions.")
159
 
160
- st.session_state['chat_history'] = [(sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']]
161
-
 
 
 
 
 
 
 
162
 
163
  if __name__ == "__main__":
164
- main()
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
 
 
 
3
  import pickle
4
+ from huggingface_hub import Repository
 
5
  from PyPDF2 import PdfReader
 
6
  from streamlit_extras.add_vertical_space import add_vertical_space
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain.embeddings.openai import OpenAIEmbeddings
 
10
  from langchain.llms import OpenAI
11
  from langchain.chains.question_answering import load_qa_chain
12
  from langchain.callbacks import get_openai_callback
13
+ import os
 
 
14
 
15
  # Step 1: Clone the Dataset Repository
16
  repo = Repository(
 
27
  pdf_file_path = "Private_Book/Glossar_PDF_webscraping.pdf" # Replace with your PDF file path
28
 
29
 
30
+
31
  # Sidebar contents
32
  with st.sidebar:
33
+ st.title(':orange[BinDoc GmbH]')
34
+ st.markdown(
35
+ "Experience the future of document interaction with the revolutionary"
36
+ )
 
 
 
 
 
 
37
 
 
38
  st.markdown("**BinDocs Chat App**.")
39
+
40
+
41
  st.markdown("Harnessing the power of a Large Language Model and AI technology,")
42
+
43
+
44
+
45
  st.markdown("this innovative platform redefines PDF engagement,")
46
+
47
  st.markdown("enabling dynamic conversations that bridge the gap between")
48
  st.markdown("human and machine intelligence.")
49
 
50
+
51
+
52
  add_vertical_space(3) # Add more vertical space between text blocks
53
+ st.write('Made with ❤️ by Anne')
54
+
55
+ api_key = os.getenv("OPENAI_API_KEY")
56
+ # Retrieve the API key from st.secrets
57
 
58
  def load_pdf(file_path):
59
  pdf_reader = PdfReader(file_path)
60
+ text = ""
61
  for page in pdf_reader.pages:
62
+ text += page.extract_text()
63
+
64
+ text_splitter = RecursiveCharacterTextSplitter(
65
+ chunk_size=1000,
66
+ chunk_overlap=200,
67
+ length_function=len
68
+ )
69
+ chunks = text_splitter.split_text(text=text)
70
+
71
+ store_name, _ = os.path.splitext(os.path.basename(file_path))
72
+
73
  if os.path.exists(f"{store_name}.pkl"):
74
  with open(f"{store_name}.pkl", "rb") as f:
75
  VectorStore = pickle.load(f)
 
82
  return VectorStore
83
 
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ def load_chatbot():
87
+ return load_qa_chain(llm=OpenAI(), chain_type="stuff")
88
 
89
  def main():
90
  st.title("BinDocs Chat App")
91
 
92
+
93
+ # Directly specifying the path to the PDF file
94
+ pdf_path = pdf_file_path
95
+ if not os.path.exists(pdf_path):
96
+ st.error("File not found. Please check the file path.")
97
+ return
98
+
99
  if "chat_history" not in st.session_state:
100
  st.session_state['chat_history'] = []
101
 
102
  display_chat_history(st.session_state['chat_history'])
103
 
104
+ st.write("<!-- Start Spacer -->", unsafe_allow_html=True)
105
+ st.write("<div style='flex: 1;'></div>", unsafe_allow_html=True)
106
+ st.write("<!-- End Spacer -->", unsafe_allow_html=True)
107
+
108
  new_messages_placeholder = st.empty()
109
 
110
+ if pdf_path is not None:
111
+ query = st.text_input("Ask questions about your PDF file (in any preferred language):")
112
 
113
+ if st.button("Was genau ist ein Belegarzt?"):
114
+ query = "Was genau ist ein Belegarzt?"
115
+ if st.button("Wofür wird die Alpha-ID verwendet?"):
116
+ query = "Wofür wird die Alpha-ID verwendet?"
117
+ if st.button("Was sind die Vorteile des ambulanten operierens?"):
118
+ query = "Was sind die Vorteile des ambulanten operierens?"
119
+
120
+ if st.button("Ask") or (not st.session_state['chat_history'] and query) or (st.session_state['chat_history'] and query != st.session_state['chat_history'][-1][1]):
121
+ st.session_state['chat_history'].append(("User", query, "new"))
122
 
123
+ loading_message = st.empty()
124
+ loading_message.text('Bot is thinking...')
 
125
 
126
+ VectorStore = load_pdf(pdf_path)
127
+ chain = load_chatbot()
128
+ docs = VectorStore.similarity_search(query=query, k=3)
129
+ with get_openai_callback() as cb:
130
+ response = chain.run(input_documents=docs, question=query)
 
 
 
 
 
131
 
132
+ st.session_state['chat_history'].append(("Bot", response, "new"))
 
 
 
 
133
 
134
+ # Display new messages at the bottom
135
  new_messages = st.session_state['chat_history'][-2:]
136
  for chat in new_messages:
137
  background_color = "#FFA07A" if chat[2] == "new" else "#acf" if chat[0] == "User" else "#caf"
138
  new_messages_placeholder.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{chat[0]}: {chat[1]}</div>", unsafe_allow_html=True)
139
 
140
+ # Scroll to the latest response using JavaScript
141
  st.write("<script>document.getElementById('response').scrollIntoView();</script>", unsafe_allow_html=True)
142
 
143
  loading_message.empty()
144
 
145
+ # Clear the input field by setting the query variable to an empty string
146
  query = ""
 
 
147
 
148
+ # Mark all messages as old after displaying
149
+ st.session_state['chat_history'] = [(sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']]
150
+
151
+
152
+
153
+ def display_chat_history(chat_history):
154
+ for chat in chat_history:
155
+ background_color = "#FFA07A" if chat[2] == "new" else "#acf" if chat[0] == "User" else "#caf"
156
+ st.markdown(f"<div style='background-color: {background_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{chat[0]}: {chat[1]}</div>", unsafe_allow_html=True)
157
 
158
  if __name__ == "__main__":
159
+ main()