Zenne committed on
Commit
fe1526d
·
1 Parent(s): 9afeebb

initial commit

Browse files
Files changed (4) hide show
  1. README.md +39 -13
  2. app.py +213 -0
  3. requirements.txt +5 -0
  4. tmp_docs/empty.txt +0 -0
README.md CHANGED
@@ -1,13 +1,39 @@
1
- ---
2
- title: Chatbot For Files Langchain
3
- emoji: ⚡
4
- colorFrom: yellow
5
- colorTo: pink
6
- sdk: streamlit
7
- sdk_version: 1.19.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ This is a chatbot that uses Langchain's Conversational Retrieval Chain to generate responses to user input. The chatbot can ingest files and use Pinecone (Pinecone API key required) or Chroma vector stores (no API key required) to retrieve relevant documents for generating responses. OpenAI's API key is also required. The UI is based on Streamlit.
3
+
4
+ ## Fun fact
5
+ This README file was generated by this app after it ingested its own Python source file (`app.py`). See the screenshot below.
6
+
7
+ ## Installation
8
+
9
+ To install the required packages, run:
10
+
11
+ ```
12
+ pip install -r requirements.txt
13
+ ```
14
+
15
+ ## Usage
16
+
17
+ To run the chatbot, run:
18
+
19
+ ```
20
+ streamlit run app.py
21
+ ```
22
+
23
+ The chatbot will prompt the user for input and generate a response based on the user's question and the chat history.
24
+
25
+ ## Ingesting Files
26
+
27
+ To ingest files, select "Yes" when prompted and upload the files. The chatbot will split the files into smaller documents and ingest them into the vector store.
28
+
29
+ ## Using Pinecone
30
+
31
+ To use Pinecone, select "Yes" when prompted and enter the name of the Pinecone index. Make sure to set the `PINECONE_API_KEY` and `PINECONE_API_ENV` environment variables.
32
+
33
+ ## Using Chroma
34
+
35
+ To use Chroma, enter the name of the Chroma collection when prompted. The chatbot will create a Chroma vector store in the `persist_directory` specified in the code.
36
+
37
+
38
+ ## Screenshot
39
+ ![chat](https://github.com/eliujl/chatbot_for_files_UI/assets/8711788/1353e575-b813-4d93-9e44-ed625002f0ae)
app.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import required libraries
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain.document_loaders import (
4
+ UnstructuredWordDocumentLoader,
5
+ PyMuPDFLoader,
6
+ UnstructuredFileLoader,
7
+ )
8
+ from langchain.embeddings.openai import OpenAIEmbeddings
9
+ from langchain.chat_models import ChatOpenAI
10
+ from langchain.vectorstores import Pinecone, Chroma
11
+ from langchain.chains import ConversationalRetrievalChain
12
+ import os
13
+ import pinecone
14
+ import streamlit as st
15
+ import shutil
16
+
17
# Set up OpenAI API key (from .bashrc, Windows environment variables, .env).
# .get() is used instead of os.environ[...] so that a missing variable yields
# an empty string rather than a KeyError at import time; the key is asked for
# again in the UI further down.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')

# Set up Pinecone env. Pinecone is optional: the UI falls back to Chroma when
# PINECONE_API_KEY is empty, so the client is only initialised when a key is
# actually configured.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '')
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', '')
if PINECONE_API_KEY:
    pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

# Defaults read by the page script below; the widgets overwrite them.
pinecone_index_name = ''
chroma_collection_name = ''
persist_directory = ''
chat_history = []
docsearch_ready = False
# Directory that uploaded files are written to before being loaded.
directory_name = 'tmp_docs'
31
+
32
+
33
def save_file(files):
    """Replace the contents of ``directory_name`` with the uploaded files.

    Every regular file already in the directory is removed first, then each
    upload is written under its own base name.

    Args:
        files: Iterable of binary file-like objects exposing a ``name``
            attribute, or ``None`` to only clear the directory.
    """
    # Create the directory on first use instead of failing in open() later.
    os.makedirs(directory_name, exist_ok=True)

    # Remove existing files in the directory (best effort: a file that cannot
    # be deleted is reported and skipped).
    for stale_name in os.listdir(directory_name):
        stale_path = os.path.join(directory_name, stale_name)
        try:
            if os.path.isfile(stale_path):
                os.remove(stale_path)
        except OSError as e:
            print(f"Error: {e}")

    if files is None:
        return

    # Save the new files with their original file names.
    for file in files:
        # Keep only the final path component so a crafted name such as
        # "../../x" or "C:\\dir\\x" cannot write outside directory_name.
        safe_name = os.path.basename(file.name.replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            continue
        file_path = os.path.join(directory_name, safe_name)
        with open(file_path, 'wb') as f:
            shutil.copyfileobj(file, f)
50
+
51
+
52
def load_files():
    """Load every file in ``directory_name`` and split it into chunks.

    ``.docx`` files go through ``UnstructuredWordDocumentLoader``, ``.pdf``
    files through ``PyMuPDFLoader``, and anything else is handed to
    ``UnstructuredFileLoader``. The loaded documents are split with a
    ``RecursiveCharacterTextSplitter`` (chunk size 400, overlap 50) and a
    one-line summary is written to the Streamlit page.

    Returns:
        A tuple ``(all_texts, n_texts)``: the list of split documents and
        its length.
    """
    all_texts = []
    n_files = 0
    n_char = 0
    n_texts = 0

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400, chunk_overlap=50
    )
    for filename in os.listdir(directory_name):
        file = os.path.join(directory_name, filename)
        if not os.path.isfile(file):
            continue
        # Compare extensions case-insensitively so "REPORT.PDF" is still
        # routed to the PDF loader.
        lowered = filename.lower()
        if lowered.endswith(".docx"):
            loader = UnstructuredWordDocumentLoader(file)
        elif lowered.endswith(".pdf"):
            loader = PyMuPDFLoader(file)
        else:  # assume a pure text format and attempt to load it
            loader = UnstructuredFileLoader(file)
        data = loader.load()
        texts = text_splitter.split_documents(data)
        n_files += 1
        # A loader may return several documents (or none); count them all
        # rather than indexing data[0].
        n_char += sum(len(doc.page_content) for doc in data)
        n_texts += len(texts)
        all_texts.extend(texts)
    st.write(
        f"Loaded {n_files} file(s) with {n_char} characters, and split into {n_texts} split-documents."
    )
    return all_texts, n_texts
81
+
82
+
83
def ingest(all_texts, use_pinecone, embeddings, pinecone_index_name, chroma_collection_name, persist_directory):
    """Embed the split documents and store them in the chosen vector store.

    Args:
        all_texts: Split documents as returned by ``load_files``.
        use_pinecone: When true, write to the Pinecone index; otherwise to a
            Chroma collection.
        embeddings: Embedding object passed through to the vector store.
        pinecone_index_name: Target index name (Pinecone branch only).
        chroma_collection_name: Target collection name (Chroma branch only).
        persist_directory: Directory passed to Chroma as its
            ``persist_directory`` (Chroma branch only).

    Returns:
        The vector store object created by the selected backend.
    """
    if use_pinecone:
        # from_documents (rather than from_texts on the bare page_content)
        # keeps each document's metadata, which the UI shows next to every
        # source; this also matches the Chroma branch below.
        docsearch = Pinecone.from_documents(
            all_texts, embeddings, index_name=pinecone_index_name)  # add namespace=pinecone_namespace if provided
    else:
        docsearch = Chroma.from_documents(
            all_texts, embeddings, collection_name=chroma_collection_name, persist_directory=persist_directory)
    return docsearch
91
+
92
+
93
def setup_retriever(docsearch, k):
    """Return a similarity retriever over ``docsearch`` that fetches ``k`` results.

    Args:
        docsearch: Object exposing ``as_retriever``.
        k: Value passed as ``search_kwargs["k"]``.
    """
    return docsearch.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
        include_metadata=True,
    )
97
+
98
+
99
def setup_docsearch(use_pinecone, pinecone_index_name, embeddings, chroma_collection_name, persist_directory):
    """Open an already-populated vector store without ingesting anything.

    Args:
        use_pinecone: When true, open the Pinecone index; otherwise open the
            Chroma collection.
        pinecone_index_name: Name of the existing Pinecone index.
        embeddings: Embedding object passed through to the vector store.
        chroma_collection_name: Name of the Chroma collection.
        persist_directory: Directory passed to Chroma as its
            ``persist_directory``.

    Returns:
        A tuple ``(docsearch, n_texts)``: the vector store and the number of
        stored vectors/documents reported by the backend.

    Raises:
        ValueError: If ``use_pinecone`` is true and the index name is not in
            ``pinecone.list_indexes()``.
    """
    if use_pinecone:
        # Load the pre-created Pinecone index, i.e. the index that has
        # already been stored in pinecone.io as long-term memory.
        if pinecone_index_name not in pinecone.list_indexes():
            raise ValueError('''Cannot find the specified Pinecone index.
        Create one in pinecone.io or using, e.g.,
        pinecone.create_index(
        name=index_name, dimension=1536, metric="cosine", shards=1)''')
        docsearch = Pinecone.from_existing_index(
            pinecone_index_name, embeddings)  # add namespace=pinecone_namespace if provided
        index_client = pinecone.Index(pinecone_index_name)
        # Get the index information
        index_info = index_client.describe_index_stats()
        namespace_name = ''
        namespaces = index_info['namespaces']
        # The namespace entry may be missing from the stats (e.g. nothing
        # stored under it yet); report 0 instead of raising KeyError.
        if namespace_name in namespaces:
            n_texts = namespaces[namespace_name]['vector_count']
        else:
            n_texts = 0
    else:
        docsearch = Chroma(persist_directory=persist_directory, embedding_function=embeddings,
                           collection_name=chroma_collection_name)
        # Ask the collection itself for its size rather than calling the
        # client's private _count() helper.
        n_texts = docsearch._collection.count()
    return docsearch, n_texts
124
+
125
+
126
def get_response(query, chat_history):
    """Run the module-level ``CRqa`` chain for one question.

    Args:
        query: The question, passed to the chain under ``"question"``.
        chat_history: Passed to the chain under ``"chat_history"``.

    Returns:
        A tuple of the chain output's ``'answer'`` and ``'source_documents'``.
    """
    chain_inputs = {"question": query, "chat_history": chat_history}
    chain_output = CRqa(chain_inputs)
    answer = chain_output['answer']
    sources = chain_output['source_documents']
    return answer, sources
129
+
130
+
131
def setup_em_llm(OPENAI_API_KEY):
    """Create the OpenAI embeddings object and the chat model.

    Args:
        OPENAI_API_KEY: API key handed to both objects.

    Returns:
        A tuple ``(embeddings, llm)``.
    """
    # Use the OpenAI chat model gpt-3.5-turbo with streaming enabled.
    # Temperature 0: set it to 0 if you do not want it to make up things.
    chat_model = ChatOpenAI(
        temperature=0,
        model_name="gpt-3.5-turbo",
        streaming=True,
        openai_api_key=OPENAI_API_KEY,
    )
    # OpenAI embeddings built with the same key.
    embedder = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
    return embedder, chat_model
139
+
140
+
141
# Streamlit page script.
#
# Streamlit re-executes this script from the top on every widget interaction,
# so anything that has to survive between interactions (the chat history and
# the result of an ingestion) is kept in st.session_state, not in module
# globals.

# Get user input of whether to use Pinecone or not
col1, col2, col3, col4 = st.columns([1, 1, 1, 1])
# create the radio buttons and text input fields
with col1:
    r_pinecone = st.radio('Do you want to use Pinecone index?', ('Yes', 'No'))
with col2:
    r_ingest = st.radio(
        'Do you want to ingest the file(s)?', ('Yes', 'No'))
with col3:
    OPENAI_API_KEY = st.text_input(
        "Enter your OpenAI API key and press Enter", type="password")
with col4:
    # Nothing below is asked for until an OpenAI key is present, which also
    # guarantees `embeddings` and `llm` exist whenever a store name is set.
    if OPENAI_API_KEY:
        embeddings, llm = setup_em_llm(OPENAI_API_KEY)
        if r_pinecone.lower() == 'yes' and PINECONE_API_KEY != '':
            use_pinecone = True
            pinecone_index_name = st.text_input('Enter your Pinecone index')
        else:
            use_pinecone = False
            chroma_collection_name = st.text_input(
                '''Not using Pinecone or empty Pinecone API key provided.
                Using Chroma. Enter Chroma collection name of 3-63 characters:''')
            persist_directory = "./vectorstore"

if pinecone_index_name or chroma_collection_name:
    if r_ingest.lower() == 'yes':
        files = st.file_uploader('Upload Files', accept_multiple_files=True)
        if files:
            # Fingerprint of "these files into this store". The uploader keeps
            # returning the same files on every rerun (e.g. each new
            # question); without this check they would be embedded and
            # written to the vector store again each time.
            ingest_key = (
                use_pinecone,
                pinecone_index_name,
                chroma_collection_name,
                tuple((f.name, getattr(f, 'size', None)) for f in files),
            )
            if st.session_state.get('ingest_key') != ingest_key:
                save_file(files)
                all_texts, n_texts = load_files()
                docsearch = ingest(all_texts, use_pinecone, embeddings, pinecone_index_name,
                                   chroma_collection_name, persist_directory)
                st.session_state['ingested_docsearch'] = docsearch
                st.session_state['ingested_n_texts'] = n_texts
                # Stored last so a failed ingestion is retried on the next run.
                st.session_state['ingest_key'] = ingest_key
            docsearch = st.session_state['ingested_docsearch']
            n_texts = st.session_state['ingested_n_texts']
            docsearch_ready = True
    else:
        st.write(
            'No data is to be ingested. Make sure the Pinecone index or Chroma collection name you provided contains data.')
        docsearch, n_texts = setup_docsearch(use_pinecone, pinecone_index_name,
                                             embeddings, chroma_collection_name, persist_directory)
        docsearch_ready = True

if docsearch_ready:
    # number of sources (split-documents when ingesting files); default is 4
    k = min([20, n_texts])
    retriever = setup_retriever(docsearch, k)
    # CRqa is read as a module global by get_response().
    CRqa = ConversationalRetrievalChain.from_llm(
        llm, retriever=retriever, return_source_documents=True)

    st.title('Chatbot')
    # The history lives in session state as (question, answer) pairs, which
    # is the pair order the chain expects for tuple-style history.
    chat_history = st.session_state.setdefault('chat_history', [])
    # Get user input
    query = st.text_input('Enter your question; enter "exit" to exit')
    if query:
        last_exchange = st.session_state.get('last_exchange')
        if last_exchange is not None and last_exchange[0] == query:
            # Rerun caused by some other widget: the question was already
            # answered, so reuse the stored answer instead of asking again
            # and duplicating the history entry.
            _, reply, source = last_exchange
        else:
            # Generate a reply based on the user input and chat history
            reply, source = get_response(query, chat_history)
            # Update the chat history with the user input and system response
            chat_history.append((query, reply))
            st.session_state['last_exchange'] = (query, reply, source)
        chat_history_str = '\n'.join(
            f'User: {question}\nBot: {answer}' for question, answer in chat_history)
        st.text_area('Chat record:', value=chat_history_str, height=250)
        # Display sources (only the first two, each truncated to 400 chars)
        for i, source_i in enumerate(source[:2]):
            page_content = source_i.page_content[:400]
            if source_i.metadata:
                # Not every loader/vector store sets a 'source' key.
                metadata_source = source_i.metadata.get('source', 'unknown')
                st.write(
                    f"**_Source {i+1}:_** {metadata_source}: {page_content}")
                st.write(source_i.metadata)
            else:
                st.write(f"**_Source {i+1}:_** {page_content}")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ langchain
2
+ openai
3
+ streamlit
4
+ pinecone-client
5
+ chromadb
tmp_docs/empty.txt ADDED
File without changes