xangma committed
Commit df62f91 • 1 Parent(s): 0f7b25d
latest
.gitignore
CHANGED
@@ -1,5 +1,5 @@
 .chroma/*
-.persisted_data
+.persisted_data*
 downloaded/*
 __pycache__/*
 launch.json
app.py
CHANGED
@@ -6,6 +6,8 @@ import random
 import shutil
 import string
 import sys
+from pathlib import Path
+import numpy as np
 
 import chromadb
 import gradio as gr
@@ -13,7 +15,7 @@ from chromadb.config import Settings
 from langchain.docstore.document import Document
 from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
 from langchain.vectorstores import Chroma
-
+from langchain.retrievers import SVMRetriever
 from chain import get_new_chain1
 from ingest import embedding_chooser, ingest_docs
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
@@ -36,105 +38,138 @@ def toggle_log_textbox(log_textbox_state):
 def update_textbox(full_log):
     return gr.update(value=full_log)
 
-def
-
-    return ''.join(random.choice(letters) for i in range(length))
+def update_radio(radio):
+    return gr.Radio.update(value=radio)
 
 def change_tab():
     return gr.Tabs.update(selected=0)
 
-def
+def update_checkboxgroup(all_collections_state):
+    new_options = [i for i in all_collections_state]
+    return gr.CheckboxGroup.update(choices=new_options)
+
+def update_log_textbox(full_log):
+    return gr.Textbox.update(value=full_log)
+
+def destroy_state(state):
+    state = None
+    return state
+
+def clear_chat(chatbot, history):
+    return [], []
+
+def merge_collections(collection_load_names, vs_state, k_textbox, search_type_selector, vectorstore_radio, embedding_radio):
     if type(embedding_radio) == gr.Radio:
         embedding_radio = embedding_radio.value
     persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
+    persist_directory_raw = Path('.persisted_data_raw')
     embedding_function = embedding_chooser(embedding_radio)
     merged_documents = []
     merged_embeddings = []
-    [… old merge_collections body not captured in this view …]
+    merged_vectorstore = None
+    if vectorstore_radio == 'Chroma':
+        for collection_name in collection_load_names:
+            chroma_obj_get = chromadb.Client(Settings(
+                chroma_db_impl="duckdb+parquet",
+                persist_directory=persist_directory,
+                anonymized_telemetry = True
+            ))
+            if collection_name == '':
+                continue
+            collection_obj = chroma_obj_get.get_collection(collection_name, embedding_function=embedding_function)
+            collection = collection_obj.get(include=["metadatas", "documents", "embeddings"])
+            for i in range(len(collection['documents'])):
+                merged_documents.append(Document(page_content=collection['documents'][i], metadata = collection['metadatas'][i]))
+                merged_embeddings.append(collection['embeddings'][i])
+        merged_vectorstore = Chroma(collection_name="temp", embedding_function=embedding_function)
+        merged_vectorstore.add_documents(documents=merged_documents, embeddings=merged_embeddings)
+    if vectorstore_radio == 'raw':
+        merged_vectorstore = []
+        for collection_name in collection_load_names:
+            if collection_name == '':
+                continue
+            collection_path = persist_directory_raw / collection_name
+            docarr = np.load(collection_path.as_posix() +'.npy', allow_pickle=True)
+            merged_vectorstore.extend(docarr.tolist())
+            # read every line and append to texts
+            # for f in os.listdir(collection_path):
+            #     with open(os.path.join(collection_path, f), "r") as f:
+            #         merged_vectorstore.append(f.readlines())
     return merged_vectorstore
 
-def set_chain_up(openai_api_key, model_selector, k_textbox, max_tokens_textbox, vectorstore, agent):
+def set_chain_up(openai_api_key, model_selector, k_textbox, search_type_selector, max_tokens_textbox, vectorstore_radio, vectorstore, agent):
     if not agent or type(agent) == str:
         if vectorstore != None:
             if model_selector in ["gpt-3.5-turbo", "gpt-4"]:
                 if openai_api_key:
                     os.environ["OPENAI_API_KEY"] = openai_api_key
-                    qa_chain = get_new_chain1(vectorstore, model_selector, k_textbox, max_tokens_textbox)
+                    qa_chain = get_new_chain1(vectorstore, vectorstore_radio, model_selector, k_textbox, search_type_selector, max_tokens_textbox)
                     os.environ["OPENAI_API_KEY"] = ""
                     return qa_chain
                 else:
                     return 'no_open_aikey'
             else:
-                qa_chain = get_new_chain1(vectorstore, model_selector, k_textbox, max_tokens_textbox)
+                qa_chain = get_new_chain1(vectorstore, vectorstore_radio, model_selector, k_textbox, search_type_selector, max_tokens_textbox)
                 return qa_chain
         else:
            return 'no_vectorstore'
     else:
         return agent
 
-def delete_collection(all_collections_state, collections_viewer, embedding_radio):
+def delete_collection(all_collections_state, collections_viewer, select_vectorstore_radio, embedding_radio):
     if type(embedding_radio) == gr.Radio:
         embedding_radio = embedding_radio.value
     persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
-    [… old delete_collection body not captured in this view …]
+    persist_directory_raw = Path('.persisted_data_raw')
+    if select_vectorstore_radio == 'Chroma':
+        client = chromadb.Client(Settings(
+            chroma_db_impl="duckdb+parquet",
+            persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
+        ))
+        for collection in collections_viewer:
+            try:
+                client.delete_collection(collection)
+                all_collections_state.remove(collection)
+                collections_viewer.remove(collection)
+            except Exception as e:
+                logging.error(e)
+    if select_vectorstore_radio == 'raw':
+        for collection in collections_viewer:
+            try:
+                os.remove(os.path.join(persist_directory_raw.as_posix(), collection+'.npy' ))
+                all_collections_state.remove(collection)
+                collections_viewer.remove(collection)
+            except Exception as e:
+                logging.error(e)
     return all_collections_state, collections_viewer
 
-def delete_all_collections(all_collections_state, embedding_radio):
+def delete_all_collections(all_collections_state, select_vectorstore_radio, embedding_radio):
     if type(embedding_radio) == gr.Radio:
         embedding_radio = embedding_radio.value
     persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
-
+    persist_directory_raw = Path('.persisted_data_raw')
+    if select_vectorstore_radio == 'Chroma':
+        shutil.rmtree(persist_directory)
+    if select_vectorstore_radio == 'raw':
+        shutil.rmtree(persist_directory_raw)
     return []
 
-def list_collections(all_collections_state, embedding_radio):
+def list_collections(all_collections_state, select_vectorstore_radio, embedding_radio):
     if type(embedding_radio) == gr.Radio:
         embedding_radio = embedding_radio.value
     persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
-    [… old list_collections body not captured in this view …]
-    return gr.Textbox.update(value=full_log)
-
-def destroy_state(state):
-    state = None
-    return state
-
-def clear_chat(chatbot, history):
-    return [], []
+    persist_directory_raw = Path('.persisted_data_raw')
+    if select_vectorstore_radio == 'Chroma':
+        client = chromadb.Client(Settings(
+            chroma_db_impl="duckdb+parquet",
+            persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
+        ))
+        collection_names = [[c.name][0] for c in client.list_collections()]
+        return collection_names
+    if select_vectorstore_radio == 'raw':
+        if os.path.exists(persist_directory_raw):
+            return [f.name.split('.npy')[0] for f in os.scandir(persist_directory_raw)]
+    return []
 
 def chat(inp, history, agent):
     history = history or []
@@ -181,6 +216,12 @@ with block:
                 lines=1,
                 value="20",
             )
+            search_type_selector = gr.Dropdown(
+                choices=["similarity", "mmr", "svm"],
+                label="Search Type",
+                show_label=True,
+                value = "similarity"
+            )
             max_tokens_textbox = gr.Textbox(
                 placeholder="max_tokens: Maximum number of tokens to generate",
                 label="max_tokens",
@@ -201,6 +242,7 @@ with block:
            examples=[
                "What does this code do?",
                "I want to change the chat-pykg app to have a log viewer, where the user can see what python is doing in the background. How could I do that?",
+               "Hello, I want to allow chat-pykg to search the internet before answering, can you help me change the code to do that? Thanks.",
            ],
            inputs=message,
        )
@@ -219,6 +261,19 @@ with block:
                get_all_collection_names_button = gr.Button(value="List all saved repositories", variant="secondary")#.style(full_width=False)
                delete_collections_button = gr.Button(value="Delete selected saved repositories", variant="secondary")#.style(full_width=False)
                delete_all_collections_button = gr.Button(value="Delete all saved repositories", variant="secondary")#.style(full_width=False)
+            with gr.Row():
+                select_embedding_radio = gr.Radio(
+                    choices = ['Sentence Transformers', 'OpenAI'],
+                    label="Embedding Options",
+                    show_label=True,
+                    value='Sentence Transformers'
+                )
+                select_vectorstore_radio = gr.Radio(
+                    choices = ['Chroma', 'raw'],
+                    label="Vectorstore Options",
+                    show_label=True,
+                    value='Chroma'
+                )
        with gr.TabItem("Get New Repositories", id=2):
            with gr.Row():
                all_collections_to_get = gr.List(headers=['Repository URL', 'Folders'], row_count=3, col_count=2, label='Repositories to get', show_label=True, interactive=True, max_cols=2, max_rows=3)
@@ -229,26 +284,30 @@ with block:
                    label="Chunk size",
                    show_label=True,
                    lines=1,
-                    value="
+                    value="2000"
                )
                chunk_overlap_textbox = gr.Textbox(
                    placeholder="Chunk overlap",
                    label="Chunk overlap",
                    show_label=True,
                    lines=1,
-                    value="
+                    value="200"
                )
-
+                make_embedding_radio = gr.Radio(
                    choices = ['Sentence Transformers', 'OpenAI'],
                    label="Embedding Options",
                    show_label=True,
                    value='Sentence Transformers'
                )
+                make_vectorstore_radio = gr.Radio(
+                    choices = ['Chroma', 'raw'],
+                    label="Vectorstore Options",
+                    show_label=True,
+                    value='Chroma'
+                )
+
            with gr.Row():
                gr.HTML('<center>See the <a href=https://python.langchain.com/en/latest/reference/modules/text_splitter.html>Langchain textsplitter docs</a></center>')
-                gr.HTML(
-                    "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
-                )
 
    history_state = gr.State()
    agent_state = gr.State()
@@ -257,18 +316,25 @@ with block:
    chat_state = gr.State()
    debug_state = gr.State()
    debug_state.value = False
+    radio_state = gr.State()
 
-    submit.click(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, vs_state, agent_state], outputs=[agent_state]).then(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
-    message.submit(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, vs_state, agent_state], outputs=[agent_state]).then(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
+    submit.click(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, search_type_selector, max_tokens_textbox, select_vectorstore_radio, vs_state, agent_state], outputs=[agent_state]).then(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
+    message.submit(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, search_type_selector, max_tokens_textbox, select_vectorstore_radio, vs_state, agent_state], outputs=[agent_state]).then(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
 
-    load_collections_button.click(merge_collections, inputs=[collections_viewer, vs_state,
-    make_collections_button.click(ingest_docs, inputs=[all_collections_state, all_collections_to_get, chunk_size_textbox, chunk_overlap_textbox,
-    delete_collections_button.click(delete_collection, inputs=[all_collections_state, collections_viewer,
-    delete_all_collections_button.click(delete_all_collections, inputs=[all_collections_state,
-    get_all_collection_names_button.click(list_collections, inputs=[all_collections_state,
+    load_collections_button.click(merge_collections, inputs=[collections_viewer, vs_state, k_textbox, search_type_selector, select_vectorstore_radio, select_embedding_radio], outputs=[vs_state])#.then(change_tab, None, tabs) #.then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, vs_state, agent_state], outputs=[agent_state])
+    make_collections_button.click(ingest_docs, inputs=[all_collections_state, all_collections_to_get, chunk_size_textbox, chunk_overlap_textbox, select_vectorstore_radio, select_embedding_radio, debug_state], outputs=[all_collections_state, all_collections_to_get], show_progress=True).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
+    delete_collections_button.click(delete_collection, inputs=[all_collections_state, collections_viewer, select_vectorstore_radio, select_embedding_radio], outputs=[all_collections_state, collections_viewer]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
+    delete_all_collections_button.click(delete_all_collections, inputs=[all_collections_state,select_vectorstore_radio, select_embedding_radio], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
+    get_all_collection_names_button.click(list_collections, inputs=[all_collections_state,select_vectorstore_radio, select_embedding_radio], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
    clear_btn.click(clear_chat, inputs = [chatbot, history_state], outputs = [chatbot, history_state])
+
+    make_embedding_radio.change(update_radio, inputs = make_embedding_radio, outputs = select_embedding_radio)
+    select_embedding_radio.change(update_radio, inputs = select_embedding_radio, outputs = make_embedding_radio)
+    make_vectorstore_radio.change(update_radio, inputs =make_vectorstore_radio, outputs = select_vectorstore_radio)
+    select_vectorstore_radio.change(update_radio, inputs = select_vectorstore_radio, outputs = make_vectorstore_radio)
+
    # Whenever chain parameters change, destroy the agent.
-    input_list = [openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox,
+    input_list = [openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, select_vectorstore_radio, make_embedding_radio]
    output_list = [agent_state]
    for input_item in input_list:
        input_item.change(
@@ -276,7 +342,7 @@ with block:
            inputs=output_list,
            outputs=output_list,
        )
-    all_collections_state.value = list_collections(all_collections_state,
+    all_collections_state.value = list_collections(all_collections_state, select_vectorstore_radio, select_embedding_radio)
    block.load(update_checkboxgroup, inputs = all_collections_state, outputs = collections_viewer)
    log_textbox_handler = LogTextboxHandler(gr.TextArea(interactive=False, placeholder="Logs will appear here...", visible=False))
    log_textbox = log_textbox_handler.textbox
@@ -285,5 +351,9 @@ with block:
    log_textbox_visibility_state.value = False
    log_toggle_button = gr.Button("Toggle Log", variant="secondary")
    log_toggle_button.click(toggle_log_textbox, inputs=[log_textbox_visibility_state], outputs=[log_textbox_visibility_state,log_textbox])
+
+    gr.HTML(
+        "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
+    )
 block.queue(concurrency_count=40)
 block.launch(debug=True)
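Note on the new 'raw' vectorstore option wired up above: ingest_docs now saves each repository's chunk texts as a single .npy file under .persisted_data_raw (hence the .gitignore change to .persisted_data*), and merge_collections reloads and concatenates those arrays. A minimal, self-contained sketch of that round-trip with plain NumPy; the directory layout matches the commit, but the collection name and chunk strings below are illustrative:

# Sketch of the raw persistence round-trip (illustrative names only).
import numpy as np
from pathlib import Path

persist_directory_raw = Path(".persisted_data_raw")
persist_directory_raw.mkdir(parents=True, exist_ok=True)

# Save: one array of chunk texts per collection, as ingest_docs does.
chunks = ["def add(a, b):\n    return a + b", "# README\nExample package"]
collection_name = "example-repo"  # hypothetical collection name
np.save(persist_directory_raw / f"{collection_name}.npy", np.array(chunks, dtype=object))

# Load: merge selected collections back into one flat list, as merge_collections does.
merged = []
for name in [collection_name]:
    docarr = np.load(persist_directory_raw / f"{name}.npy", allow_pickle=True)
    merged.extend(docarr.tolist())
print(f"{len(merged)} chunks loaded")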
chain.py
CHANGED
@@ -17,20 +17,20 @@ from langchain.schema import BaseLanguageModel, BaseRetriever, Document
 from langchain.prompts.prompt import PromptTemplate
 
 
-    [… old lines not captured in this view …]
+def get_new_chain1(vectorstore, vectorstore_radio, model_selector, k_textbox, search_type_selector, max_tokens_textbox) -> Chain:
+    retriever = None
+    if vectorstore_radio == 'Chroma':
+        retriever = vectorstore.as_retriever(search_type=search_type_selector)
+        retriever.search_kwargs = {"k":int(k_textbox)}
+    if vectorstore_radio == 'raw':
+        if search_type_selector == 'svm':
+            retriever = SVMRetriever.from_texts(merged_vectorstore, embedding_function)
+            retriever.k = int(k_textbox)
 
    template = """You are called chat-pykg and are an AI assistant coded in python using langchain and gradio. You are very helpful for answering questions about various open source libraries.
 You are given the following extracted parts of code and a question. Provide a conversational answer to the question.
 Do NOT make up any hyperlinks that are not in the code.
 If you don't know the answer, just say that you don't know, don't try to make up an answer.
-If the question is not about the package documentation, politely inform them that you are tuned to only answer questions about the package documentations.
 Question: {question}
 =========
 {context}
@@ -48,13 +48,9 @@ def get_new_chain1(vectorstore, model_selector, k_textbox, max_tokens_textbox) -> Chain:
 
    # memory = ConversationKGMemory(llm=llm, input_key="question", output_key="answer")
    memory = ConversationBufferWindowMemory(input_key="question", output_key="answer", k=5)
-
-    if len(k_textbox) != 0:
-        retriever.search_kwargs = {"k": int(k_textbox)}
-    else:
-        retriever.search_kwargs = {"k": 10}
+
    qa = ConversationalRetrievalChain(
-        retriever=retriever, memory=memory, combine_docs_chain=doc_chain, question_generator=question_generator)
+        retriever=retriever, memory=memory, combine_docs_chain=doc_chain, question_generator=question_generator, verbose=True, callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
    # qa._get_docs = _get_docs.__get__(qa, ConversationalRetrievalChain)
 
    return qa
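get_new_chain1 now chooses the retriever from the vectorstore option: a Chroma store is wrapped with as_retriever(search_type=...), while the 'raw' path builds an SVMRetriever from plain texts. Note that merged_vectorstore and embedding_function in the committed hunk are not parameters of the function, so the snippet below is only a hedged sketch of the intended selection logic, assuming the raw texts and an embeddings object are passed in explicitly (build_retriever is a hypothetical helper, not part of the repo):

# Hedged sketch of the retriever selection, with the inputs made explicit.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers import SVMRetriever

def build_retriever(vectorstore_radio, vectorstore, raw_texts, k=10, search_type="similarity"):
    if vectorstore_radio == "Chroma":
        retriever = vectorstore.as_retriever(search_type=search_type)
        retriever.search_kwargs = {"k": k}
        return retriever
    if vectorstore_radio == "raw" and search_type == "svm":
        # SVMRetriever embeds the raw chunk texts itself and ranks them with an SVM.
        retriever = SVMRetriever.from_texts(raw_texts, HuggingFaceEmbeddings())
        retriever.k = k
        return retriever
    return None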
ingest.py
CHANGED
@@ -17,58 +17,7 @@ from pydantic import Extra, Field, root_validator
 import logging
 logger = logging.getLogger()
 from langchain.docstore.document import Document
-
-# class CachedChroma(Chroma, ABC):
-#     """
-#     Wrapper around Chroma to make caching embeddings easier.
-
-#     It automatically uses a cached version of a specified collection, if available.
-#     Example:
-#         .. code-block:: python
-#                 from langchain.vectorstores import Chroma
-#                 from langchain.embeddings.openai import OpenAIEmbeddings
-#                 embeddings = OpenAIEmbeddings()
-#                 vectorstore = CachedChroma.from_documents_with_cache(
-#                     ".persisted_data", texts, embeddings, collection_name="fun_experiment"
-#                 )
-#     """
-
-#     @classmethod
-#     def from_documents_with_cache(
-#         cls,
-#         persist_directory: str,
-#         documents: Optional[List[Document]] = None,
-#         embedding: Optional[Embeddings] = None,
-#         ids: Optional[List[str]] = None,
-#         collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
-#         client_settings: Optional[chromadb.config.Settings] = None,
-#         **kwargs: Any,
-#     ) -> Chroma:
-#         client_settings = Settings(
-#             chroma_db_impl="duckdb+parquet",
-#             persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
-#         )
-#         client = chromadb.Client(client_settings)
-#         collection_names = [c.name for c in client.list_collections()]
-
-#         if collection_name in collection_names:
-#             return Chroma(
-#                 collection_name=collection_name,
-#                 embedding_function=embedding,
-#                 persist_directory=persist_directory,
-#                 client_settings=client_settings,
-#             )
-#         if documents:
-#             return Chroma.from_documents(
-#                 documents=documents,
-#                 embedding=embedding,
-#                 ids=ids,
-#                 collection_name=collection_name,
-#                 persist_directory=persist_directory,
-#                 client_settings=client_settings,
-#                 **kwargs
-#             )
-#         raise ValueError("Either documents or collection_name must be specified.")
+import numpy as np
 
 def embedding_chooser(embedding_radio):
     if embedding_radio == "Sentence Transformers":
@@ -133,7 +82,7 @@ def get_text(content):
     else:
         return ""
 
-def ingest_docs(all_collections_state, urls, chunk_size, chunk_overlap, embedding_radio, debug=False):
+def ingest_docs(all_collections_state, urls, chunk_size, chunk_overlap, vectorstore_radio, embedding_radio, debug=False):
     cleared_list = urls.copy()
     def sanitize_folder_name(folder_name):
         if folder_name != '':
@@ -164,6 +113,7 @@ def ingest_docs(all_collections_state, urls, chunk_size, chunk_overlap, embedding_radio, debug=False):
        if orgrepo.replace('/','-') in all_collections_state:
            logging.info(f"Skipping {orgrepo} as it is already in the database")
            continue
+        documents_split = []
        documents = []
        paths = []
        paths_by_ext = {}
@@ -227,21 +177,47 @@ def ingest_docs(all_collections_state, urls, chunk_size, chunk_overlap, embedding_radio, debug=False):
            continue
        for ext in docs_by_ext.keys():
            if ext == "py":
-
+                documents_split += py_splitter.split_documents(docs_by_ext[ext])
+                documents += docs_by_ext[ext]
            if ext == "md":
-
+                documents_split += md_splitter.split_documents(docs_by_ext[ext])
+                documents += docs_by_ext[ext]
            # else:
            #     documents += text_splitter.split_documents(docs_by_ext[ext]
-        all_docs +=
+        all_docs += documents_split
        # For each document, add the metadata to the page_content
+        for doc in documents_split:
+            if local_repo_path != '.':
+                doc.metadata["source"] = doc.metadata["source"].replace(local_repo_path, "")
+                if doc.metadata["source"] == '/':
+                    doc.metadata["source"] = doc.metadata["source"][1:]
+            doc.page_content = f'# source:{doc.metadata["source"]}\n{doc.page_content}'
        for doc in documents:
+            if local_repo_path != '.':
+                doc.metadata["source"] = doc.metadata["source"].replace(local_repo_path, "")
+                if doc.metadata["source"] == '/':
+                    doc.metadata["source"] = doc.metadata["source"][1:]
            doc.page_content = f'# source:{doc.metadata["source"]}\n{doc.page_content}'
+
        if type(embedding_radio) == gr.Radio:
            embedding_radio = embedding_radio.value
        persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
+        persist_directory_raw = Path('.persisted_data_raw')
+        persist_directory_raw.mkdir(parents=True, exist_ok=True)
        collection_name = orgrepo.replace('/','-')
-
-
+
+        if vectorstore_radio == 'Chroma':
+            collection = Chroma.from_documents(documents=documents_split, collection_name=collection_name, embedding=embedding_function, persist_directory=persist_directory)
+            collection.persist()
+
+        if vectorstore_radio == 'raw':
+            # Persist the raw documents
+            docarr = np.array([doc.page_content for doc in documents_split])
+            np.save(os.path.join(persist_directory_raw, f"{collection_name}.npy"), docarr)
+            # with open(os.path.join(persist_directory_raw, f"{collection_name}"), "w") as f:
+            #     for doc in documents:
+            #         f.write(doc.page_content)
+
        all_collections_state.append(collection_name)
        cleared_list[j][0], cleared_list[j][1] = '', ''
    return all_collections_state, gr.update(value=cleared_list)
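The new per-extension handling relies on py_splitter and md_splitter, which are defined elsewhere in ingest.py and not shown in this diff. One plausible way to construct them, assuming the installed langchain version provides RecursiveCharacterTextSplitter.from_language; the chunk sizes mirror the defaults this commit sets in app.py, but everything else here is an assumption rather than the repo's actual code:

# Hypothetical construction of py_splitter/md_splitter (not taken from the commit).
from langchain.docstore.document import Document
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

chunk_size, chunk_overlap = 2000, 200  # defaults this commit sets in the app UI
py_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

# Splitting keeps each chunk's source path in metadata, which ingest_docs then
# prepends to page_content as a '# source:' header.
docs = [Document(page_content="def add(a, b):\n    return a + b", metadata={"source": "pkg/math.py"})]
documents_split = py_splitter.split_documents(docs)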