xangma committed on
Commit
df62f91
‒
1 Parent(s): 0f7b25d
Files changed (4) hide show
  1. .gitignore +1 -1
  2. app.py +144 -74
  3. chain.py +11 -15
  4. ingest.py +34 -58
.gitignore CHANGED
@@ -1,5 +1,5 @@
1
  .chroma/*
2
- .persisted_data/*
3
  downloaded/*
4
  __pycache__/*
5
  launch.json
 
1
  .chroma/*
2
+ .persisted_data*
3
  downloaded/*
4
  __pycache__/*
5
  launch.json
app.py CHANGED
@@ -6,6 +6,8 @@ import random
6
  import shutil
7
  import string
8
  import sys
 
 
9
 
10
  import chromadb
11
  import gradio as gr
@@ -13,7 +15,7 @@ from chromadb.config import Settings
13
  from langchain.docstore.document import Document
14
  from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
15
  from langchain.vectorstores import Chroma
16
-
17
  from chain import get_new_chain1
18
  from ingest import embedding_chooser, ingest_docs
19
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
@@ -36,105 +38,138 @@ def toggle_log_textbox(log_textbox_state):
36
  def update_textbox(full_log):
37
  return gr.update(value=full_log)
38
 
39
- def randomword(length):
40
- letters = string.ascii_lowercase
41
- return ''.join(random.choice(letters) for i in range(length))
42
 
43
  def change_tab():
44
  return gr.Tabs.update(selected=0)
45
 
46
- def merge_collections(collection_load_names, vs_state, embedding_radio):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  if type(embedding_radio) == gr.Radio:
48
  embedding_radio = embedding_radio.value
49
  persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
 
50
  embedding_function = embedding_chooser(embedding_radio)
51
  merged_documents = []
52
  merged_embeddings = []
53
- for collection_name in collection_load_names:
54
- chroma_obj_get = chromadb.Client(Settings(
55
- chroma_db_impl="duckdb+parquet",
56
- persist_directory=persist_directory,
57
- anonymized_telemetry = True
58
- ))
59
- if collection_name == '':
60
- continue
61
- collection_obj = chroma_obj_get.get_collection(collection_name, embedding_function=embedding_function)
62
- collection = collection_obj.get(include=["metadatas", "documents", "embeddings"])
63
- for i in range(len(collection['documents'])):
64
- merged_documents.append(Document(page_content=collection['documents'][i], metadata = collection['metadatas'][i]))
65
- merged_embeddings.append(collection['embeddings'][i])
66
- merged_vectorstore = Chroma(collection_name="temp", embedding_function=embedding_function)
67
- merged_vectorstore.add_documents(documents=merged_documents, embeddings=merged_embeddings)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  return merged_vectorstore
69
 
70
- def set_chain_up(openai_api_key, model_selector, k_textbox, max_tokens_textbox, vectorstore, agent):
71
  if not agent or type(agent) == str:
72
  if vectorstore != None:
73
  if model_selector in ["gpt-3.5-turbo", "gpt-4"]:
74
  if openai_api_key:
75
  os.environ["OPENAI_API_KEY"] = openai_api_key
76
- qa_chain = get_new_chain1(vectorstore, model_selector, k_textbox, max_tokens_textbox)
77
  os.environ["OPENAI_API_KEY"] = ""
78
  return qa_chain
79
  else:
80
  return 'no_open_aikey'
81
  else:
82
- qa_chain = get_new_chain1(vectorstore, model_selector, k_textbox, max_tokens_textbox)
83
  return qa_chain
84
  else:
85
  return 'no_vectorstore'
86
  else:
87
  return agent
88
 
89
- def delete_collection(all_collections_state, collections_viewer, embedding_radio):
90
  if type(embedding_radio) == gr.Radio:
91
  embedding_radio = embedding_radio.value
92
  persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
93
- client = chromadb.Client(Settings(
94
- chroma_db_impl="duckdb+parquet",
95
- persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
96
- ))
97
- for collection in collections_viewer:
98
- try:
99
- client.delete_collection(collection)
100
- all_collections_state.remove(collection)
101
- collections_viewer.remove(collection)
102
- except Exception as e:
103
- logging.error(e)
104
-
 
 
 
 
 
 
 
 
 
105
  return all_collections_state, collections_viewer
106
 
107
- def delete_all_collections(all_collections_state, embedding_radio):
108
  if type(embedding_radio) == gr.Radio:
109
  embedding_radio = embedding_radio.value
110
  persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
111
- shutil.rmtree(persist_directory)
 
 
 
 
112
  return []
113
 
114
- def list_collections(all_collections_state, embedding_radio):
115
  if type(embedding_radio) == gr.Radio:
116
  embedding_radio = embedding_radio.value
117
  persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
118
- client = chromadb.Client(Settings(
119
- chroma_db_impl="duckdb+parquet",
120
- persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
121
- ))
122
- collection_names = [[c.name][0] for c in client.list_collections()]
123
- return collection_names
124
-
125
- def update_checkboxgroup(all_collections_state):
126
- new_options = [i for i in all_collections_state]
127
- return gr.CheckboxGroup.update(choices=new_options)
128
-
129
- def update_log_textbox(full_log):
130
- return gr.Textbox.update(value=full_log)
131
-
132
- def destroy_state(state):
133
- state = None
134
- return state
135
-
136
- def clear_chat(chatbot, history):
137
- return [], []
138
 
139
  def chat(inp, history, agent):
140
  history = history or []
@@ -181,6 +216,12 @@ with block:
181
  lines=1,
182
  value="20",
183
  )
 
 
 
 
 
 
184
  max_tokens_textbox = gr.Textbox(
185
  placeholder="max_tokens: Maximum number of tokens to generate",
186
  label="max_tokens",
@@ -201,6 +242,7 @@ with block:
201
  examples=[
202
  "What does this code do?",
203
  "I want to change the chat-pykg app to have a log viewer, where the user can see what python is doing in the background. How could I do that?",
 
204
  ],
205
  inputs=message,
206
  )
@@ -219,6 +261,19 @@ with block:
219
  get_all_collection_names_button = gr.Button(value="List all saved repositories", variant="secondary")#.style(full_width=False)
220
  delete_collections_button = gr.Button(value="Delete selected saved repositories", variant="secondary")#.style(full_width=False)
221
  delete_all_collections_button = gr.Button(value="Delete all saved repositories", variant="secondary")#.style(full_width=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  with gr.TabItem("Get New Repositories", id=2):
223
  with gr.Row():
224
  all_collections_to_get = gr.List(headers=['Repository URL', 'Folders'], row_count=3, col_count=2, label='Repositories to get', show_label=True, interactive=True, max_cols=2, max_rows=3)
@@ -229,26 +284,30 @@ with block:
229
  label="Chunk size",
230
  show_label=True,
231
  lines=1,
232
- value="1000"
233
  )
234
  chunk_overlap_textbox = gr.Textbox(
235
  placeholder="Chunk overlap",
236
  label="Chunk overlap",
237
  show_label=True,
238
  lines=1,
239
- value="0"
240
  )
241
- embedding_radio = gr.Radio(
242
  choices = ['Sentence Transformers', 'OpenAI'],
243
  label="Embedding Options",
244
  show_label=True,
245
  value='Sentence Transformers'
246
  )
 
 
 
 
 
 
 
247
  with gr.Row():
248
  gr.HTML('<center>See the <a href=https://python.langchain.com/en/latest/reference/modules/text_splitter.html>Langchain textsplitter docs</a></center>')
249
- gr.HTML(
250
- "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
251
- )
252
 
253
  history_state = gr.State()
254
  agent_state = gr.State()
@@ -257,18 +316,25 @@ with block:
257
  chat_state = gr.State()
258
  debug_state = gr.State()
259
  debug_state.value = False
 
260
 
261
- submit.click(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, vs_state, agent_state], outputs=[agent_state]).then(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
262
- message.submit(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, vs_state, agent_state], outputs=[agent_state]).then(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
263
 
264
- load_collections_button.click(merge_collections, inputs=[collections_viewer, vs_state, embedding_radio], outputs=[vs_state])#.then(change_tab, None, tabs) #.then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, vs_state, agent_state], outputs=[agent_state])
265
- make_collections_button.click(ingest_docs, inputs=[all_collections_state, all_collections_to_get, chunk_size_textbox, chunk_overlap_textbox, embedding_radio, debug_state], outputs=[all_collections_state, all_collections_to_get], show_progress=True).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
266
- delete_collections_button.click(delete_collection, inputs=[all_collections_state, collections_viewer, embedding_radio], outputs=[all_collections_state, collections_viewer]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
267
- delete_all_collections_button.click(delete_all_collections, inputs=[all_collections_state, embedding_radio], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
268
- get_all_collection_names_button.click(list_collections, inputs=[all_collections_state, embedding_radio], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
269
  clear_btn.click(clear_chat, inputs = [chatbot, history_state], outputs = [chatbot, history_state])
 
 
 
 
 
 
270
  # Whenever chain parameters change, destroy the agent.
271
- input_list = [openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, embedding_radio]
272
  output_list = [agent_state]
273
  for input_item in input_list:
274
  input_item.change(
@@ -276,7 +342,7 @@ with block:
276
  inputs=output_list,
277
  outputs=output_list,
278
  )
279
- all_collections_state.value = list_collections(all_collections_state, embedding_radio)
280
  block.load(update_checkboxgroup, inputs = all_collections_state, outputs = collections_viewer)
281
  log_textbox_handler = LogTextboxHandler(gr.TextArea(interactive=False, placeholder="Logs will appear here...", visible=False))
282
  log_textbox = log_textbox_handler.textbox
@@ -285,5 +351,9 @@ with block:
285
  log_textbox_visibility_state.value = False
286
  log_toggle_button = gr.Button("Toggle Log", variant="secondary")
287
  log_toggle_button.click(toggle_log_textbox, inputs=[log_textbox_visibility_state], outputs=[log_textbox_visibility_state,log_textbox])
 
 
 
 
288
  block.queue(concurrency_count=40)
289
  block.launch(debug=True)
 
6
  import shutil
7
  import string
8
  import sys
9
+ from pathlib import Path
10
+ import numpy as np
11
 
12
  import chromadb
13
  import gradio as gr
 
15
  from langchain.docstore.document import Document
16
  from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
17
  from langchain.vectorstores import Chroma
18
+ from langchain.retrievers import SVMRetriever
19
  from chain import get_new_chain1
20
  from ingest import embedding_chooser, ingest_docs
21
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 
38
  def update_textbox(full_log):
39
  return gr.update(value=full_log)
40
 
41
+ def update_radio(radio):
42
+ return gr.Radio.update(value=radio)
 
43
 
44
  def change_tab():
45
  return gr.Tabs.update(selected=0)
46
 
47
+ def update_checkboxgroup(all_collections_state):
48
+ new_options = [i for i in all_collections_state]
49
+ return gr.CheckboxGroup.update(choices=new_options)
50
+
51
+ def update_log_textbox(full_log):
52
+ return gr.Textbox.update(value=full_log)
53
+
54
+ def destroy_state(state):
55
+ state = None
56
+ return state
57
+
58
+ def clear_chat(chatbot, history):
59
+ return [], []
60
+
61
+ def merge_collections(collection_load_names, vs_state, k_textbox, search_type_selector, vectorstore_radio, embedding_radio):
62
  if type(embedding_radio) == gr.Radio:
63
  embedding_radio = embedding_radio.value
64
  persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
65
+ persist_directory_raw = Path('.persisted_data_raw')
66
  embedding_function = embedding_chooser(embedding_radio)
67
  merged_documents = []
68
  merged_embeddings = []
69
+ merged_vectorstore = None
70
+ if vectorstore_radio == 'Chroma':
71
+ for collection_name in collection_load_names:
72
+ chroma_obj_get = chromadb.Client(Settings(
73
+ chroma_db_impl="duckdb+parquet",
74
+ persist_directory=persist_directory,
75
+ anonymized_telemetry = True
76
+ ))
77
+ if collection_name == '':
78
+ continue
79
+ collection_obj = chroma_obj_get.get_collection(collection_name, embedding_function=embedding_function)
80
+ collection = collection_obj.get(include=["metadatas", "documents", "embeddings"])
81
+ for i in range(len(collection['documents'])):
82
+ merged_documents.append(Document(page_content=collection['documents'][i], metadata = collection['metadatas'][i]))
83
+ merged_embeddings.append(collection['embeddings'][i])
84
+ merged_vectorstore = Chroma(collection_name="temp", embedding_function=embedding_function)
85
+ merged_vectorstore.add_documents(documents=merged_documents, embeddings=merged_embeddings)
86
+ if vectorstore_radio == 'raw':
87
+ merged_vectorstore = []
88
+ for collection_name in collection_load_names:
89
+ if collection_name == '':
90
+ continue
91
+ collection_path = persist_directory_raw / collection_name
92
+ docarr = np.load(collection_path.as_posix() +'.npy', allow_pickle=True)
93
+ merged_vectorstore.extend(docarr.tolist())
94
+ # read every line and append to texts
95
+ # for f in os.listdir(collection_path):
96
+ # with open(os.path.join(collection_path, f), "r") as f:
97
+ # merged_vectorstore.append(f.readlines())
98
  return merged_vectorstore
99
 
100
+ def set_chain_up(openai_api_key, model_selector, k_textbox, search_type_selector, max_tokens_textbox, vectorstore_radio, vectorstore, agent):
101
  if not agent or type(agent) == str:
102
  if vectorstore != None:
103
  if model_selector in ["gpt-3.5-turbo", "gpt-4"]:
104
  if openai_api_key:
105
  os.environ["OPENAI_API_KEY"] = openai_api_key
106
+ qa_chain = get_new_chain1(vectorstore, vectorstore_radio, model_selector, k_textbox, search_type_selector, max_tokens_textbox)
107
  os.environ["OPENAI_API_KEY"] = ""
108
  return qa_chain
109
  else:
110
  return 'no_open_aikey'
111
  else:
112
+ qa_chain = get_new_chain1(vectorstore, vectorstore_radio, model_selector, k_textbox, search_type_selector, max_tokens_textbox)
113
  return qa_chain
114
  else:
115
  return 'no_vectorstore'
116
  else:
117
  return agent
118
 
119
+ def delete_collection(all_collections_state, collections_viewer, select_vectorstore_radio, embedding_radio):
120
  if type(embedding_radio) == gr.Radio:
121
  embedding_radio = embedding_radio.value
122
  persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
123
+ persist_directory_raw = Path('.persisted_data_raw')
124
+ if select_vectorstore_radio == 'Chroma':
125
+ client = chromadb.Client(Settings(
126
+ chroma_db_impl="duckdb+parquet",
127
+ persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
128
+ ))
129
+ for collection in collections_viewer:
130
+ try:
131
+ client.delete_collection(collection)
132
+ all_collections_state.remove(collection)
133
+ collections_viewer.remove(collection)
134
+ except Exception as e:
135
+ logging.error(e)
136
+ if select_vectorstore_radio == 'raw':
137
+ for collection in collections_viewer:
138
+ try:
139
+ os.remove(os.path.join(persist_directory_raw.as_posix(), collection+'.npy' ))
140
+ all_collections_state.remove(collection)
141
+ collections_viewer.remove(collection)
142
+ except Exception as e:
143
+ logging.error(e)
144
  return all_collections_state, collections_viewer
145
 
146
+ def delete_all_collections(all_collections_state, select_vectorstore_radio, embedding_radio):
147
  if type(embedding_radio) == gr.Radio:
148
  embedding_radio = embedding_radio.value
149
  persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
150
+ persist_directory_raw = Path('.persisted_data_raw')
151
+ if select_vectorstore_radio == 'Chroma':
152
+ shutil.rmtree(persist_directory)
153
+ if select_vectorstore_radio == 'raw':
154
+ shutil.rmtree(persist_directory_raw)
155
  return []
156
 
157
+ def list_collections(all_collections_state, select_vectorstore_radio, embedding_radio):
158
  if type(embedding_radio) == gr.Radio:
159
  embedding_radio = embedding_radio.value
160
  persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
161
+ persist_directory_raw = Path('.persisted_data_raw')
162
+ if select_vectorstore_radio == 'Chroma':
163
+ client = chromadb.Client(Settings(
164
+ chroma_db_impl="duckdb+parquet",
165
+ persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
166
+ ))
167
+ collection_names = [[c.name][0] for c in client.list_collections()]
168
+ return collection_names
169
+ if select_vectorstore_radio == 'raw':
170
+ if os.path.exists(persist_directory_raw):
171
+ return [f.name.split('.npy')[0] for f in os.scandir(persist_directory_raw)]
172
+ return []
 
 
 
 
 
 
 
 
173
 
174
  def chat(inp, history, agent):
175
  history = history or []
 
216
  lines=1,
217
  value="20",
218
  )
219
+ search_type_selector = gr.Dropdown(
220
+ choices=["similarity", "mmr", "svm"],
221
+ label="Search Type",
222
+ show_label=True,
223
+ value = "similarity"
224
+ )
225
  max_tokens_textbox = gr.Textbox(
226
  placeholder="max_tokens: Maximum number of tokens to generate",
227
  label="max_tokens",
 
242
  examples=[
243
  "What does this code do?",
244
  "I want to change the chat-pykg app to have a log viewer, where the user can see what python is doing in the background. How could I do that?",
245
+ "Hello, I want to allow chat-pykg to search the internet before answering, can you help me change the code to do that? Thanks.",
246
  ],
247
  inputs=message,
248
  )
 
261
  get_all_collection_names_button = gr.Button(value="List all saved repositories", variant="secondary")#.style(full_width=False)
262
  delete_collections_button = gr.Button(value="Delete selected saved repositories", variant="secondary")#.style(full_width=False)
263
  delete_all_collections_button = gr.Button(value="Delete all saved repositories", variant="secondary")#.style(full_width=False)
264
+ with gr.Row():
265
+ select_embedding_radio = gr.Radio(
266
+ choices = ['Sentence Transformers', 'OpenAI'],
267
+ label="Embedding Options",
268
+ show_label=True,
269
+ value='Sentence Transformers'
270
+ )
271
+ select_vectorstore_radio = gr.Radio(
272
+ choices = ['Chroma', 'raw'],
273
+ label="Vectorstore Options",
274
+ show_label=True,
275
+ value='Chroma'
276
+ )
277
  with gr.TabItem("Get New Repositories", id=2):
278
  with gr.Row():
279
  all_collections_to_get = gr.List(headers=['Repository URL', 'Folders'], row_count=3, col_count=2, label='Repositories to get', show_label=True, interactive=True, max_cols=2, max_rows=3)
 
284
  label="Chunk size",
285
  show_label=True,
286
  lines=1,
287
+ value="2000"
288
  )
289
  chunk_overlap_textbox = gr.Textbox(
290
  placeholder="Chunk overlap",
291
  label="Chunk overlap",
292
  show_label=True,
293
  lines=1,
294
+ value="200"
295
  )
296
+ make_embedding_radio = gr.Radio(
297
  choices = ['Sentence Transformers', 'OpenAI'],
298
  label="Embedding Options",
299
  show_label=True,
300
  value='Sentence Transformers'
301
  )
302
+ make_vectorstore_radio = gr.Radio(
303
+ choices = ['Chroma', 'raw'],
304
+ label="Vectorstore Options",
305
+ show_label=True,
306
+ value='Chroma'
307
+ )
308
+
309
  with gr.Row():
310
  gr.HTML('<center>See the <a href=https://python.langchain.com/en/latest/reference/modules/text_splitter.html>Langchain textsplitter docs</a></center>')
 
 
 
311
 
312
  history_state = gr.State()
313
  agent_state = gr.State()
 
316
  chat_state = gr.State()
317
  debug_state = gr.State()
318
  debug_state.value = False
319
+ radio_state = gr.State()
320
 
321
+ submit.click(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, search_type_selector, max_tokens_textbox, select_vectorstore_radio, vs_state, agent_state], outputs=[agent_state]).then(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
322
+ message.submit(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, search_type_selector, max_tokens_textbox, select_vectorstore_radio, vs_state, agent_state], outputs=[agent_state]).then(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
323
 
324
+ load_collections_button.click(merge_collections, inputs=[collections_viewer, vs_state, k_textbox, search_type_selector, select_vectorstore_radio, select_embedding_radio], outputs=[vs_state])#.then(change_tab, None, tabs) #.then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, vs_state, agent_state], outputs=[agent_state])
325
+ make_collections_button.click(ingest_docs, inputs=[all_collections_state, all_collections_to_get, chunk_size_textbox, chunk_overlap_textbox, select_vectorstore_radio, select_embedding_radio, debug_state], outputs=[all_collections_state, all_collections_to_get], show_progress=True).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
326
+ delete_collections_button.click(delete_collection, inputs=[all_collections_state, collections_viewer, select_vectorstore_radio, select_embedding_radio], outputs=[all_collections_state, collections_viewer]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
327
+ delete_all_collections_button.click(delete_all_collections, inputs=[all_collections_state,select_vectorstore_radio, select_embedding_radio], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
328
+ get_all_collection_names_button.click(list_collections, inputs=[all_collections_state,select_vectorstore_radio, select_embedding_radio], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
329
  clear_btn.click(clear_chat, inputs = [chatbot, history_state], outputs = [chatbot, history_state])
330
+
331
+ make_embedding_radio.change(update_radio, inputs = make_embedding_radio, outputs = select_embedding_radio)
332
+ select_embedding_radio.change(update_radio, inputs = select_embedding_radio, outputs = make_embedding_radio)
333
+ make_vectorstore_radio.change(update_radio, inputs =make_vectorstore_radio, outputs = select_vectorstore_radio)
334
+ select_vectorstore_radio.change(update_radio, inputs = select_vectorstore_radio, outputs = make_vectorstore_radio)
335
+
336
  # Whenever chain parameters change, destroy the agent.
337
+ input_list = [openai_api_key_textbox, model_selector, k_textbox, max_tokens_textbox, select_vectorstore_radio, make_embedding_radio]
338
  output_list = [agent_state]
339
  for input_item in input_list:
340
  input_item.change(
 
342
  inputs=output_list,
343
  outputs=output_list,
344
  )
345
+ all_collections_state.value = list_collections(all_collections_state, select_vectorstore_radio, select_embedding_radio)
346
  block.load(update_checkboxgroup, inputs = all_collections_state, outputs = collections_viewer)
347
  log_textbox_handler = LogTextboxHandler(gr.TextArea(interactive=False, placeholder="Logs will appear here...", visible=False))
348
  log_textbox = log_textbox_handler.textbox
 
351
  log_textbox_visibility_state.value = False
352
  log_toggle_button = gr.Button("Toggle Log", variant="secondary")
353
  log_toggle_button.click(toggle_log_textbox, inputs=[log_textbox_visibility_state], outputs=[log_textbox_visibility_state,log_textbox])
354
+
355
+ gr.HTML(
356
+ "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
357
+ )
358
  block.queue(concurrency_count=40)
359
  block.launch(debug=True)
chain.py CHANGED
@@ -17,20 +17,20 @@ from langchain.schema import BaseLanguageModel, BaseRetriever, Document
17
  from langchain.prompts.prompt import PromptTemplate
18
 
19
 
20
- # logging.basicConfig(stream=sys.stdout, level=logging.INFO)
21
- # logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
22
-
23
- def get_new_chain1(vectorstore, model_selector, k_textbox, max_tokens_textbox) -> Chain:
24
-
25
- # def _get_docs(self, question: str, inputs: Dict[str, Any]) -> List[Document]:
26
- # docs = self.retriever.vectorstore._collection.query(question, n_results=self.retriever.search_kwargs["k"], where = {"source":{"$contains":"search_string"}}, where_document = {"$contains":"search_string"})
27
- # return self._reduce_tokens_below_limit(docs)
 
28
 
29
  template = """You are called chat-pykg and are an AI assistant coded in python using langchain and gradio. You are very helpful for answering questions about various open source libraries.
30
  You are given the following extracted parts of code and a question. Provide a conversational answer to the question.
31
  Do NOT make up any hyperlinks that are not in the code.
32
  If you don't know the answer, just say that you don't know, don't try to make up an answer.
33
- If the question is not about the package documentation, politely inform them that you are tuned to only answer questions about the package documentations.
34
  Question: {question}
35
  =========
36
  {context}
@@ -48,13 +48,9 @@ def get_new_chain1(vectorstore, model_selector, k_textbox, max_tokens_textbox) -
48
 
49
  # memory = ConversationKGMemory(llm=llm, input_key="question", output_key="answer")
50
  memory = ConversationBufferWindowMemory(input_key="question", output_key="answer", k=5)
51
- retriever = vectorstore.as_retriever(search_type="similarity")
52
- if len(k_textbox) != 0:
53
- retriever.search_kwargs = {"k": int(k_textbox)}
54
- else:
55
- retriever.search_kwargs = {"k": 10}
56
  qa = ConversationalRetrievalChain(
57
- retriever=retriever, memory=memory, combine_docs_chain=doc_chain, question_generator=question_generator)
58
  # qa._get_docs = _get_docs.__get__(qa, ConversationalRetrievalChain)
59
 
60
  return qa
 
17
  from langchain.prompts.prompt import PromptTemplate
18
 
19
 
20
+ def get_new_chain1(vectorstore, vectorstore_radio, model_selector, k_textbox, search_type_selector, max_tokens_textbox) -> Chain:
21
+ retriever = None
22
+ if vectorstore_radio == 'Chroma':
23
+ retriever = vectorstore.as_retriever(search_type=search_type_selector)
24
+ retriever.search_kwargs = {"k":int(k_textbox)}
25
+ if vectorstore_radio == 'raw':
26
+ if search_type_selector == 'svm':
27
+ retriever = SVMRetriever.from_texts(merged_vectorstore, embedding_function)
28
+ retriever.k = int(k_textbox)
29
 
30
  template = """You are called chat-pykg and are an AI assistant coded in python using langchain and gradio. You are very helpful for answering questions about various open source libraries.
31
  You are given the following extracted parts of code and a question. Provide a conversational answer to the question.
32
  Do NOT make up any hyperlinks that are not in the code.
33
  If you don't know the answer, just say that you don't know, don't try to make up an answer.
 
34
  Question: {question}
35
  =========
36
  {context}
 
48
 
49
  # memory = ConversationKGMemory(llm=llm, input_key="question", output_key="answer")
50
  memory = ConversationBufferWindowMemory(input_key="question", output_key="answer", k=5)
51
+
 
 
 
 
52
  qa = ConversationalRetrievalChain(
53
+ retriever=retriever, memory=memory, combine_docs_chain=doc_chain, question_generator=question_generator, verbose=True, callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
54
  # qa._get_docs = _get_docs.__get__(qa, ConversationalRetrievalChain)
55
 
56
  return qa
ingest.py CHANGED
@@ -17,58 +17,7 @@ from pydantic import Extra, Field, root_validator
17
  import logging
18
  logger = logging.getLogger()
19
  from langchain.docstore.document import Document
20
-
21
- # class CachedChroma(Chroma, ABC):
22
- # """
23
- # Wrapper around Chroma to make caching embeddings easier.
24
-
25
- # It automatically uses a cached version of a specified collection, if available.
26
- # Example:
27
- # .. code-block:: python
28
- # from langchain.vectorstores import Chroma
29
- # from langchain.embeddings.openai import OpenAIEmbeddings
30
- # embeddings = OpenAIEmbeddings()
31
- # vectorstore = CachedChroma.from_documents_with_cache(
32
- # ".persisted_data", texts, embeddings, collection_name="fun_experiment"
33
- # )
34
- # """
35
-
36
- # @classmethod
37
- # def from_documents_with_cache(
38
- # cls,
39
- # persist_directory: str,
40
- # documents: Optional[List[Document]] = None,
41
- # embedding: Optional[Embeddings] = None,
42
- # ids: Optional[List[str]] = None,
43
- # collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
44
- # client_settings: Optional[chromadb.config.Settings] = None,
45
- # **kwargs: Any,
46
- # ) -> Chroma:
47
- # client_settings = Settings(
48
- # chroma_db_impl="duckdb+parquet",
49
- # persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
50
- # )
51
- # client = chromadb.Client(client_settings)
52
- # collection_names = [c.name for c in client.list_collections()]
53
-
54
- # if collection_name in collection_names:
55
- # return Chroma(
56
- # collection_name=collection_name,
57
- # embedding_function=embedding,
58
- # persist_directory=persist_directory,
59
- # client_settings=client_settings,
60
- # )
61
- # if documents:
62
- # return Chroma.from_documents(
63
- # documents=documents,
64
- # embedding=embedding,
65
- # ids=ids,
66
- # collection_name=collection_name,
67
- # persist_directory=persist_directory,
68
- # client_settings=client_settings,
69
- # **kwargs
70
- # )
71
- # raise ValueError("Either documents or collection_name must be specified.")
72
 
73
  def embedding_chooser(embedding_radio):
74
  if embedding_radio == "Sentence Transformers":
@@ -133,7 +82,7 @@ def get_text(content):
133
  else:
134
  return ""
135
 
136
- def ingest_docs(all_collections_state, urls, chunk_size, chunk_overlap, embedding_radio, debug=False):
137
  cleared_list = urls.copy()
138
  def sanitize_folder_name(folder_name):
139
  if folder_name != '':
@@ -164,6 +113,7 @@ def ingest_docs(all_collections_state, urls, chunk_size, chunk_overlap, embeddin
164
  if orgrepo.replace('/','-') in all_collections_state:
165
  logging.info(f"Skipping {orgrepo} as it is already in the database")
166
  continue
 
167
  documents = []
168
  paths = []
169
  paths_by_ext = {}
@@ -227,21 +177,47 @@ def ingest_docs(all_collections_state, urls, chunk_size, chunk_overlap, embeddin
227
  continue
228
  for ext in docs_by_ext.keys():
229
  if ext == "py":
230
- documents += py_splitter.split_documents(docs_by_ext[ext])
 
231
  if ext == "md":
232
- documents += md_splitter.split_documents(docs_by_ext[ext])
 
233
  # else:
234
  # documents += text_splitter.split_documents(docs_by_ext[ext]
235
- all_docs += documents
236
  # For each document, add the metadata to the page_content
 
 
 
 
 
 
237
  for doc in documents:
 
 
 
 
238
  doc.page_content = f'# source:{doc.metadata["source"]}\n{doc.page_content}'
 
239
  if type(embedding_radio) == gr.Radio:
240
  embedding_radio = embedding_radio.value
241
  persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
 
 
242
  collection_name = orgrepo.replace('/','-')
243
- collection = Chroma.from_documents(documents=documents, collection_name=collection_name, embedding=embedding_function, persist_directory=persist_directory)
244
- collection.persist()
 
 
 
 
 
 
 
 
 
 
 
245
  all_collections_state.append(collection_name)
246
  cleared_list[j][0], cleared_list[j][1] = '', ''
247
  return all_collections_state, gr.update(value=cleared_list)
 
17
  import logging
18
  logger = logging.getLogger()
19
  from langchain.docstore.document import Document
20
+ import numpy as np
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  def embedding_chooser(embedding_radio):
23
  if embedding_radio == "Sentence Transformers":
 
82
  else:
83
  return ""
84
 
85
+ def ingest_docs(all_collections_state, urls, chunk_size, chunk_overlap, vectorstore_radio, embedding_radio, debug=False):
86
  cleared_list = urls.copy()
87
  def sanitize_folder_name(folder_name):
88
  if folder_name != '':
 
113
  if orgrepo.replace('/','-') in all_collections_state:
114
  logging.info(f"Skipping {orgrepo} as it is already in the database")
115
  continue
116
+ documents_split = []
117
  documents = []
118
  paths = []
119
  paths_by_ext = {}
 
177
  continue
178
  for ext in docs_by_ext.keys():
179
  if ext == "py":
180
+ documents_split += py_splitter.split_documents(docs_by_ext[ext])
181
+ documents += docs_by_ext[ext]
182
  if ext == "md":
183
+ documents_split += md_splitter.split_documents(docs_by_ext[ext])
184
+ documents += docs_by_ext[ext]
185
  # else:
186
  # documents += text_splitter.split_documents(docs_by_ext[ext]
187
+ all_docs += documents_split
188
  # For each document, add the metadata to the page_content
189
+ for doc in documents_split:
190
+ if local_repo_path != '.':
191
+ doc.metadata["source"] = doc.metadata["source"].replace(local_repo_path, "")
192
+ if doc.metadata["source"] == '/':
193
+ doc.metadata["source"] = doc.metadata["source"][1:]
194
+ doc.page_content = f'# source:{doc.metadata["source"]}\n{doc.page_content}'
195
  for doc in documents:
196
+ if local_repo_path != '.':
197
+ doc.metadata["source"] = doc.metadata["source"].replace(local_repo_path, "")
198
+ if doc.metadata["source"] == '/':
199
+ doc.metadata["source"] = doc.metadata["source"][1:]
200
  doc.page_content = f'# source:{doc.metadata["source"]}\n{doc.page_content}'
201
+
202
  if type(embedding_radio) == gr.Radio:
203
  embedding_radio = embedding_radio.value
204
  persist_directory = os.path.join(".persisted_data", embedding_radio.replace(' ','_'))
205
+ persist_directory_raw = Path('.persisted_data_raw')
206
+ persist_directory_raw.mkdir(parents=True, exist_ok=True)
207
  collection_name = orgrepo.replace('/','-')
208
+
209
+ if vectorstore_radio == 'Chroma':
210
+ collection = Chroma.from_documents(documents=documents_split, collection_name=collection_name, embedding=embedding_function, persist_directory=persist_directory)
211
+ collection.persist()
212
+
213
+ if vectorstore_radio == 'raw':
214
+ # Persist the raw documents
215
+ docarr = np.array([doc.page_content for doc in documents_split])
216
+ np.save(os.path.join(persist_directory_raw, f"{collection_name}.npy"), docarr)
217
+ # with open(os.path.join(persist_directory_raw, f"{collection_name}"), "w") as f:
218
+ # for doc in documents:
219
+ # f.write(doc.page_content)
220
+
221
  all_collections_state.append(collection_name)
222
  cleared_list[j][0], cleared_list[j][1] = '', ''
223
  return all_collections_state, gr.update(value=cleared_list)