xangma committed on
Commit
a835cf0
·
1 Parent(s): 969f5dc
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +129 -108
  3. ingest.py +85 -81
.gitignore CHANGED
@@ -1,3 +1,4 @@
 
1
  .persisted_data/*
2
  downloaded/*
3
  __pycache__/*
 
1
+ .chroma/*
2
  .persisted_data/*
3
  downloaded/*
4
  __pycache__/*
app.py CHANGED
@@ -18,12 +18,36 @@ from langchain.embeddings.base import Embeddings
18
  import shutil
19
  import random, string
20
  from chain import get_new_chain1
21
- from ingest import ingest_docs, CachedChroma
22
 
23
  def randomword(length):
24
  letters = string.ascii_lowercase
25
  return ''.join(random.choice(letters) for i in range(length))
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def set_chain_up(openai_api_key, model_selector, k_textbox, vectorstore, agent):
28
  if vectorstore == None:
29
  return 'no_vectorstore'
@@ -40,34 +64,31 @@ def set_chain_up(openai_api_key, model_selector, k_textbox, vectorstore, agent):
40
  qa_chain = get_new_chain1(vectorstore, model_selector, k_textbox)
41
  return qa_chain
42
 
43
- def get_vectorstore(chat_state, collection_textbox, vs_state):
44
- embeddings = HuggingFaceEmbeddings()
45
- vectorstore = CachedChroma.from_documents_with_cache(persist_directory=".persisted_data", documents=None, embedding = embeddings, collection_name=collection_textbox)
46
- return vectorstore
47
-
48
- def make_vectorstore(chat_state,collection_name, packagedocslist, vs_state):
49
- vectorstore = ingest_docs(collection_name, packagedocslist)
50
- return vectorstore
51
-
52
- def delete_vs(chat_state, collection_textbox):
53
  client = chromadb.Client(Settings(
54
  chroma_db_impl="duckdb+parquet",
55
  persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
56
  ))
57
- client.delete_collection(collection_textbox)
 
 
 
58
 
59
- def delete_all_vs(chat_state):
60
  shutil.rmtree(".persisted_data")
61
- return "all_vs_deleted"
62
 
63
- def get_all_vs_names(chat_state):
64
  client = chromadb.Client(Settings(
65
  chroma_db_impl="duckdb+parquet",
66
  persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
67
  ))
68
- collection_names = [c.name for c in client.list_collections()]
69
- # print the collection names to the chatbot
70
- return collection_names, "all_collections"
 
 
 
71
 
72
  def chat(inp, history, agent):
73
  history = history or []
@@ -97,98 +118,98 @@ def chat(inp, history, agent):
97
  block = gr.Blocks(css=".gradio-container {background-color: system;}")
98
 
99
  with block:
100
- with gr.Row():
101
- gr.Markdown("<h3><center>Package docs Assistant</center></h3>")
102
-
103
- openai_api_key_textbox = gr.Textbox(
104
- placeholder="Paste your OpenAI API key (sk-...)",
105
- show_label=False,
106
- lines=1,
107
- type="password",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  )
109
- model_selector = gr.Dropdown(["gpt-3.5-turbo", "gpt-4", "other"], label="Model", show_label=True)
110
- model_selector.value = "gpt-3.5-turbo"
111
- k_textbox = gr.Textbox(
112
- placeholder="k: Number of search results to consider",
113
- label="Search Results k:",
114
- show_label=True,
115
- lines=1,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  )
117
- k_textbox.value = "10"
118
- chatbot = gr.Chatbot()
119
- with gr.Row():
120
- message = gr.Textbox(
121
- label="What's your question?",
122
- placeholder="What is this code?",
123
- lines=1,
124
  )
125
- submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
126
- with gr.Row():
127
- with gr.Column(scale=4):
128
- packagedocslist = gr.List(headers=['Package Docs URL'],row_count=5, label='Package docs URLs', show_label=True, interactive=True, max_cols=1, max_rows=5)
129
- with gr.Column(scale=1):
130
- randomname = randomword(5)
131
- collection_textbox = gr.Textbox(placeholder=randomname,
132
- label="Collection name:",
133
- show_label=True,
134
- lines=1,
135
  )
136
- collection_textbox.value = randomname
137
- get_vs_button = gr.Button(value="Get vectorstore", variant="secondary").style(full_width=False)
138
- make_vs_button = gr.Button(value="Make vectorstore", variant="secondary").style(full_width=False)
139
- delete_vs_button = gr.Button(value="Delete vectorstore", variant="secondary").style(full_width=False)
140
- delete_all_vs_button = gr.Button(value="Delete all vectorstores", variant="secondary").style(full_width=False)
141
- get_all_vs_names_button = gr.Button(value="Get all vectorstore names", variant="secondary").style(full_width=False)
142
-
143
- gr.Examples(
144
- examples=[
145
- "What is this code and why hasn't the developer documented it?",
146
- "Where is this specific method in the source code and why is it broken?"
147
- ],
148
- inputs=message,
149
- )
150
-
151
- gr.HTML(
152
- """
153
- This simple application is an implementation of ChatGPT but over an external dataset.
154
- The source code is split/broken down into many document objects using langchain's pythoncodetextsplitter, which apparently tries to keep whole functions etc. together. This means that each file in the source code is split into many smaller documents, and the k value is the number of documents to consider when searching for the most similar documents to the question. With gpt-3.5-turbo, k=10 seems to work well, but with gpt-4, k=20 seems to work better.
155
- The model's memory is set to 5 messages, but I haven't tested with gpt-3.5-turbo yet to see if it works well. It seems to work well with gpt-4."""
156
- )
157
-
158
- gr.HTML(
159
- "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
160
- )
161
-
162
- history_state = gr.State()
163
- agent_state = gr.State()
164
- vs_state = gr.State()
165
- all_collections = gr.State()
166
- chat_state = gr.State()
167
-
168
- submit.click(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
169
- message.submit(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
170
-
171
- get_vs_button.click(get_vectorstore, inputs=[chat_state,collection_textbox, vs_state], outputs=[vs_state]).then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state], outputs=[agent_state])
172
- make_vs_button.click(make_vectorstore, inputs=[chat_state,collection_textbox, packagedocslist, vs_state], outputs=[vs_state], show_progress=True).then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state], outputs=[agent_state])
173
- delete_vs_button.click(delete_vs, inputs=[chat_state,collection_textbox], outputs=[])
174
- delete_all_vs_button.click(delete_all_vs, inputs=[chat_state], outputs=[chat_state]).then(chat, inputs=[all_collections, history_state, chat_state], outputs=[chatbot, history_state])
175
- get_all_vs_names_button.click(get_all_vs_names, inputs=[chat_state], outputs=[all_collections, chat_state]).then(chat, inputs=[all_collections, history_state, chat_state], outputs=[chatbot, history_state])
176
-
177
- #I need to also parse this code in the docstore so I can ask it to fix silly things like this below:
178
- openai_api_key_textbox.change(
179
- set_chain_up,
180
- inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
181
- outputs=[agent_state],
182
- )
183
- model_selector.change(
184
- set_chain_up,
185
- inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
186
- outputs=[agent_state],
187
- )
188
- k_textbox.change(
189
- set_chain_up,
190
- inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
191
- outputs=[agent_state],
192
- )
193
-
194
  block.launch(debug=True)
 
18
  import shutil
19
  import random, string
20
  from chain import get_new_chain1
21
+ from ingest import ingest_docs
22
 
23
  def randomword(length):
24
  letters = string.ascii_lowercase
25
  return ''.join(random.choice(letters) for i in range(length))
26
 
27
def change_tab():
    """Return a Gradio update that selects the tab whose id is 0.

    Wired as the last step of the "load collection(s)" button chain with
    the tab container as its output.
    """
    select_first_tab = gr.Tabs.update(selected=0)
    return select_first_tab
29
+
30
def merge_collections(collection_load_names, vs_state):
    """Build one in-memory vectorstore from several persisted Chroma collections.

    Args:
        collection_load_names: Names of the collections to merge (the values
            ticked in the collections checkbox group). Empty names are
            skipped; ``None`` is treated as "nothing selected".
        vs_state: Current vectorstore state. Unused; kept so the existing
            Gradio event wiring that passes it keeps working.

    Returns:
        A ``Chroma`` vectorstore named "merged_collection" that holds every
        document of the selected collections, or ``None`` when nothing was
        selected or the selected collections contain no documents.
    """
    selected_names = [name for name in (collection_load_names or []) if name != '']
    if not selected_names:
        # Nothing to merge: do not even open the database.
        return None

    client = chromadb.Client(Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
    ))

    merged_documents = []
    for collection_name in selected_names:
        stored = client.get_collection(collection_name).get(include=["metadatas", "documents"])
        merged_documents.extend(
            # A missing metadata entry is replaced by an empty dict so that
            # Document construction does not fail on it.
            Document(page_content=text, metadata=metadata or {})
            for text, metadata in zip(stored['documents'], stored['metadatas'])
        )
    if not merged_documents:
        return None

    # `Chroma.from_documents` expects an `embedding` model; the previous
    # `embeddings=<list of stored vectors>` keyword is not one of its
    # parameters. HuggingFaceEmbeddings is the model ingest.py uses when it
    # builds the collections.
    # NOTE(review): confirm HuggingFaceEmbeddings is still imported in app.py.
    return Chroma.from_documents(
        documents=merged_documents,
        embedding=HuggingFaceEmbeddings(),
        collection_name="merged_collection",
    )
50
+
51
  def set_chain_up(openai_api_key, model_selector, k_textbox, vectorstore, agent):
52
  if vectorstore == None:
53
  return 'no_vectorstore'
 
64
  qa_chain = get_new_chain1(vectorstore, model_selector, k_textbox)
65
  return qa_chain
66
 
67
def delete_vs(all_collections_state, collections_viewer):
    """Delete the selected collections from the persisted Chroma database.

    Args:
        all_collections_state: Collection names currently tracked by the UI
            (``None`` is treated as an empty list).
        collections_viewer: Collection names selected for deletion
            (``None`` is treated as an empty selection).

    Returns:
        A new list containing the tracked names that were not deleted. The
        list passed in is left unmodified.
    """
    tracked = list(all_collections_state or [])
    selected = list(collections_viewer or [])
    if not selected:
        # Nothing ticked: no reason to open the database.
        return tracked

    client = chromadb.Client(Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
    ))
    for collection in selected:
        client.delete_collection(collection)

    # Filter instead of list.remove(): remove() raises ValueError for a name
    # that is not tracked and only drops the first of several duplicates.
    deleted = set(selected)
    return [name for name in tracked if name not in deleted]
76
 
77
def delete_all_vs(all_collections_state):
    """Delete every persisted collection by removing the ".persisted_data" directory.

    Args:
        all_collections_state: Collection names currently tracked by the UI.
            Unused; kept for the Gradio event wiring.

    Returns:
        An empty list, the new value for the tracked collection names.
    """
    try:
        shutil.rmtree(".persisted_data")
    except FileNotFoundError:
        # Nothing has been persisted yet, or it was already deleted (e.g. the
        # button was clicked twice); the end state is the same.
        pass
    return []
80
 
81
def list_collections(all_collections_state):
    """Return the names of all collections stored in ".persisted_data".

    Args:
        all_collections_state: Unused. Kept because the Gradio event wiring
            and the start-up call both pass it.

    Returns:
        A list with the ``name`` of every collection the Chroma client reports.
    """
    client = chromadb.Client(Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
    ))
    # Read the name directly; wrapping it in a one-element list and indexing
    # it again added nothing.
    return [collection.name for collection in client.list_collections()]
88
+
89
def update_checkboxgroup(all_collections_state):
    """Return a Gradio update that sets the checkbox group's choices.

    Args:
        all_collections_state: Iterable of collection names to offer as
            choices; ``None`` is treated as "no collections".

    Returns:
        The result of ``gr.CheckboxGroup.update`` with ``choices`` set to a
        fresh list of the given names.
    """
    # Copy so the component never shares the state's list object.
    new_options = list(all_collections_state or [])
    return gr.CheckboxGroup.update(choices=new_options)
92
 
93
  def chat(inp, history, agent):
94
  history = history or []
 
118
  block = gr.Blocks(css=".gradio-container {background-color: system;}")
119
 
120
  with block:
121
+ gr.Markdown("<h3><center>chat-pykg</center></h3>")
122
+ with gr.Tabs() as tabs:
123
+ with gr.TabItem("Chat", id=0):
124
+ with gr.Row():
125
+ openai_api_key_textbox = gr.Textbox(
126
+ placeholder="Paste your OpenAI API key (sk-...)",
127
+ show_label=False,
128
+ lines=1,
129
+ type="password",
130
+ )
131
+ model_selector = gr.Dropdown(["gpt-3.5-turbo", "gpt-4", "other"], label="Model", show_label=True)
132
+ model_selector.value = "gpt-3.5-turbo"
133
+ k_textbox = gr.Textbox(
134
+ placeholder="k: Number of search results to consider",
135
+ label="Search Results k:",
136
+ show_label=True,
137
+ lines=1,
138
+ )
139
+ k_textbox.value = "10"
140
+ chatbot = gr.Chatbot()
141
+ with gr.Row():
142
+ message = gr.Textbox(
143
+ label="What's your question?",
144
+ placeholder="What is this code?",
145
+ lines=1,
146
+ )
147
+ submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
148
+ gr.Examples(
149
+ examples=[
150
+ "What is this code and why hasn't the developer documented it?",
151
+ "Where is this specific method in the source code and why is it broken?"
152
+ ],
153
+ inputs=message,
154
+ )
155
+
156
+ gr.HTML(
157
+ """
158
+ This simple application is an implementation of ChatGPT but over an external dataset.
159
+ The source code is split/broken down into many document objects using langchain's pythoncodetextsplitter, which apparently tries to keep whole functions etc. together. This means that each file in the source code is split into many smaller documents, and the k value is the number of documents to consider when searching for the most similar documents to the question. With gpt-3.5-turbo, k=10 seems to work well, but with gpt-4, k=20 seems to work better.
160
+ The model's memory is set to 5 messages, but I haven't tested with gpt-3.5-turbo yet to see if it works well. It seems to work well with gpt-4."""
161
+ )
162
+ with gr.TabItem("Collections manager", id=1):
163
+ #with gr.Row():
164
+ #collection_load_list = gr.List(headers=['Collection Loader'],row_count=5, label='Package docs URLs', show_label=True, interactive=True, max_cols=1, max_rows=5)
165
+
166
+ with gr.Row():
167
+ with gr.Column(scale=2):
168
+ all_collections_to_get = gr.List(headers=['New Collections to make'],row_count=3, label='Collections_to_get', show_label=True, interactive=True, max_cols=1, max_rows=3)
169
+ make_vs_button = gr.Button(value="Make new collection(s)", variant="secondary").style(full_width=False)
170
+ with gr.Column(scale=2):
171
+ collections_viewer = gr.CheckboxGroup(choices=[], label='Collections_viewer', show_label=True)
172
+ #all_collections_viewer = gr.List(headers=['Existing Collections Viewer'],row_count=7, label='Collections_viewer', show_label=True, max_cols=1)
173
+ with gr.Column(scale=1):
174
+ get_vs_button = gr.Button(value="Load collection(s) to chat!", variant="secondary").style(full_width=False)
175
+ get_all_vs_names_button = gr.Button(value="List all saved collections", variant="secondary").style(full_width=False)
176
+ delete_vs_button = gr.Button(value="Delete selected saved collections", variant="secondary").style(full_width=False)
177
+ delete_all_vs_button = gr.Button(value="Delete all saved collections", variant="secondary").style(full_width=False)
178
+ gr.HTML(
179
+ "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
180
  )
181
+
182
+ history_state = gr.State()
183
+ agent_state = gr.State()
184
+ vs_state = gr.State()
185
+ all_collections_state = gr.State()
186
+ chat_state = gr.State()
187
+
188
+ submit.click(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
189
+ message.submit(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
190
+
191
+ get_vs_button.click(merge_collections, inputs=[collections_viewer, vs_state], outputs=[vs_state]).then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state], outputs=[agent_state]).then(change_tab, None, tabs)
192
+ make_vs_button.click(ingest_docs, inputs=[all_collections_state, all_collections_to_get], outputs=[all_collections_state], show_progress=True).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
193
+ delete_vs_button.click(delete_vs, inputs=[all_collections_state, collections_viewer], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
194
+ delete_all_vs_button.click(delete_all_vs, inputs=[all_collections_state], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
195
+ get_all_vs_names_button.click(list_collections, inputs=[all_collections_state], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
196
+
197
+ #I need to also parse this code in the docstore so I can ask it to fix silly things like this below:
198
+ openai_api_key_textbox.change(
199
+ set_chain_up,
200
+ inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
201
+ outputs=[agent_state],
202
  )
203
+ model_selector.change(
204
+ set_chain_up,
205
+ inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
206
+ outputs=[agent_state],
 
 
 
207
  )
208
+ k_textbox.change(
209
+ set_chain_up,
210
+ inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
211
+ outputs=[agent_state],
 
 
 
 
 
 
212
  )
213
+ all_collections_state.value = list_collections(all_collections_state)
214
+ block.load(update_checkboxgroup, inputs = all_collections_state, outputs = collections_viewer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  block.launch(debug=True)
ingest.py CHANGED
@@ -18,57 +18,57 @@ from langchain.docstore.document import Document
18
  from langchain.embeddings.base import Embeddings
19
  from chromadb.config import Settings
20
 
21
- class CachedChroma(Chroma, ABC):
22
- """
23
- Wrapper around Chroma to make caching embeddings easier.
24
 
25
- It automatically uses a cached version of a specified collection, if available.
26
- Example:
27
- .. code-block:: python
28
- from langchain.vectorstores import Chroma
29
- from langchain.embeddings.openai import OpenAIEmbeddings
30
- embeddings = OpenAIEmbeddings()
31
- vectorstore = CachedChroma.from_documents_with_cache(
32
- ".persisted_data", texts, embeddings, collection_name="fun_experiment"
33
- )
34
- """
35
 
36
- @classmethod
37
- def from_documents_with_cache(
38
- cls,
39
- persist_directory: str,
40
- documents: Optional[List[Document]] = None,
41
- embedding: Optional[Embeddings] = None,
42
- ids: Optional[List[str]] = None,
43
- collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
44
- client_settings: Optional[chromadb.config.Settings] = None,
45
- **kwargs: Any,
46
- ) -> Chroma:
47
- client_settings = Settings(
48
- chroma_db_impl="duckdb+parquet",
49
- persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
50
- )
51
- client = chromadb.Client(client_settings)
52
- collection_names = [c.name for c in client.list_collections()]
53
 
54
- if collection_name in collection_names:
55
- return Chroma(
56
- collection_name=collection_name,
57
- embedding_function=embedding,
58
- persist_directory=persist_directory,
59
- client_settings=client_settings,
60
- )
61
- if documents:
62
- return Chroma.from_documents(
63
- documents=documents,
64
- embedding=embedding,
65
- ids=ids,
66
- collection_name=collection_name,
67
- persist_directory=persist_directory,
68
- client_settings=client_settings,
69
- **kwargs
70
- )
71
- raise ValueError("Either documents or collection_name must be specified.")
72
 
73
  def get_text(content):
74
  relevant_part = content.find("div", {"class": "markdown"})
@@ -77,29 +77,32 @@ def get_text(content):
77
  else:
78
  return ""
79
 
80
- def get_docs(urls):
81
- cwd = os.getcwd()
 
 
82
  folders=[]
83
  documents = []
84
  shutil.rmtree('downloaded/', ignore_errors=True)
85
  known_exts = ["py", "md"]
86
- paths_by_ext = {}
87
- docs_by_ext = {}
88
- for ext in known_exts + ["other"]:
89
- docs_by_ext[ext] = []
90
- paths_by_ext[ext] = []
91
  py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
92
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
93
  md_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
94
  for url in urls:
 
 
 
 
 
95
  url = url[0]
96
  if url == '':
97
  continue
98
  if "." in url:
 
99
  if len(url) > 1:
100
- folders.append(url.split('.')[1])
101
  else:
102
- folders.append('.')
103
  else:
104
  destination = Path('downloaded/'+url)
105
  destination.mkdir(exist_ok=True, parents=True)
@@ -135,9 +138,7 @@ def get_docs(urls):
135
  if res.returncode == 1:
136
  res = subprocess.run(["git", "checkout", "master"], cwd=temp_path)
137
  res = subprocess.run(["cp", "-r", (temp_path / folder).as_posix(), '/'.join(destination.split('/')[:-1])])
138
- folders.append(destination)
139
-
140
- for folder in folders:
141
  local_repo_path_1 = folder
142
  for root, dirs, files in os.walk(local_repo_path_1):
143
  for file in files:
@@ -154,28 +155,31 @@ def get_docs(urls):
154
  docs_by_ext[ext].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load()[0])
155
  except Exception as e:
156
  continue
157
- for ext in docs_by_ext.keys():
158
- if ext == "py":
159
- documents += py_splitter.split_documents(docs_by_ext[ext])
160
- if ext == "md":
161
- documents += md_splitter.split_documents(docs_by_ext[ext])
162
- # else:
163
- # documents += text_splitter.split_documents(docs_by_ext[ext]
164
- return documents
165
-
166
- def ingest_docs(collection_name, urls=[]):
167
- """Get documents from web pages."""
168
-
169
- documents = get_docs(urls)
170
- embeddings = HuggingFaceEmbeddings()
171
- vectorstore = CachedChroma.from_documents_with_cache(persist_directory=".persisted_data", documents=documents, embedding=embeddings, collection_name=collection_name)
172
- vectorstore.persist()
173
- #vectorstore = FAISS.from_documents(documents, embeddings)
174
- # # Save vectorstore
175
- # with open("vectorstore.pkl", "wb") as f:
176
- # pickle.dump(vectorstore. , f)
 
 
 
177
 
178
- return vectorstore
179
 
180
 
181
  if __name__ == "__main__":
 
18
  from langchain.embeddings.base import Embeddings
19
  from chromadb.config import Settings
20
 
21
+ # class CachedChroma(Chroma, ABC):
22
+ # """
23
+ # Wrapper around Chroma to make caching embeddings easier.
24
 
25
+ # It automatically uses a cached version of a specified collection, if available.
26
+ # Example:
27
+ # .. code-block:: python
28
+ # from langchain.vectorstores import Chroma
29
+ # from langchain.embeddings.openai import OpenAIEmbeddings
30
+ # embeddings = OpenAIEmbeddings()
31
+ # vectorstore = CachedChroma.from_documents_with_cache(
32
+ # ".persisted_data", texts, embeddings, collection_name="fun_experiment"
33
+ # )
34
+ # """
35
 
36
+ # @classmethod
37
+ # def from_documents_with_cache(
38
+ # cls,
39
+ # persist_directory: str,
40
+ # documents: Optional[List[Document]] = None,
41
+ # embedding: Optional[Embeddings] = None,
42
+ # ids: Optional[List[str]] = None,
43
+ # collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
44
+ # client_settings: Optional[chromadb.config.Settings] = None,
45
+ # **kwargs: Any,
46
+ # ) -> Chroma:
47
+ # client_settings = Settings(
48
+ # chroma_db_impl="duckdb+parquet",
49
+ # persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
50
+ # )
51
+ # client = chromadb.Client(client_settings)
52
+ # collection_names = [c.name for c in client.list_collections()]
53
 
54
+ # if collection_name in collection_names:
55
+ # return Chroma(
56
+ # collection_name=collection_name,
57
+ # embedding_function=embedding,
58
+ # persist_directory=persist_directory,
59
+ # client_settings=client_settings,
60
+ # )
61
+ # if documents:
62
+ # return Chroma.from_documents(
63
+ # documents=documents,
64
+ # embedding=embedding,
65
+ # ids=ids,
66
+ # collection_name=collection_name,
67
+ # persist_directory=persist_directory,
68
+ # client_settings=client_settings,
69
+ # **kwargs
70
+ # )
71
+ # raise ValueError("Either documents or collection_name must be specified.")
72
 
73
  def get_text(content):
74
  relevant_part = content.find("div", {"class": "markdown"})
 
77
  else:
78
  return ""
79
 
80
+ def ingest_docs(all_collections_state, urls):
81
+ """Get documents from web pages."""
82
+ all_docs = []
83
+ local = False
84
  folders=[]
85
  documents = []
86
  shutil.rmtree('downloaded/', ignore_errors=True)
87
  known_exts = ["py", "md"]
 
 
 
 
 
88
  py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
89
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
90
  md_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
91
  for url in urls:
92
+ paths_by_ext = {}
93
+ docs_by_ext = {}
94
+ for ext in known_exts + ["other"]:
95
+ docs_by_ext[ext] = []
96
+ paths_by_ext[ext] = []
97
  url = url[0]
98
  if url == '':
99
  continue
100
  if "." in url:
101
+ local = True
102
  if len(url) > 1:
103
+ folder = url.split('.')[1]
104
  else:
105
+ folder = '.'
106
  else:
107
  destination = Path('downloaded/'+url)
108
  destination.mkdir(exist_ok=True, parents=True)
 
138
  if res.returncode == 1:
139
  res = subprocess.run(["git", "checkout", "master"], cwd=temp_path)
140
  res = subprocess.run(["cp", "-r", (temp_path / folder).as_posix(), '/'.join(destination.split('/')[:-1])])
141
+ folder = destination
 
 
142
  local_repo_path_1 = folder
143
  for root, dirs, files in os.walk(local_repo_path_1):
144
  for file in files:
 
155
  docs_by_ext[ext].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load()[0])
156
  except Exception as e:
157
  continue
158
+ for ext in docs_by_ext.keys():
159
+ if ext == "py":
160
+ documents += py_splitter.split_documents(docs_by_ext[ext])
161
+ if ext == "md":
162
+ documents += md_splitter.split_documents(docs_by_ext[ext])
163
+ # else:
164
+ # documents += text_splitter.split_documents(docs_by_ext[ext]
165
+ all_docs += documents
166
+ embeddings = HuggingFaceEmbeddings()
167
+ if 'downloaded/' in folder:
168
+ folder = '-'.join(folder.split('/')[1:])
169
+ if folder == '.':
170
+ folder = 'chat-pykg'
171
+ vectorstore = Chroma.from_documents(persist_directory=".persisted_data", documents=documents, embedding=embeddings, collection_name=folder)
172
+ vectorstore.persist()
173
+ all_collections_state.append(folder)
174
+ return all_collections_state
175
+ # embeddings = HuggingFaceEmbeddings()
176
+ # merged_vectorstore = Chroma.from_documents(persist_directory=".persisted_data", documents=documents, embedding=embeddings, collection_name='merged_collections')
177
+ # #vectorstore = FAISS.from_documents(documents, embeddings)
178
+ # # # Save vectorstore
179
+ # # with open("vectorstore.pkl", "wb") as f:
180
+ # # pickle.dump(vectorstore. , f)
181
 
182
+ # return merged_vectorstore
183
 
184
 
185
  if __name__ == "__main__":