xangma
committed on
Commit
·
a835cf0
1
Parent(s):
969f5dc
refactor
Browse files
- .gitignore +1 -0
- app.py +129 -108
- ingest.py +85 -81
.gitignore
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
.persisted_data/*
|
2 |
downloaded/*
|
3 |
__pycache__/*
|
|
|
1 |
+
.chroma/*
|
2 |
.persisted_data/*
|
3 |
downloaded/*
|
4 |
__pycache__/*
|
app.py
CHANGED
@@ -18,12 +18,36 @@ from langchain.embeddings.base import Embeddings
|
|
18 |
import shutil
|
19 |
import random, string
|
20 |
from chain import get_new_chain1
|
21 |
-
from ingest import ingest_docs
|
22 |
|
23 |
def randomword(length):
|
24 |
letters = string.ascii_lowercase
|
25 |
return ''.join(random.choice(letters) for i in range(length))
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
def set_chain_up(openai_api_key, model_selector, k_textbox, vectorstore, agent):
|
28 |
if vectorstore == None:
|
29 |
return 'no_vectorstore'
|
@@ -40,34 +64,31 @@ def set_chain_up(openai_api_key, model_selector, k_textbox, vectorstore, agent):
|
|
40 |
qa_chain = get_new_chain1(vectorstore, model_selector, k_textbox)
|
41 |
return qa_chain
|
42 |
|
43 |
-
def
|
44 |
-
embeddings = HuggingFaceEmbeddings()
|
45 |
-
vectorstore = CachedChroma.from_documents_with_cache(persist_directory=".persisted_data", documents=None, embedding = embeddings, collection_name=collection_textbox)
|
46 |
-
return vectorstore
|
47 |
-
|
48 |
-
def make_vectorstore(chat_state,collection_name, packagedocslist, vs_state):
|
49 |
-
vectorstore = ingest_docs(collection_name, packagedocslist)
|
50 |
-
return vectorstore
|
51 |
-
|
52 |
-
def delete_vs(chat_state, collection_textbox):
|
53 |
client = chromadb.Client(Settings(
|
54 |
chroma_db_impl="duckdb+parquet",
|
55 |
persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
|
56 |
))
|
57 |
-
|
|
|
|
|
|
|
58 |
|
59 |
-
def delete_all_vs(
|
60 |
shutil.rmtree(".persisted_data")
|
61 |
-
return
|
62 |
|
63 |
-
def
|
64 |
client = chromadb.Client(Settings(
|
65 |
chroma_db_impl="duckdb+parquet",
|
66 |
persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
|
67 |
))
|
68 |
-
collection_names = [c.name for c in client.list_collections()]
|
69 |
-
|
70 |
-
|
|
|
|
|
|
|
71 |
|
72 |
def chat(inp, history, agent):
|
73 |
history = history or []
|
@@ -97,98 +118,98 @@ def chat(inp, history, agent):
|
|
97 |
block = gr.Blocks(css=".gradio-container {background-color: system;}")
|
98 |
|
99 |
with block:
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
)
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
)
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
label="What's your question?",
|
122 |
-
placeholder="What is this code?",
|
123 |
-
lines=1,
|
124 |
)
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
with gr.Column(scale=1):
|
130 |
-
randomname = randomword(5)
|
131 |
-
collection_textbox = gr.Textbox(placeholder=randomname,
|
132 |
-
label="Collection name:",
|
133 |
-
show_label=True,
|
134 |
-
lines=1,
|
135 |
)
|
136 |
-
|
137 |
-
|
138 |
-
make_vs_button = gr.Button(value="Make vectorstore", variant="secondary").style(full_width=False)
|
139 |
-
delete_vs_button = gr.Button(value="Delete vectorstore", variant="secondary").style(full_width=False)
|
140 |
-
delete_all_vs_button = gr.Button(value="Delete all vectorstores", variant="secondary").style(full_width=False)
|
141 |
-
get_all_vs_names_button = gr.Button(value="Get all vectorstore names", variant="secondary").style(full_width=False)
|
142 |
-
|
143 |
-
gr.Examples(
|
144 |
-
examples=[
|
145 |
-
"What is this code and why hasn't the developer documented it?",
|
146 |
-
"Where is this specific method in the source code and why is it broken?"
|
147 |
-
],
|
148 |
-
inputs=message,
|
149 |
-
)
|
150 |
-
|
151 |
-
gr.HTML(
|
152 |
-
"""
|
153 |
-
This simple application is an implementation of ChatGPT but over an external dataset.
|
154 |
-
The source code is split/broken down into many document objects using langchain's pythoncodetextsplitter, which apparently tries to keep whole functions etc. together. This means that each file in the source code is split into many smaller documents, and the k value is the number of documents to consider when searching for the most similar documents to the question. With gpt-3.5-turbo, k=10 seems to work well, but with gpt-4, k=20 seems to work better.
|
155 |
-
The model's memory is set to 5 messages, but I haven't tested with gpt-3.5-turbo yet to see if it works well. It seems to work well with gpt-4."""
|
156 |
-
)
|
157 |
-
|
158 |
-
gr.HTML(
|
159 |
-
"<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
|
160 |
-
)
|
161 |
-
|
162 |
-
history_state = gr.State()
|
163 |
-
agent_state = gr.State()
|
164 |
-
vs_state = gr.State()
|
165 |
-
all_collections = gr.State()
|
166 |
-
chat_state = gr.State()
|
167 |
-
|
168 |
-
submit.click(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
|
169 |
-
message.submit(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
|
170 |
-
|
171 |
-
get_vs_button.click(get_vectorstore, inputs=[chat_state,collection_textbox, vs_state], outputs=[vs_state]).then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state], outputs=[agent_state])
|
172 |
-
make_vs_button.click(make_vectorstore, inputs=[chat_state,collection_textbox, packagedocslist, vs_state], outputs=[vs_state], show_progress=True).then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state], outputs=[agent_state])
|
173 |
-
delete_vs_button.click(delete_vs, inputs=[chat_state,collection_textbox], outputs=[])
|
174 |
-
delete_all_vs_button.click(delete_all_vs, inputs=[chat_state], outputs=[chat_state]).then(chat, inputs=[all_collections, history_state, chat_state], outputs=[chatbot, history_state])
|
175 |
-
get_all_vs_names_button.click(get_all_vs_names, inputs=[chat_state], outputs=[all_collections, chat_state]).then(chat, inputs=[all_collections, history_state, chat_state], outputs=[chatbot, history_state])
|
176 |
-
|
177 |
-
#I need to also parse this code in the docstore so I can ask it to fix silly things like this below:
|
178 |
-
openai_api_key_textbox.change(
|
179 |
-
set_chain_up,
|
180 |
-
inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
|
181 |
-
outputs=[agent_state],
|
182 |
-
)
|
183 |
-
model_selector.change(
|
184 |
-
set_chain_up,
|
185 |
-
inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
|
186 |
-
outputs=[agent_state],
|
187 |
-
)
|
188 |
-
k_textbox.change(
|
189 |
-
set_chain_up,
|
190 |
-
inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
|
191 |
-
outputs=[agent_state],
|
192 |
-
)
|
193 |
-
|
194 |
block.launch(debug=True)
|
|
|
18 |
import shutil
|
19 |
import random, string
|
20 |
from chain import get_new_chain1
|
21 |
+
from ingest import ingest_docs
|
22 |
|
23 |
def randomword(length):
|
24 |
letters = string.ascii_lowercase
|
25 |
return ''.join(random.choice(letters) for i in range(length))
|
26 |
|
27 |
+
def change_tab():
|
28 |
+
return gr.Tabs.update(selected=0)
|
29 |
+
|
30 |
+
def merge_collections(collection_load_names, vs_state):
|
31 |
+
merged_documents = []
|
32 |
+
merged_embeddings = []
|
33 |
+
client = chromadb.Client(Settings(
|
34 |
+
chroma_db_impl="duckdb+parquet",
|
35 |
+
persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
|
36 |
+
))
|
37 |
+
|
38 |
+
for collection_name in collection_load_names:
|
39 |
+
collection_name = collection_name
|
40 |
+
if collection_name == '':
|
41 |
+
continue
|
42 |
+
collection = client.get_collection(collection_name)
|
43 |
+
collection = collection.get(include=["metadatas", "documents", "embeddings"])
|
44 |
+
for i in range(len(collection['documents'])):
|
45 |
+
merged_documents.append(Document(page_content=collection['documents'][i], metadata = collection['metadatas'][i]))
|
46 |
+
merged_embeddings.append(collection['embeddings'][i])
|
47 |
+
merged_collection_name = "merged_collection"
|
48 |
+
merged_vectorstore = Chroma.from_documents(documents=merged_documents, embeddings=merged_embeddings, collection_name=merged_collection_name)
|
49 |
+
return merged_vectorstore
|
50 |
+
|
51 |
def set_chain_up(openai_api_key, model_selector, k_textbox, vectorstore, agent):
|
52 |
if vectorstore == None:
|
53 |
return 'no_vectorstore'
|
|
|
64 |
qa_chain = get_new_chain1(vectorstore, model_selector, k_textbox)
|
65 |
return qa_chain
|
66 |
|
67 |
+
def delete_vs(all_collections_state, collections_viewer):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
client = chromadb.Client(Settings(
|
69 |
chroma_db_impl="duckdb+parquet",
|
70 |
persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
|
71 |
))
|
72 |
+
for collection in collections_viewer:
|
73 |
+
client.delete_collection(collection)
|
74 |
+
all_collections_state.remove(collection)
|
75 |
+
return all_collections_state
|
76 |
|
77 |
+
def delete_all_vs(all_collections_state):
|
78 |
shutil.rmtree(".persisted_data")
|
79 |
+
return []
|
80 |
|
81 |
+
def list_collections(all_collections_state):
|
82 |
client = chromadb.Client(Settings(
|
83 |
chroma_db_impl="duckdb+parquet",
|
84 |
persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
|
85 |
))
|
86 |
+
collection_names = [[c.name][0] for c in client.list_collections()]
|
87 |
+
return collection_names
|
88 |
+
|
89 |
+
def update_checkboxgroup(all_collections_state):
|
90 |
+
new_options = [i for i in all_collections_state]
|
91 |
+
return gr.CheckboxGroup.update(choices=new_options)
|
92 |
|
93 |
def chat(inp, history, agent):
|
94 |
history = history or []
|
|
|
118 |
block = gr.Blocks(css=".gradio-container {background-color: system;}")
|
119 |
|
120 |
with block:
|
121 |
+
gr.Markdown("<h3><center>chat-pykg</center></h3>")
|
122 |
+
with gr.Tabs() as tabs:
|
123 |
+
with gr.TabItem("Chat", id=0):
|
124 |
+
with gr.Row():
|
125 |
+
openai_api_key_textbox = gr.Textbox(
|
126 |
+
placeholder="Paste your OpenAI API key (sk-...)",
|
127 |
+
show_label=False,
|
128 |
+
lines=1,
|
129 |
+
type="password",
|
130 |
+
)
|
131 |
+
model_selector = gr.Dropdown(["gpt-3.5-turbo", "gpt-4", "other"], label="Model", show_label=True)
|
132 |
+
model_selector.value = "gpt-3.5-turbo"
|
133 |
+
k_textbox = gr.Textbox(
|
134 |
+
placeholder="k: Number of search results to consider",
|
135 |
+
label="Search Results k:",
|
136 |
+
show_label=True,
|
137 |
+
lines=1,
|
138 |
+
)
|
139 |
+
k_textbox.value = "10"
|
140 |
+
chatbot = gr.Chatbot()
|
141 |
+
with gr.Row():
|
142 |
+
message = gr.Textbox(
|
143 |
+
label="What's your question?",
|
144 |
+
placeholder="What is this code?",
|
145 |
+
lines=1,
|
146 |
+
)
|
147 |
+
submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
|
148 |
+
gr.Examples(
|
149 |
+
examples=[
|
150 |
+
"What is this code and why hasn't the developer documented it?",
|
151 |
+
"Where is this specific method in the source code and why is it broken?"
|
152 |
+
],
|
153 |
+
inputs=message,
|
154 |
+
)
|
155 |
+
|
156 |
+
gr.HTML(
|
157 |
+
"""
|
158 |
+
This simple application is an implementation of ChatGPT but over an external dataset.
|
159 |
+
The source code is split/broken down into many document objects using langchain's pythoncodetextsplitter, which apparently tries to keep whole functions etc. together. This means that each file in the source code is split into many smaller documents, and the k value is the number of documents to consider when searching for the most similar documents to the question. With gpt-3.5-turbo, k=10 seems to work well, but with gpt-4, k=20 seems to work better.
|
160 |
+
The model's memory is set to 5 messages, but I haven't tested with gpt-3.5-turbo yet to see if it works well. It seems to work well with gpt-4."""
|
161 |
+
)
|
162 |
+
with gr.TabItem("Collections manager", id=1):
|
163 |
+
#with gr.Row():
|
164 |
+
#collection_load_list = gr.List(headers=['Collection Loader'],row_count=5, label='Package docs URLs', show_label=True, interactive=True, max_cols=1, max_rows=5)
|
165 |
+
|
166 |
+
with gr.Row():
|
167 |
+
with gr.Column(scale=2):
|
168 |
+
all_collections_to_get = gr.List(headers=['New Collections to make'],row_count=3, label='Collections_to_get', show_label=True, interactive=True, max_cols=1, max_rows=3)
|
169 |
+
make_vs_button = gr.Button(value="Make new collection(s)", variant="secondary").style(full_width=False)
|
170 |
+
with gr.Column(scale=2):
|
171 |
+
collections_viewer = gr.CheckboxGroup(choices=[], label='Collections_viewer', show_label=True)
|
172 |
+
#all_collections_viewer = gr.List(headers=['Existing Collections Viewer'],row_count=7, label='Collections_viewer', show_label=True, max_cols=1)
|
173 |
+
with gr.Column(scale=1):
|
174 |
+
get_vs_button = gr.Button(value="Load collection(s) to chat!", variant="secondary").style(full_width=False)
|
175 |
+
get_all_vs_names_button = gr.Button(value="List all saved collections", variant="secondary").style(full_width=False)
|
176 |
+
delete_vs_button = gr.Button(value="Delete selected saved collections", variant="secondary").style(full_width=False)
|
177 |
+
delete_all_vs_button = gr.Button(value="Delete all saved collections", variant="secondary").style(full_width=False)
|
178 |
+
gr.HTML(
|
179 |
+
"<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
|
180 |
)
|
181 |
+
|
182 |
+
history_state = gr.State()
|
183 |
+
agent_state = gr.State()
|
184 |
+
vs_state = gr.State()
|
185 |
+
all_collections_state = gr.State()
|
186 |
+
chat_state = gr.State()
|
187 |
+
|
188 |
+
submit.click(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
|
189 |
+
message.submit(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
|
190 |
+
|
191 |
+
get_vs_button.click(merge_collections, inputs=[collections_viewer, vs_state], outputs=[vs_state]).then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state], outputs=[agent_state]).then(change_tab, None, tabs)
|
192 |
+
make_vs_button.click(ingest_docs, inputs=[all_collections_state, all_collections_to_get], outputs=[all_collections_state], show_progress=True).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
|
193 |
+
delete_vs_button.click(delete_vs, inputs=[all_collections_state, collections_viewer], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
|
194 |
+
delete_all_vs_button.click(delete_all_vs, inputs=[all_collections_state], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
|
195 |
+
get_all_vs_names_button.click(list_collections, inputs=[all_collections_state], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
|
196 |
+
|
197 |
+
#I need to also parse this code in the docstore so I can ask it to fix silly things like this below:
|
198 |
+
openai_api_key_textbox.change(
|
199 |
+
set_chain_up,
|
200 |
+
inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
|
201 |
+
outputs=[agent_state],
|
202 |
)
|
203 |
+
model_selector.change(
|
204 |
+
set_chain_up,
|
205 |
+
inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
|
206 |
+
outputs=[agent_state],
|
|
|
|
|
|
|
207 |
)
|
208 |
+
k_textbox.change(
|
209 |
+
set_chain_up,
|
210 |
+
inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
|
211 |
+
outputs=[agent_state],
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
)
|
213 |
+
all_collections_state.value = list_collections(all_collections_state)
|
214 |
+
block.load(update_checkboxgroup, inputs = all_collections_state, outputs = collections_viewer)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
block.launch(debug=True)
|
ingest.py
CHANGED
@@ -18,57 +18,57 @@ from langchain.docstore.document import Document
|
|
18 |
from langchain.embeddings.base import Embeddings
|
19 |
from chromadb.config import Settings
|
20 |
|
21 |
-
class CachedChroma(Chroma, ABC):
|
22 |
-
|
23 |
-
|
24 |
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
client_settings = Settings(
|
48 |
-
|
49 |
-
|
50 |
-
)
|
51 |
-
client = chromadb.Client(client_settings)
|
52 |
-
|
53 |
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
|
73 |
def get_text(content):
|
74 |
relevant_part = content.find("div", {"class": "markdown"})
|
@@ -77,29 +77,32 @@ def get_text(content):
|
|
77 |
else:
|
78 |
return ""
|
79 |
|
80 |
-
def
|
81 |
-
|
|
|
|
|
82 |
folders=[]
|
83 |
documents = []
|
84 |
shutil.rmtree('downloaded/', ignore_errors=True)
|
85 |
known_exts = ["py", "md"]
|
86 |
-
paths_by_ext = {}
|
87 |
-
docs_by_ext = {}
|
88 |
-
for ext in known_exts + ["other"]:
|
89 |
-
docs_by_ext[ext] = []
|
90 |
-
paths_by_ext[ext] = []
|
91 |
py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
|
92 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
93 |
md_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
|
94 |
for url in urls:
|
|
|
|
|
|
|
|
|
|
|
95 |
url = url[0]
|
96 |
if url == '':
|
97 |
continue
|
98 |
if "." in url:
|
|
|
99 |
if len(url) > 1:
|
100 |
-
|
101 |
else:
|
102 |
-
|
103 |
else:
|
104 |
destination = Path('downloaded/'+url)
|
105 |
destination.mkdir(exist_ok=True, parents=True)
|
@@ -135,9 +138,7 @@ def get_docs(urls):
|
|
135 |
if res.returncode == 1:
|
136 |
res = subprocess.run(["git", "checkout", "master"], cwd=temp_path)
|
137 |
res = subprocess.run(["cp", "-r", (temp_path / folder).as_posix(), '/'.join(destination.split('/')[:-1])])
|
138 |
-
|
139 |
-
|
140 |
-
for folder in folders:
|
141 |
local_repo_path_1 = folder
|
142 |
for root, dirs, files in os.walk(local_repo_path_1):
|
143 |
for file in files:
|
@@ -154,28 +155,31 @@ def get_docs(urls):
|
|
154 |
docs_by_ext[ext].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load()[0])
|
155 |
except Exception as e:
|
156 |
continue
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
#
|
175 |
-
#
|
176 |
-
#
|
|
|
|
|
|
|
177 |
|
178 |
-
return
|
179 |
|
180 |
|
181 |
if __name__ == "__main__":
|
|
|
18 |
from langchain.embeddings.base import Embeddings
|
19 |
from chromadb.config import Settings
|
20 |
|
21 |
+
# class CachedChroma(Chroma, ABC):
|
22 |
+
# """
|
23 |
+
# Wrapper around Chroma to make caching embeddings easier.
|
24 |
|
25 |
+
# It automatically uses a cached version of a specified collection, if available.
|
26 |
+
# Example:
|
27 |
+
# .. code-block:: python
|
28 |
+
# from langchain.vectorstores import Chroma
|
29 |
+
# from langchain.embeddings.openai import OpenAIEmbeddings
|
30 |
+
# embeddings = OpenAIEmbeddings()
|
31 |
+
# vectorstore = CachedChroma.from_documents_with_cache(
|
32 |
+
# ".persisted_data", texts, embeddings, collection_name="fun_experiment"
|
33 |
+
# )
|
34 |
+
# """
|
35 |
|
36 |
+
# @classmethod
|
37 |
+
# def from_documents_with_cache(
|
38 |
+
# cls,
|
39 |
+
# persist_directory: str,
|
40 |
+
# documents: Optional[List[Document]] = None,
|
41 |
+
# embedding: Optional[Embeddings] = None,
|
42 |
+
# ids: Optional[List[str]] = None,
|
43 |
+
# collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
|
44 |
+
# client_settings: Optional[chromadb.config.Settings] = None,
|
45 |
+
# **kwargs: Any,
|
46 |
+
# ) -> Chroma:
|
47 |
+
# client_settings = Settings(
|
48 |
+
# chroma_db_impl="duckdb+parquet",
|
49 |
+
# persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
|
50 |
+
# )
|
51 |
+
# client = chromadb.Client(client_settings)
|
52 |
+
# collection_names = [c.name for c in client.list_collections()]
|
53 |
|
54 |
+
# if collection_name in collection_names:
|
55 |
+
# return Chroma(
|
56 |
+
# collection_name=collection_name,
|
57 |
+
# embedding_function=embedding,
|
58 |
+
# persist_directory=persist_directory,
|
59 |
+
# client_settings=client_settings,
|
60 |
+
# )
|
61 |
+
# if documents:
|
62 |
+
# return Chroma.from_documents(
|
63 |
+
# documents=documents,
|
64 |
+
# embedding=embedding,
|
65 |
+
# ids=ids,
|
66 |
+
# collection_name=collection_name,
|
67 |
+
# persist_directory=persist_directory,
|
68 |
+
# client_settings=client_settings,
|
69 |
+
# **kwargs
|
70 |
+
# )
|
71 |
+
# raise ValueError("Either documents or collection_name must be specified.")
|
72 |
|
73 |
def get_text(content):
|
74 |
relevant_part = content.find("div", {"class": "markdown"})
|
|
|
77 |
else:
|
78 |
return ""
|
79 |
|
80 |
+
def ingest_docs(all_collections_state, urls):
|
81 |
+
"""Get documents from web pages."""
|
82 |
+
all_docs = []
|
83 |
+
local = False
|
84 |
folders=[]
|
85 |
documents = []
|
86 |
shutil.rmtree('downloaded/', ignore_errors=True)
|
87 |
known_exts = ["py", "md"]
|
|
|
|
|
|
|
|
|
|
|
88 |
py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
|
89 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
90 |
md_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
|
91 |
for url in urls:
|
92 |
+
paths_by_ext = {}
|
93 |
+
docs_by_ext = {}
|
94 |
+
for ext in known_exts + ["other"]:
|
95 |
+
docs_by_ext[ext] = []
|
96 |
+
paths_by_ext[ext] = []
|
97 |
url = url[0]
|
98 |
if url == '':
|
99 |
continue
|
100 |
if "." in url:
|
101 |
+
local = True
|
102 |
if len(url) > 1:
|
103 |
+
folder = url.split('.')[1]
|
104 |
else:
|
105 |
+
folder = '.'
|
106 |
else:
|
107 |
destination = Path('downloaded/'+url)
|
108 |
destination.mkdir(exist_ok=True, parents=True)
|
|
|
138 |
if res.returncode == 1:
|
139 |
res = subprocess.run(["git", "checkout", "master"], cwd=temp_path)
|
140 |
res = subprocess.run(["cp", "-r", (temp_path / folder).as_posix(), '/'.join(destination.split('/')[:-1])])
|
141 |
+
folder = destination
|
|
|
|
|
142 |
local_repo_path_1 = folder
|
143 |
for root, dirs, files in os.walk(local_repo_path_1):
|
144 |
for file in files:
|
|
|
155 |
docs_by_ext[ext].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load()[0])
|
156 |
except Exception as e:
|
157 |
continue
|
158 |
+
for ext in docs_by_ext.keys():
|
159 |
+
if ext == "py":
|
160 |
+
documents += py_splitter.split_documents(docs_by_ext[ext])
|
161 |
+
if ext == "md":
|
162 |
+
documents += md_splitter.split_documents(docs_by_ext[ext])
|
163 |
+
# else:
|
164 |
+
# documents += text_splitter.split_documents(docs_by_ext[ext]
|
165 |
+
all_docs += documents
|
166 |
+
embeddings = HuggingFaceEmbeddings()
|
167 |
+
if 'downloaded/' in folder:
|
168 |
+
folder = '-'.join(folder.split('/')[1:])
|
169 |
+
if folder == '.':
|
170 |
+
folder = 'chat-pykg'
|
171 |
+
vectorstore = Chroma.from_documents(persist_directory=".persisted_data", documents=documents, embedding=embeddings, collection_name=folder)
|
172 |
+
vectorstore.persist()
|
173 |
+
all_collections_state.append(folder)
|
174 |
+
return all_collections_state
|
175 |
+
# embeddings = HuggingFaceEmbeddings()
|
176 |
+
# merged_vectorstore = Chroma.from_documents(persist_directory=".persisted_data", documents=documents, embedding=embeddings, collection_name='merged_collections')
|
177 |
+
# #vectorstore = FAISS.from_documents(documents, embeddings)
|
178 |
+
# # # Save vectorstore
|
179 |
+
# # with open("vectorstore.pkl", "wb") as f:
|
180 |
+
# # pickle.dump(vectorstore. , f)
|
181 |
|
182 |
+
# return merged_vectorstore
|
183 |
|
184 |
|
185 |
if __name__ == "__main__":
|