Spaces:

xangma
/

chat-pykg

Runtime error

App Files Community

xangma committed on Apr 12, 2023

Commit

a835cf0

1 Parent(s): 969f5dc

refactor

Browse files

Files changed (3) hide show

.gitignore +1 -0
app.py +129 -108
ingest.py +85 -81

.gitignore CHANGED Viewed

@@ -1,3 +1,4 @@
 .persisted_data/*
 downloaded/*
 __pycache__/*

+.chroma/*
 .persisted_data/*
 downloaded/*
 __pycache__/*

app.py CHANGED Viewed

@@ -18,12 +18,36 @@ from langchain.embeddings.base import Embeddings
 import shutil
 import random, string
 from chain import get_new_chain1
-from ingest import ingest_docs, CachedChroma
 def randomword(length):
     letters = string.ascii_lowercase
     return ''.join(random.choice(letters) for i in range(length))
 def set_chain_up(openai_api_key, model_selector, k_textbox, vectorstore, agent):
     if vectorstore == None:
         return 'no_vectorstore'
@@ -40,34 +64,31 @@ def set_chain_up(openai_api_key, model_selector, k_textbox, vectorstore, agent):
             qa_chain = get_new_chain1(vectorstore, model_selector, k_textbox)
             return qa_chain
-def get_vectorstore(chat_state, collection_textbox, vs_state):
-    embeddings = HuggingFaceEmbeddings()
-    vectorstore = CachedChroma.from_documents_with_cache(persist_directory=".persisted_data", documents=None, embedding = embeddings, collection_name=collection_textbox)
-    return vectorstore
-def make_vectorstore(chat_state,collection_name, packagedocslist, vs_state):
-    vectorstore = ingest_docs(collection_name, packagedocslist)
-    return vectorstore
-def delete_vs(chat_state, collection_textbox):
     client = chromadb.Client(Settings(
         chroma_db_impl="duckdb+parquet",
         persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
     ))
-    client.delete_collection(collection_textbox)
-def delete_all_vs(chat_state):
     shutil.rmtree(".persisted_data")
-    return "all_vs_deleted"
-def get_all_vs_names(chat_state):
     client = chromadb.Client(Settings(
         chroma_db_impl="duckdb+parquet",
         persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
     ))
-    collection_names = [c.name for c in client.list_collections()]
-    # print the collection names to the chatbot
-    return collection_names, "all_collections"
 def chat(inp, history, agent):
     history = history or []
@@ -97,98 +118,98 @@ def chat(inp, history, agent):
 block = gr.Blocks(css=".gradio-container {background-color: system;}")
 with block:
-    with gr.Row():
-        gr.Markdown("<h3><center>Package docs Assistant</center></h3>")
-        openai_api_key_textbox = gr.Textbox(
-            placeholder="Paste your OpenAI API key (sk-...)",
-            show_label=False,
-            lines=1,
-            type="password",
         )
-        model_selector = gr.Dropdown(["gpt-3.5-turbo", "gpt-4", "other"], label="Model", show_label=True)
-        model_selector.value = "gpt-3.5-turbo"
-        k_textbox = gr.Textbox(
-            placeholder="k: Number of search results to consider",
-            label="Search Results k:",
-            show_label=True,
-            lines=1,
         )
-        k_textbox.value = "10"
-    chatbot = gr.Chatbot()
-    with gr.Row():
-        message = gr.Textbox(
-            label="What's your question?",
-            placeholder="What is this code?",
-            lines=1,
         )
-        submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
-    with gr.Row():
-        with gr.Column(scale=4):
-            packagedocslist = gr.List(headers=['Package Docs URL'],row_count=5, label='Package docs URLs', show_label=True, interactive=True, max_cols=1, max_rows=5)
-        with gr.Column(scale=1):
-            randomname = randomword(5)
-            collection_textbox = gr.Textbox(placeholder=randomname,
-            label="Collection name:",
-            show_label=True,
-            lines=1,
         )
-            collection_textbox.value = randomname
-            get_vs_button = gr.Button(value="Get vectorstore", variant="secondary").style(full_width=False)
-            make_vs_button = gr.Button(value="Make vectorstore", variant="secondary").style(full_width=False)
-            delete_vs_button = gr.Button(value="Delete vectorstore", variant="secondary").style(full_width=False)
-            delete_all_vs_button = gr.Button(value="Delete all vectorstores", variant="secondary").style(full_width=False)
-            get_all_vs_names_button = gr.Button(value="Get all vectorstore names", variant="secondary").style(full_width=False)
-    gr.Examples(
-        examples=[
-            "What is this code and why hasn't the developer documented it?",
-            "Where is this specific method in the source code and why is it broken?"
-        ],
-        inputs=message,
-    )
-    gr.HTML(
-        """
-    This simple application is an implementation of ChatGPT but over an external dataset.
-    The source code is split/broken down into many document objects using langchain's pythoncodetextsplitter, which apparently tries to keep whole functions etc. together. This means that each file in the source code is split into many smaller documents, and the k value is the number of documents to consider when searching for the most similar documents to the question. With gpt-3.5-turbo, k=10 seems to work well, but with gpt-4, k=20 seems to work better.
-    The model's memory is set to 5 messages, but I haven't tested with gpt-3.5-turbo yet to see if it works well. It seems to work well with gpt-4."""
-    )
-    gr.HTML(
-        "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
-    )
-    history_state = gr.State()
-    agent_state = gr.State()
-    vs_state = gr.State()
-    all_collections = gr.State()
-    chat_state = gr.State()
-    submit.click(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
-    message.submit(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
-    get_vs_button.click(get_vectorstore, inputs=[chat_state,collection_textbox, vs_state], outputs=[vs_state]).then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state], outputs=[agent_state])
-    make_vs_button.click(make_vectorstore, inputs=[chat_state,collection_textbox, packagedocslist, vs_state], outputs=[vs_state], show_progress=True).then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state], outputs=[agent_state])
-    delete_vs_button.click(delete_vs, inputs=[chat_state,collection_textbox], outputs=[])
-    delete_all_vs_button.click(delete_all_vs, inputs=[chat_state], outputs=[chat_state]).then(chat, inputs=[all_collections, history_state, chat_state], outputs=[chatbot, history_state])
-    get_all_vs_names_button.click(get_all_vs_names, inputs=[chat_state], outputs=[all_collections, chat_state]).then(chat, inputs=[all_collections, history_state, chat_state], outputs=[chatbot, history_state])
-    #I need to also parse this code in the docstore so I can ask it to fix silly things like this below:
-    openai_api_key_textbox.change(
-        set_chain_up,
-        inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
-        outputs=[agent_state],
-    )
-    model_selector.change(
-        set_chain_up,
-        inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
-        outputs=[agent_state],
-    )
-    k_textbox.change(
-        set_chain_up,
-        inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
-        outputs=[agent_state],
-    )
 block.launch(debug=True)

 import shutil
 import random, string
 from chain import get_new_chain1
+from ingest import ingest_docs
 def randomword(length):
     letters = string.ascii_lowercase
     return ''.join(random.choice(letters) for i in range(length))
+def change_tab():
+    return gr.Tabs.update(selected=0)
+def merge_collections(collection_load_names, vs_state):
+    merged_documents = []
+    merged_embeddings = []
+    client = chromadb.Client(Settings(
+        chroma_db_impl="duckdb+parquet",
+        persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
+    ))
+    for collection_name in collection_load_names:
+        collection_name = collection_name
+        if collection_name == '':
+            continue
+        collection = client.get_collection(collection_name)
+        collection = collection.get(include=["metadatas", "documents", "embeddings"])
+        for i in range(len(collection['documents'])):
+            merged_documents.append(Document(page_content=collection['documents'][i], metadata = collection['metadatas'][i]))
+            merged_embeddings.append(collection['embeddings'][i])
+    merged_collection_name = "merged_collection"
+    merged_vectorstore = Chroma.from_documents(documents=merged_documents, embeddings=merged_embeddings, collection_name=merged_collection_name)
+    return merged_vectorstore
 def set_chain_up(openai_api_key, model_selector, k_textbox, vectorstore, agent):
     if vectorstore == None:
         return 'no_vectorstore'
             qa_chain = get_new_chain1(vectorstore, model_selector, k_textbox)
             return qa_chain
+def delete_vs(all_collections_state, collections_viewer):
     client = chromadb.Client(Settings(
         chroma_db_impl="duckdb+parquet",
         persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
     ))
+    for collection in collections_viewer:
+        client.delete_collection(collection)
+        all_collections_state.remove(collection)
+    return all_collections_state
+def delete_all_vs(all_collections_state):
     shutil.rmtree(".persisted_data")
+    return []
+def list_collections(all_collections_state):
     client = chromadb.Client(Settings(
         chroma_db_impl="duckdb+parquet",
         persist_directory=".persisted_data" # Optional, defaults to .chromadb/ in the current directory
     ))
+    collection_names = [[c.name][0] for c in client.list_collections()]
+    return collection_names
+def update_checkboxgroup(all_collections_state):
+    new_options = [i for i in all_collections_state]
+    return gr.CheckboxGroup.update(choices=new_options)
 def chat(inp, history, agent):
     history = history or []
 block = gr.Blocks(css=".gradio-container {background-color: system;}")
 with block:
+    gr.Markdown("<h3><center>chat-pykg</center></h3>")
+    with gr.Tabs() as tabs:
+        with gr.TabItem("Chat", id=0):
+            with gr.Row():
+                openai_api_key_textbox = gr.Textbox(
+                    placeholder="Paste your OpenAI API key (sk-...)",
+                    show_label=False,
+                    lines=1,
+                    type="password",
+                )
+                model_selector = gr.Dropdown(["gpt-3.5-turbo", "gpt-4", "other"], label="Model", show_label=True)
+                model_selector.value = "gpt-3.5-turbo"
+                k_textbox = gr.Textbox(
+                    placeholder="k: Number of search results to consider",
+                    label="Search Results k:",
+                    show_label=True,
+                    lines=1,
+                )
+                k_textbox.value = "10"
+            chatbot = gr.Chatbot()
+            with gr.Row():
+                message = gr.Textbox(
+                    label="What's your question?",
+                    placeholder="What is this code?",
+                    lines=1,
+                )
+                submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
+            gr.Examples(
+                examples=[
+                    "What is this code and why hasn't the developer documented it?",
+                    "Where is this specific method in the source code and why is it broken?"
+                ],
+                inputs=message,
+            )
+            gr.HTML(
+                """
+            This simple application is an implementation of ChatGPT but over an external dataset.
+            The source code is split/broken down into many document objects using langchain's pythoncodetextsplitter, which apparently tries to keep whole functions etc. together. This means that each file in the source code is split into many smaller documents, and the k value is the number of documents to consider when searching for the most similar documents to the question. With gpt-3.5-turbo, k=10 seems to work well, but with gpt-4, k=20 seems to work better.
+            The model's memory is set to 5 messages, but I haven't tested with gpt-3.5-turbo yet to see if it works well. It seems to work well with gpt-4."""
+            )
+        with gr.TabItem("Collections manager", id=1):
+            #with gr.Row():
+                    #collection_load_list = gr.List(headers=['Collection Loader'],row_count=5, label='Package docs URLs', show_label=True, interactive=True, max_cols=1, max_rows=5)
+            with gr.Row():
+                with gr.Column(scale=2):
+                    all_collections_to_get = gr.List(headers=['New Collections to make'],row_count=3, label='Collections_to_get', show_label=True, interactive=True, max_cols=1, max_rows=3)
+                    make_vs_button = gr.Button(value="Make new collection(s)", variant="secondary").style(full_width=False)
+                with gr.Column(scale=2):
+                    collections_viewer = gr.CheckboxGroup(choices=[], label='Collections_viewer', show_label=True)
+                    #all_collections_viewer = gr.List(headers=['Existing Collections Viewer'],row_count=7, label='Collections_viewer', show_label=True, max_cols=1)
+                with gr.Column(scale=1):
+                    get_vs_button = gr.Button(value="Load collection(s) to chat!", variant="secondary").style(full_width=False)
+                    get_all_vs_names_button = gr.Button(value="List all saved collections", variant="secondary").style(full_width=False)
+                    delete_vs_button = gr.Button(value="Delete selected saved collections", variant="secondary").style(full_width=False)
+                    delete_all_vs_button = gr.Button(value="Delete all saved collections", variant="secondary").style(full_width=False)
+        gr.HTML(
+            "<center>Powered by <a href='https://github.com/hwchase17/langchain'>LangChain 🦜️🔗</a></center>"
         )
+        history_state = gr.State()
+        agent_state = gr.State()
+        vs_state = gr.State()
+        all_collections_state = gr.State()
+        chat_state = gr.State()
+        submit.click(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
+        message.submit(chat, inputs=[message, history_state, agent_state], outputs=[chatbot, history_state])
+        get_vs_button.click(merge_collections, inputs=[collections_viewer, vs_state], outputs=[vs_state]).then(set_chain_up, inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state], outputs=[agent_state]).then(change_tab, None, tabs)
+        make_vs_button.click(ingest_docs, inputs=[all_collections_state, all_collections_to_get], outputs=[all_collections_state], show_progress=True).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
+        delete_vs_button.click(delete_vs, inputs=[all_collections_state, collections_viewer], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
+        delete_all_vs_button.click(delete_all_vs, inputs=[all_collections_state], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
+        get_all_vs_names_button.click(list_collections, inputs=[all_collections_state], outputs=[all_collections_state]).then(update_checkboxgroup, inputs = [all_collections_state], outputs = [collections_viewer])
+        #I need to also parse this code in the docstore so I can ask it to fix silly things like this below:
+        openai_api_key_textbox.change(
+            set_chain_up,
+            inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
+            outputs=[agent_state],
         )
+        model_selector.change(
+            set_chain_up,
+            inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
+            outputs=[agent_state],
         )
+        k_textbox.change(
+            set_chain_up,
+            inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
+            outputs=[agent_state],
         )
+        all_collections_state.value = list_collections(all_collections_state)
+        block.load(update_checkboxgroup, inputs = all_collections_state, outputs = collections_viewer)
 block.launch(debug=True)

ingest.py CHANGED Viewed

@@ -18,57 +18,57 @@ from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
 from chromadb.config import Settings
-class CachedChroma(Chroma, ABC):
-    """
-    Wrapper around Chroma to make caching embeddings easier.
-    It automatically uses a cached version of a specified collection, if available.
-        Example:
-            .. code-block:: python
-                    from langchain.vectorstores import Chroma
-                    from langchain.embeddings.openai import OpenAIEmbeddings
-                    embeddings = OpenAIEmbeddings()
-                    vectorstore = CachedChroma.from_documents_with_cache(
-                        ".persisted_data", texts, embeddings, collection_name="fun_experiment"
-                    )
-        """
-    @classmethod
-    def from_documents_with_cache(
-            cls,
-            persist_directory: str,
-            documents: Optional[List[Document]] = None,
-            embedding: Optional[Embeddings] = None,
-            ids: Optional[List[str]] = None,
-            collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
-            client_settings: Optional[chromadb.config.Settings] = None,
-            **kwargs: Any,
-    ) -> Chroma:
-        client_settings = Settings(
-            chroma_db_impl="duckdb+parquet",
-            persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
-        )
-        client = chromadb.Client(client_settings)
-        collection_names = [c.name for c in client.list_collections()]
-        if collection_name in collection_names:
-            return Chroma(
-                collection_name=collection_name,
-                embedding_function=embedding,
-                persist_directory=persist_directory,
-                client_settings=client_settings,
-            )
-        if documents:
-            return Chroma.from_documents(
-                documents=documents,
-                embedding=embedding,
-                ids=ids,
-                collection_name=collection_name,
-                persist_directory=persist_directory,
-                client_settings=client_settings,
-                **kwargs
-            )
-        raise ValueError("Either documents or collection_name must be specified.")
 def get_text(content):
     relevant_part = content.find("div", {"class": "markdown"})
@@ -77,29 +77,32 @@ def get_text(content):
     else:
         return ""
-def get_docs(urls):
-    cwd = os.getcwd()
     folders=[]
     documents = []
     shutil.rmtree('downloaded/', ignore_errors=True)
     known_exts = ["py", "md"]
-    paths_by_ext = {}
-    docs_by_ext = {}
-    for ext in known_exts + ["other"]:
-        docs_by_ext[ext] = []
-        paths_by_ext[ext] = []
     py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
     md_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
     for url in urls:
         url = url[0]
         if url == '':
             continue
         if "." in url:
             if len(url) > 1:
-                folders.append(url.split('.')[1])
             else:
-                folders.append('.')
         else:
             destination = Path('downloaded/'+url)
             destination.mkdir(exist_ok=True, parents=True)
@@ -135,9 +138,7 @@ def get_docs(urls):
                     if res.returncode == 1:
                         res = subprocess.run(["git", "checkout", "master"], cwd=temp_path)
                     res = subprocess.run(["cp", "-r", (temp_path / folder).as_posix(), '/'.join(destination.split('/')[:-1])])
-                    folders.append(destination)
-    for folder in folders:
         local_repo_path_1 = folder
         for root, dirs, files in os.walk(local_repo_path_1):
             for file in files:
@@ -154,28 +155,31 @@ def get_docs(urls):
                             docs_by_ext[ext].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load()[0])
                 except Exception as e:
                     continue
-    for ext in docs_by_ext.keys():
-        if ext == "py":
-            documents += py_splitter.split_documents(docs_by_ext[ext])
-        if ext == "md":
-            documents += md_splitter.split_documents(docs_by_ext[ext])
-        # else:
-        #     documents += text_splitter.split_documents(docs_by_ext[ext]
-    return documents
-def ingest_docs(collection_name, urls=[]):
-    """Get documents from web pages."""
-    documents = get_docs(urls)
-    embeddings = HuggingFaceEmbeddings()
-    vectorstore = CachedChroma.from_documents_with_cache(persist_directory=".persisted_data", documents=documents, embedding=embeddings, collection_name=collection_name)
-    vectorstore.persist()
-    #vectorstore = FAISS.from_documents(documents, embeddings)
-    # # Save vectorstore
-    # with open("vectorstore.pkl", "wb") as f:
-    #     pickle.dump(vectorstore. , f)
-    return vectorstore
 if __name__ == "__main__":

 from langchain.embeddings.base import Embeddings
 from chromadb.config import Settings
+# class CachedChroma(Chroma, ABC):
+#     """
+#     Wrapper around Chroma to make caching embeddings easier.
+#     It automatically uses a cached version of a specified collection, if available.
+#         Example:
+#             .. code-block:: python
+#                     from langchain.vectorstores import Chroma
+#                     from langchain.embeddings.openai import OpenAIEmbeddings
+#                     embeddings = OpenAIEmbeddings()
+#                     vectorstore = CachedChroma.from_documents_with_cache(
+#                         ".persisted_data", texts, embeddings, collection_name="fun_experiment"
+#                     )
+#         """
+#     @classmethod
+#     def from_documents_with_cache(
+#             cls,
+#             persist_directory: str,
+#             documents: Optional[List[Document]] = None,
+#             embedding: Optional[Embeddings] = None,
+#             ids: Optional[List[str]] = None,
+#             collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
+#             client_settings: Optional[chromadb.config.Settings] = None,
+#             **kwargs: Any,
+#     ) -> Chroma:
+        # client_settings = Settings(
+        #     chroma_db_impl="duckdb+parquet",
+        #     persist_directory=persist_directory # Optional, defaults to .chromadb/ in the current directory
+        # )
+        # client = chromadb.Client(client_settings)
+#         collection_names = [c.name for c in client.list_collections()]
+#         if collection_name in collection_names:
+#             return Chroma(
+#                 collection_name=collection_name,
+#                 embedding_function=embedding,
+#                 persist_directory=persist_directory,
+#                 client_settings=client_settings,
+#             )
+#         if documents:
+#             return Chroma.from_documents(
+#                 documents=documents,
+#                 embedding=embedding,
+#                 ids=ids,
+#                 collection_name=collection_name,
+#                 persist_directory=persist_directory,
+#                 client_settings=client_settings,
+#                 **kwargs
+#             )
+#         raise ValueError("Either documents or collection_name must be specified.")
 def get_text(content):
     relevant_part = content.find("div", {"class": "markdown"})
     else:
         return ""
+def ingest_docs(all_collections_state, urls):
+    """Get documents from web pages."""
+    all_docs = []
+    local = False
     folders=[]
     documents = []
     shutil.rmtree('downloaded/', ignore_errors=True)
     known_exts = ["py", "md"]
     py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
     md_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
     for url in urls:
+        paths_by_ext = {}
+        docs_by_ext = {}
+        for ext in known_exts + ["other"]:
+            docs_by_ext[ext] = []
+            paths_by_ext[ext] = []
         url = url[0]
         if url == '':
             continue
         if "." in url:
+            local = True
             if len(url) > 1:
+                folder = url.split('.')[1]
             else:
+                folder = '.'
         else:
             destination = Path('downloaded/'+url)
             destination.mkdir(exist_ok=True, parents=True)
                     if res.returncode == 1:
                         res = subprocess.run(["git", "checkout", "master"], cwd=temp_path)
                     res = subprocess.run(["cp", "-r", (temp_path / folder).as_posix(), '/'.join(destination.split('/')[:-1])])
+                    folder = destination
         local_repo_path_1 = folder
         for root, dirs, files in os.walk(local_repo_path_1):
             for file in files:
                             docs_by_ext[ext].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load()[0])
                 except Exception as e:
                     continue
+        for ext in docs_by_ext.keys():
+            if ext == "py":
+                documents += py_splitter.split_documents(docs_by_ext[ext])
+            if ext == "md":
+                documents += md_splitter.split_documents(docs_by_ext[ext])
+            # else:
+            #     documents += text_splitter.split_documents(docs_by_ext[ext]
+        all_docs += documents
+        embeddings = HuggingFaceEmbeddings()
+        if 'downloaded/' in folder:
+            folder = '-'.join(folder.split('/')[1:])
+        if folder == '.':
+            folder = 'chat-pykg'
+        vectorstore = Chroma.from_documents(persist_directory=".persisted_data", documents=documents, embedding=embeddings, collection_name=folder)
+        vectorstore.persist()
+        all_collections_state.append(folder)
+    return all_collections_state
+    # embeddings = HuggingFaceEmbeddings()
+    # merged_vectorstore = Chroma.from_documents(persist_directory=".persisted_data", documents=documents, embedding=embeddings, collection_name='merged_collections')
+    # #vectorstore = FAISS.from_documents(documents, embeddings)
+    # # # Save vectorstore
+    # # with open("vectorstore.pkl", "wb") as f:
+    # #     pickle.dump(vectorstore. , f)
+    # return merged_vectorstore
 if __name__ == "__main__":