xangma committed
Commit d2a3ff5
1 Parent(s): a37484c
Files changed (4)
  1. .gitignore +4 -1
  2. app.py +16 -100
  3. chain.py +9 -9
  4. ingest.py +138 -60
.gitignore CHANGED
@@ -1 +1,4 @@
- pycbc/*
+ downloaded/*
+ __pycache__/*
+ launch.json
+ .DS_Store
app.py CHANGED
@@ -3,6 +3,7 @@ import os
  import gradio as gr
  from abc import ABC
  from typing import List, Optional, Any
+ import asyncio
  import chromadb
  import langchain
  # logging.basicConfig(stream=sys.stdout, level=logging.INFO)
@@ -13,98 +14,13 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTex
  from langchain.document_loaders import TextLoader
  from langchain.docstore.document import Document
  from langchain.embeddings.base import Embeddings
- from langchain.vectorstores import Chroma

  from chain import get_new_chain1
  from ingest import ingest_docs

- class CachedChroma(Chroma, ABC):
-     """
-     Wrapper around Chroma to make caching embeddings easier.
-
-     It automatically uses a cached version of a specified collection, if available.
-     Example:
-         .. code-block:: python
-             from langchain.vectorstores import Chroma
-             from langchain.embeddings.openai import OpenAIEmbeddings
-             embeddings = OpenAIEmbeddings()
-             vectorstore = CachedChroma.from_documents_with_cache(
-                 ".persisted_data", texts, embeddings, collection_name="fun_experiment"
-             )
-     """
-
-     @classmethod
-     def from_documents_with_cache(
-         cls,
-         persist_directory: str,
-         documents: List[Document],
-         embedding: Optional[Embeddings] = None,
-         ids: Optional[List[str]] = None,
-         collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
-         client_settings: Optional[chromadb.config.Settings] = None,
-         **kwargs: Any,
-     ) -> Chroma:
-         settings = chromadb.config.Settings(
-             chroma_db_impl="duckdb+parquet",
-             persist_directory=persist_directory
-         )
-         client = chromadb.Client(settings)
-         collection_names = [c.name for c in client.list_collections()]
-
-         if collection_name in collection_names:
-             return Chroma(
-                 collection_name=collection_name,
-                 embedding_function=embedding,
-                 persist_directory=persist_directory,
-                 client_settings=client_settings,
-             )
-
-         return Chroma.from_documents(
-             documents=documents,
-             embedding=embedding,
-             ids=ids,
-             collection_name=collection_name,
-             persist_directory=persist_directory,
-             client_settings=client_settings,
-             **kwargs
-         )

- # def get_docs():
- #     local_repo_path_1 = "pycbc/"
- #     loaders = []
- #     docs = []
- #     for root, dirs, files in os.walk(local_repo_path_1):
- #         for file in files:
- #             file_path = os.path.join(root, file)
- #             rel_file_path = os.path.relpath(file_path, local_repo_path_1)
- #             # Filter by file extension
- #             if any(rel_file_path.endswith(ext) for ext in [".py", ".sh"]):
- #                 # Filter by directory
- #                 if any(rel_file_path.startswith(d) for d in ["pycbc/", "examples/"]):
- #                     docs.append(rel_file_path)
- #                 if any(rel_file_path.startswith(d) for d in ["bin/"]):
- #                     docs.append(rel_file_path)
- #     loaders.extend([TextLoader(os.path.join(local_repo_path_1, doc)).load() for doc in docs])
- #     py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
- #     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
- #     documents = []
- #     for load in loaders:
- #         try:
- #             if load[0].metadata['source'][-3:] == ".py" == "" or "pycbc/bin/" in load[0].metadata['source']:
- #                 documents.extend(py_splitter.split_documents(load))
- #         except Exception as e:
- #             documents.extend(text_splitter.split_documents(load))
- #     return documents

  def set_chain_up(openai_api_key, model_selector, k_textbox, vectorstore, agent):
-
-     # # set defaults
-     # if not model_selector:
-     #     model_selector = "gpt-3.5-turbo"
-     # if not k_textbox:
-     #     k_textbox = 10
-     # else:
-     #     k_textbox = int(k_textbox)
      if type(vectorstore) != list:
          if model_selector in ["gpt-3.5-turbo", "gpt-4"]:
              if openai_api_key:
@@ -196,20 +112,20 @@ with block:
      submit_urls.click(get_vectorstore, inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, vs_state, agent_state], outputs=[vs_state, agent_state])

      # I need to also parse this code in the docstore so I can ask it to fix silly things like this below:
-     openai_api_key_textbox.change(
-         set_chain_up,
-         inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, agent_state],
-         outputs=[agent_state],
-     )
-     model_selector.change(
-         set_chain_up,
-         inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, agent_state],
-         outputs=[agent_state],
-     )
-     k_textbox.change(
-         set_chain_up,
-         inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, agent_state],
-         outputs=[agent_state],
-     )
+     # openai_api_key_textbox.change(
+     #     set_chain_up,
+     #     inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, agent_state],
+     #     outputs=[agent_state],
+     # )
+     # model_selector.change(
+     #     set_chain_up,
+     #     inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, agent_state],
+     #     outputs=[agent_state],
+     # )
+     # k_textbox.change(
+     #     set_chain_up,
+     #     inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, agent_state],
+     #     outputs=[agent_state],
+     # )

  block.launch(debug=True)
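
The disabled .change handlers above pass packagedocslist where set_chain_up expects the vectorstore argument, which appears to be the "silly thing" the comment refers to. A minimal sketch of how they could be re-enabled against the vectorstore state instead (hypothetical wiring, not part of this commit; it assumes the Gradio components and the vs_state/agent_state objects defined earlier in app.py):

    # Hypothetical re-wiring sketch: rebuild the chain whenever a setting changes,
    # passing vs_state (the ingested vectorstore) rather than packagedocslist.
    for component in (openai_api_key_textbox, model_selector, k_textbox):
        component.change(
            set_chain_up,
            inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
            outputs=[agent_state],
        )
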
chain.py CHANGED
@@ -42,15 +42,15 @@ def get_new_chain1(vectorstore, model_selector, k_textbox) -> Chain:
  Standalone question:"""

  template = """You are an AI assistant for various open source libraries.
- You are given the following extracted parts of a long document and a question. Provide a conversational answer to the question.
- You should only use hyperlinks that are explicitly listed as a source in the context. Do NOT make up a hyperlink that is not listed.
- If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
- If the question is not about the package documentation, politely inform them that you are tuned to only answer questions about the package documentationz.
- Question: {question}
- =========
- {context}
- =========
- Answer in Markdown:"""
+ You are given the following extracted parts of a long document and a question. Provide a conversational answer to the question.
+ You should only use hyperlinks that are explicitly listed as a source in the context. Do NOT make up a hyperlink that is not listed.
+ If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
+ If the question is not about the package documentation, politely inform them that you are tuned to only answer questions about the package documentationz.
+ Question: {question}
+ =========
+ {context}
+ =========
+ Answer in Markdown:"""

  # Construct a ChatVectorDBChain with a streaming llm for combine docs
  # and a separate, non-streaming llm for question generation
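
The template above is the combine-docs prompt for the chain that get_new_chain1 builds. A rough, hypothetical sketch of how such a template is typically wrapped and handed to a ChatVectorDBChain (the actual construction code is outside this hunk, so the names and parameters here are assumptions):

    from langchain.chains import ChatVectorDBChain
    from langchain.chat_models import ChatOpenAI
    from langchain.prompts import PromptTemplate

    def build_chain_sketch(vectorstore, model_selector, k_textbox, template):
        # Hypothetical: wrap the prompt text in a PromptTemplate expecting the
        # {question} and {context} variables used above.
        qa_prompt = PromptTemplate(template=template, input_variables=["question", "context"])
        llm = ChatOpenAI(model_name=model_selector, temperature=0)
        # The chain retrieves chunks from the vectorstore and answers with the prompt;
        # k_textbox would control how many chunks are pulled into {context}.
        return ChatVectorDBChain.from_llm(
            llm,
            vectorstore,
            qa_prompt=qa_prompt,
            top_k_docs_for_context=int(k_textbox),
        )
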
ingest.py CHANGED
@@ -1,13 +1,73 @@
  import pickle
-
+ import tempfile
  from langchain.document_loaders import SitemapLoader, ReadTheDocsLoader, TextLoader
  from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
  from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, PythonCodeTextSplitter, MarkdownTextSplitter
  from langchain.vectorstores.faiss import FAISS
  import itertools
  import os
- import fsspec
+ from langchain.vectorstores import Chroma
+ import shutil
  from pathlib import Path
+ import subprocess
+ from git import Repo, Git
+ import tarfile
+ import chromadb
+ from abc import ABC
+ from typing import List, Optional, Any
+ from langchain.docstore.document import Document
+ from langchain.embeddings.base import Embeddings
+
+ class CachedChroma(Chroma, ABC):
+     """
+     Wrapper around Chroma to make caching embeddings easier.
+
+     It automatically uses a cached version of a specified collection, if available.
+     Example:
+         .. code-block:: python
+             from langchain.vectorstores import Chroma
+             from langchain.embeddings.openai import OpenAIEmbeddings
+             embeddings = OpenAIEmbeddings()
+             vectorstore = CachedChroma.from_documents_with_cache(
+                 ".persisted_data", texts, embeddings, collection_name="fun_experiment"
+             )
+     """
+
+     @classmethod
+     def from_documents_with_cache(
+         cls,
+         persist_directory: str,
+         documents: List[Document],
+         embedding: Optional[Embeddings] = None,
+         ids: Optional[List[str]] = None,
+         collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
+         client_settings: Optional[chromadb.config.Settings] = None,
+         **kwargs: Any,
+     ) -> Chroma:
+         settings = chromadb.config.Settings(
+             chroma_db_impl="duckdb+parquet",
+             persist_directory=persist_directory
+         )
+         client = chromadb.Client(settings)
+         collection_names = [c.name for c in client.list_collections()]
+
+         if collection_name in collection_names:
+             return Chroma(
+                 collection_name=collection_name,
+                 embedding_function=embedding,
+                 persist_directory=persist_directory,
+                 client_settings=client_settings,
+             )
+
+         return Chroma.from_documents(
+             documents=documents,
+             embedding=embedding,
+             ids=ids,
+             collection_name=collection_name,
+             persist_directory=persist_directory,
+             client_settings=client_settings,
+             **kwargs
+         )

  def get_text(content):
      relevant_part = content.find("div", {"class": "markdown"})
@@ -18,74 +78,92 @@ def get_text(content):

  def ingest_docs(urls=[]):
      """Get documents from web pages."""
+     cwd = os.getcwd()
      folders=[]
-     documents = []
+     documents = []
+     shutil.rmtree('downloaded/', ignore_errors=True)
+     known_exts = ["py", "md"]
+     paths_by_ext = {}
+     docs_by_ext = {}
+     for ext in known_exts + ["other"]:
+         docs_by_ext[ext] = []
+         paths_by_ext[ext] = []
+     py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+     md_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
      for url in urls:
-         try:
+         url = url[0]
+         if "local:" in url:
+             folders.append(url.split('local:')[1])
+         else:
+             destination = Path('downloaded/'+url)
+             destination.mkdir(exist_ok=True, parents=True)
+             destination = destination.as_posix()
+             if url[0] == '/':
+                 url = url[1:]
+             org = url.split('/')[0]
+             repo = url.split('/')[1]
+             repo_url = f"https://github.com/{org}/{repo}.git"
+             # join all strings after 2nd slash
+             folder = '/'.join(url.split('/')[2:])
+             if folder[-1] == '/':
+                 folder = folder[:-1]
+             if folder:
+                 with tempfile.TemporaryDirectory() as temp_dir:
+                     temp_path = Path(temp_dir)

-             if "local:" in url:
-                 folders.append(url.split('local:')[1])
-             else:
-                 url = url[0]
-                 if url[0] == '/':
-                     url = url[1:]
-                 if url[-1] != '/':
-                     url += '/'
-                 org = url.split('/')[0]
-                 repo = url.split('/')[1]
-                 # join all strings after 2nd slash
-                 folder = '/'.join(url.split('/')[2:])
-                 if folder[-1] != '/':
-                     folder += '/'
-                 fs = fsspec.filesystem("github", org=org, repo=repo)
-                 # recursive copy
-                 destination = url
-                 destination.mkdir(exist_ok=True, parents=True)
-                 fs.get(fs.ls(folder), destination.as_posix(), recursive=True)
-                 folders.append(destination)
-         except Exception as e:
-             print(e)
-     for folder in folders:
-         try:
-             py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
-             text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-             md_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
-             local_repo_path_1 = folder
-             known_exts = [".py", ".md", ".rst"]
-             paths_by_ext = {}
-             docs_by_ext = {}
-             for ext in known_exts + ["other"]:
-                 docs_by_ext[ext] = []
-                 paths_by_ext[ext] = []
-             for root, dirs, files in os.walk(local_repo_path_1):
-                 for file in files:
-                     file_path = os.path.join(root, file)
-                     rel_file_path = os.path.relpath(file_path, local_repo_path_1)
-                     for ext in paths_by_ext.keys():
-                         if '.' not in [i[0] for i in rel_file_path.split('/')]:
-                             if rel_file_path.endswith(ext):
-                                 paths_by_ext[ext].append(rel_file_path)
-                                 docs_by_ext[ext].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load())
-                             else:
-                                 paths_by_ext["other"].append(rel_file_path)
-                                 docs_by_ext["other"].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load())
+                     # Initialize the Git repository
+                     subprocess.run(["git", "init"], cwd=temp_path)

-             for ext in docs_by_ext.keys():
-                 if ext == ".py":
-                     documents += py_splitter.split_documents(docs_by_ext[ext])
-                 elif ext == ".md" or ext == ".rst":
-                     documents += md_splitter.split_documents(docs_by_ext[ext])
-                 else:
-                     documents += text_splitter.split_documents(docs_by_ext[ext])
-         except Exception as e:
-             print(e)
-             continue
+                     # Add the remote repository
+                     subprocess.run(["git", "remote", "add", "-f", "origin", repo_url], cwd=temp_path)
+
+                     # Enable sparse-checkout
+                     subprocess.run(["git", "config", "core.sparseCheckout", "true"], cwd=temp_path)
+
+                     # Specify the folder to checkout
+                     with open(temp_path / ".git" / "info" / "sparse-checkout", "w") as f:
+                         f.write(f"{folder}/\n")
+
+                     # Checkout the desired branch
+                     res = subprocess.run(["git", "checkout", 'main'], cwd=temp_path)
+                     if res.returncode == 1:
+                         res = subprocess.run(["git", "checkout", "master"], cwd=temp_path)
+                     res = subprocess.run(["cp", "-r", (temp_path / folder).as_posix(), '/'.join(destination.split('/')[:-1])])
+                 folders.append(destination)
+
+     for folder in folders:
+         local_repo_path_1 = folder
+         for root, dirs, files in os.walk(local_repo_path_1):
+             for file in files:
+                 file_path = os.path.join(root, file)
+                 rel_file_path = os.path.relpath(file_path, local_repo_path_1)
+                 ext = rel_file_path.split('.')[-1]
+                 try:
+                     if '.' not in [i[0] for i in rel_file_path.split('/')]:
+                         if paths_by_ext.get(rel_file_path.split('.')[-1]) is None:
+                             paths_by_ext["other"].append(rel_file_path)
+                             docs_by_ext["other"].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load()[0])
+                         else:
+                             paths_by_ext[ext].append(rel_file_path)
+                             docs_by_ext[ext].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load()[0])
+                 except Exception as e:
+                     continue
+     for ext in docs_by_ext.keys():
+         if ext == "py":
+             documents += py_splitter.split_documents(docs_by_ext[ext])
+         if ext == "md":
+             documents += md_splitter.split_documents(docs_by_ext[ext])
+         # else:
+         #     documents += text_splitter.split_documents(docs_by_ext[ext]
      embeddings = HuggingFaceEmbeddings()
      vectorstore = FAISS.from_documents(documents, embeddings)
+     # vectorstore = CachedChroma.from_documents_with_cache(".persisted_data", documents, embeddings)
      # Save vectorstore
      with open("vectorstore.pkl", "wb") as f:
          pickle.dump(vectorstore, f)
      return vectorstore
+

  if __name__ == "__main__":
      ingest_docs()
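
The commented-out CachedChroma line near the end of ingest_docs suggests the eventual plan is to persist embeddings with Chroma instead of pickling a FAISS index. A small sketch of what that swap might look like, with documents being the split documents built above (hypothetical; the collection name is illustrative):

    # Hypothetical swap, not part of this commit: on the first run the collection is
    # built and persisted under ".persisted_data"; on later runs CachedChroma finds
    # the existing collection and skips re-embedding.
    embeddings = HuggingFaceEmbeddings()
    vectorstore = CachedChroma.from_documents_with_cache(
        ".persisted_data",
        documents,
        embeddings,
        collection_name="ingested_docs",  # illustrative collection name
    )
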
 
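For reference, a usage sketch of the new ingest_docs signature. Each entry is a single-element list (as supplied by the Gradio table in app.py): GitHub sources are written as org/repo/subfolder and fetched via the sparse checkout above, while local: entries point at folders already on disk. The paths below are illustrative, not taken from the commit:

    from ingest import ingest_docs

    # Illustrative inputs only.
    vectorstore = ingest_docs(urls=[
        ["hwchase17/langchain/docs"],      # hypothetical org/repo/subfolder on GitHub
        ["local:downloaded/my_package"],   # hypothetical folder already on disk
    ])
    # The FAISS index is also pickled to vectorstore.pkl for later reuse.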