xangma committed
Commit d2a3ff5
1 Parent(s): a37484c
Files changed (4)
  1. .gitignore +4 -1
  2. app.py +16 -100
  3. chain.py +9 -9
  4. ingest.py +138 -60
.gitignore CHANGED
@@ -1 +1,4 @@
- pycbc/*
+ downloaded/*
+ __pycache__/*
+ launch.json
+ .DS_Store
app.py CHANGED
@@ -3,6 +3,7 @@ import os
  import gradio as gr
  from abc import ABC
  from typing import List, Optional, Any
+ import asyncio
  import chromadb
  import langchain
  # logging.basicConfig(stream=sys.stdout, level=logging.INFO)
@@ -13,98 +14,13 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTex
  from langchain.document_loaders import TextLoader
  from langchain.docstore.document import Document
  from langchain.embeddings.base import Embeddings
- from langchain.vectorstores import Chroma

  from chain import get_new_chain1
  from ingest import ingest_docs

- class CachedChroma(Chroma, ABC):
-     """
-     Wrapper around Chroma to make caching embeddings easier.
-
-     It automatically uses a cached version of a specified collection, if available.
-     Example:
-         .. code-block:: python
-             from langchain.vectorstores import Chroma
-             from langchain.embeddings.openai import OpenAIEmbeddings
-             embeddings = OpenAIEmbeddings()
-             vectorstore = CachedChroma.from_documents_with_cache(
-                 ".persisted_data", texts, embeddings, collection_name="fun_experiment"
-             )
-     """
-
-     @classmethod
-     def from_documents_with_cache(
-         cls,
-         persist_directory: str,
-         documents: List[Document],
-         embedding: Optional[Embeddings] = None,
-         ids: Optional[List[str]] = None,
-         collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
-         client_settings: Optional[chromadb.config.Settings] = None,
-         **kwargs: Any,
-     ) -> Chroma:
-         settings = chromadb.config.Settings(
-             chroma_db_impl="duckdb+parquet",
-             persist_directory=persist_directory
-         )
-         client = chromadb.Client(settings)
-         collection_names = [c.name for c in client.list_collections()]
-
-         if collection_name in collection_names:
-             return Chroma(
-                 collection_name=collection_name,
-                 embedding_function=embedding,
-                 persist_directory=persist_directory,
-                 client_settings=client_settings,
-             )
-
-         return Chroma.from_documents(
-             documents=documents,
-             embedding=embedding,
-             ids=ids,
-             collection_name=collection_name,
-             persist_directory=persist_directory,
-             client_settings=client_settings,
-             **kwargs
-         )

- # def get_docs():
- #     local_repo_path_1 = "pycbc/"
- #     loaders = []
- #     docs = []
- #     for root, dirs, files in os.walk(local_repo_path_1):
- #         for file in files:
- #             file_path = os.path.join(root, file)
- #             rel_file_path = os.path.relpath(file_path, local_repo_path_1)
- #             # Filter by file extension
- #             if any(rel_file_path.endswith(ext) for ext in [".py", ".sh"]):
- #                 # Filter by directory
- #                 if any(rel_file_path.startswith(d) for d in ["pycbc/", "examples/"]):
- #                     docs.append(rel_file_path)
- #                 if any(rel_file_path.startswith(d) for d in ["bin/"]):
- #                     docs.append(rel_file_path)
- #     loaders.extend([TextLoader(os.path.join(local_repo_path_1, doc)).load() for doc in docs])
- #     py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
- #     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
- #     documents = []
- #     for load in loaders:
- #         try:
- #             if load[0].metadata['source'][-3:] == ".py" == "" or "pycbc/bin/" in load[0].metadata['source']:
- #                 documents.extend(py_splitter.split_documents(load))
- #         except Exception as e:
- #             documents.extend(text_splitter.split_documents(load))
- #     return documents

  def set_chain_up(openai_api_key, model_selector, k_textbox, vectorstore, agent):
-
-     # # set defaults
-     # if not model_selector:
-     #     model_selector = "gpt-3.5-turbo"
-     # if not k_textbox:
-     #     k_textbox = 10
-     # else:
-     #     k_textbox = int(k_textbox)
      if type(vectorstore) != list:
          if model_selector in ["gpt-3.5-turbo", "gpt-4"]:
              if openai_api_key:
@@ -196,20 +112,20 @@ with block:
      submit_urls.click(get_vectorstore, inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, vs_state, agent_state], outputs=[vs_state, agent_state])

      # I need to also parse this code in the docstore so I can ask it to fix silly things like this below:
-     openai_api_key_textbox.change(
-         set_chain_up,
-         inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, agent_state],
-         outputs=[agent_state],
-     )
-     model_selector.change(
-         set_chain_up,
-         inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, agent_state],
-         outputs=[agent_state],
-     )
-     k_textbox.change(
-         set_chain_up,
-         inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, agent_state],
-         outputs=[agent_state],
-     )
+     # openai_api_key_textbox.change(
+     #     set_chain_up,
+     #     inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, agent_state],
+     #     outputs=[agent_state],
+     # )
+     # model_selector.change(
+     #     set_chain_up,
+     #     inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, agent_state],
+     #     outputs=[agent_state],
+     # )
+     # k_textbox.change(
+     #     set_chain_up,
+     #     inputs=[openai_api_key_textbox, model_selector, k_textbox, packagedocslist, agent_state],
+     #     outputs=[agent_state],
+     # )

  block.launch(debug=True)
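
The disabled .change handlers above pass packagedocslist where set_chain_up expects the vectorstore argument, which appears to be the "silly thing" the comment refers to. A minimal sketch of how they could be re-enabled against the vectorstore state instead (hypothetical wiring, not part of this commit; it assumes the Gradio components and the vs_state/agent_state objects defined earlier in app.py):

    # Hypothetical re-wiring sketch: rebuild the chain whenever a setting changes,
    # passing vs_state (the ingested vectorstore) rather than packagedocslist.
    for component in (openai_api_key_textbox, model_selector, k_textbox):
        component.change(
            set_chain_up,
            inputs=[openai_api_key_textbox, model_selector, k_textbox, vs_state, agent_state],
            outputs=[agent_state],
        )
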
chain.py CHANGED
@@ -42,15 +42,15 @@ def get_new_chain1(vectorstore, model_selector, k_textbox) -> Chain:
  Standalone question:"""

  template = """You are an AI assistant for various open source libraries.
- You are given the following extracted parts of a long document and a question. Provide a conversational answer to the question.
- You should only use hyperlinks that are explicitly listed as a source in the context. Do NOT make up a hyperlink that is not listed.
- If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
- If the question is not about the package documentation, politely inform them that you are tuned to only answer questions about the package documentationz.
- Question: {question}
- =========
- {context}
- =========
- Answer in Markdown:"""
+ You are given the following extracted parts of a long document and a question. Provide a conversational answer to the question.
+ You should only use hyperlinks that are explicitly listed as a source in the context. Do NOT make up a hyperlink that is not listed.
+ If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
+ If the question is not about the package documentation, politely inform them that you are tuned to only answer questions about the package documentationz.
+ Question: {question}
+ =========
+ {context}
+ =========
+ Answer in Markdown:"""

  # Construct a ChatVectorDBChain with a streaming llm for combine docs
  # and a separate, non-streaming llm for question generation
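
The template above is the combine-docs prompt for the chain that get_new_chain1 builds. A rough, hypothetical sketch of how such a template is typically wrapped and handed to a ChatVectorDBChain (the actual construction code is outside this hunk, so the names and parameters here are assumptions):

    from langchain.chains import ChatVectorDBChain
    from langchain.chat_models import ChatOpenAI
    from langchain.prompts import PromptTemplate

    def build_chain_sketch(vectorstore, model_selector, k_textbox, template):
        # Hypothetical: wrap the prompt text in a PromptTemplate expecting the
        # {question} and {context} variables used above.
        qa_prompt = PromptTemplate(template=template, input_variables=["question", "context"])
        llm = ChatOpenAI(model_name=model_selector, temperature=0)
        # The chain retrieves chunks from the vectorstore and answers with the prompt;
        # k_textbox would control how many chunks are pulled into {context}.
        return ChatVectorDBChain.from_llm(
            llm,
            vectorstore,
            qa_prompt=qa_prompt,
            top_k_docs_for_context=int(k_textbox),
        )
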
ingest.py CHANGED
@@ -1,13 +1,73 @@
  import pickle
-
+ import tempfile
  from langchain.document_loaders import SitemapLoader, ReadTheDocsLoader, TextLoader
  from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
  from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, PythonCodeTextSplitter, MarkdownTextSplitter
  from langchain.vectorstores.faiss import FAISS
  import itertools
  import os
- import fsspec
+ from langchain.vectorstores import Chroma
+ import shutil
  from pathlib import Path
+ import subprocess
+ from git import Repo, Git
+ import tarfile
+ import chromadb
+ from abc import ABC
+ from typing import List, Optional, Any
+ from langchain.docstore.document import Document
+ from langchain.embeddings.base import Embeddings
+
+ class CachedChroma(Chroma, ABC):
+     """
+     Wrapper around Chroma to make caching embeddings easier.
+
+     It automatically uses a cached version of a specified collection, if available.
+     Example:
+         .. code-block:: python
+             from langchain.vectorstores import Chroma
+             from langchain.embeddings.openai import OpenAIEmbeddings
+             embeddings = OpenAIEmbeddings()
+             vectorstore = CachedChroma.from_documents_with_cache(
+                 ".persisted_data", texts, embeddings, collection_name="fun_experiment"
+             )
+     """
+
+     @classmethod
+     def from_documents_with_cache(
+         cls,
+         persist_directory: str,
+         documents: List[Document],
+         embedding: Optional[Embeddings] = None,
+         ids: Optional[List[str]] = None,
+         collection_name: str = Chroma._LANGCHAIN_DEFAULT_COLLECTION_NAME,
+         client_settings: Optional[chromadb.config.Settings] = None,
+         **kwargs: Any,
+     ) -> Chroma:
+         settings = chromadb.config.Settings(
+             chroma_db_impl="duckdb+parquet",
+             persist_directory=persist_directory
+         )
+         client = chromadb.Client(settings)
+         collection_names = [c.name for c in client.list_collections()]
+
+         if collection_name in collection_names:
+             return Chroma(
+                 collection_name=collection_name,
+                 embedding_function=embedding,
+                 persist_directory=persist_directory,
+                 client_settings=client_settings,
+             )
+
+         return Chroma.from_documents(
+             documents=documents,
+             embedding=embedding,
+             ids=ids,
+             collection_name=collection_name,
+             persist_directory=persist_directory,
+             client_settings=client_settings,
+             **kwargs
+         )

  def get_text(content):
      relevant_part = content.find("div", {"class": "markdown"})
@@ -18,74 +78,92 @@ def get_text(content):

  def ingest_docs(urls=[]):
      """Get documents from web pages."""
+     cwd = os.getcwd()
      folders=[]
-     documents = []
+     documents = []
+     shutil.rmtree('downloaded/', ignore_errors=True)
+     known_exts = ["py", "md"]
+     paths_by_ext = {}
+     docs_by_ext = {}
+     for ext in known_exts + ["other"]:
+         docs_by_ext[ext] = []
+         paths_by_ext[ext] = []
+     py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+     md_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
      for url in urls:
-         try:
+         url = url[0]
+         if "local:" in url:
+             folders.append(url.split('local:')[1])
+         else:
+             destination = Path('downloaded/'+url)
+             destination.mkdir(exist_ok=True, parents=True)
+             destination = destination.as_posix()
+             if url[0] == '/':
+                 url = url[1:]
+             org = url.split('/')[0]
+             repo = url.split('/')[1]
+             repo_url = f"https://github.com/{org}/{repo}.git"
+             # join all strings after 2nd slash
+             folder = '/'.join(url.split('/')[2:])
+             if folder[-1] == '/':
+                 folder = folder[:-1]
+             if folder:
+                 with tempfile.TemporaryDirectory() as temp_dir:
+                     temp_path = Path(temp_dir)

-             if "local:" in url:
-                 folders.append(url.split('local:')[1])
-             else:
-                 url = url[0]
-                 if url[0] == '/':
-                     url = url[1:]
-                 if url[-1] != '/':
-                     url += '/'
-                 org = url.split('/')[0]
-                 repo = url.split('/')[1]
-                 # join all strings after 2nd slash
-                 folder = '/'.join(url.split('/')[2:])
-                 if folder[-1] != '/':
-                     folder += '/'
-                 fs = fsspec.filesystem("github", org=org, repo=repo)
-                 # recursive copy
-                 destination = url
-                 destination.mkdir(exist_ok=True, parents=True)
-                 fs.get(fs.ls(folder), destination.as_posix(), recursive=True)
-                 folders.append(destination)
-         except Exception as e:
-             print(e)
-     for folder in folders:
-         try:
-             py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
-             text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-             md_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
-             local_repo_path_1 = folder
-             known_exts = [".py", ".md", ".rst"]
-             paths_by_ext = {}
-             docs_by_ext = {}
-             for ext in known_exts + ["other"]:
-                 docs_by_ext[ext] = []
-                 paths_by_ext[ext] = []
-             for root, dirs, files in os.walk(local_repo_path_1):
-                 for file in files:
-                     file_path = os.path.join(root, file)
-                     rel_file_path = os.path.relpath(file_path, local_repo_path_1)
-                     for ext in paths_by_ext.keys():
-                         if '.' not in [i[0] for i in rel_file_path.split('/')]:
-                             if rel_file_path.endswith(ext):
-                                 paths_by_ext[ext].append(rel_file_path)
-                                 docs_by_ext[ext].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load())
-                             else:
-                                 paths_by_ext["other"].append(rel_file_path)
-                                 docs_by_ext["other"].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load())
+                     # Initialize the Git repository
+                     subprocess.run(["git", "init"], cwd=temp_path)

-             for ext in docs_by_ext.keys():
-                 if ext == ".py":
-                     documents += py_splitter.split_documents(docs_by_ext[ext])
-                 elif ext == ".md" or ext == ".rst":
-                     documents += md_splitter.split_documents(docs_by_ext[ext])
-                 else:
-                     documents += text_splitter.split_documents(docs_by_ext[ext])
-         except Exception as e:
-             print(e)
-             continue
+                     # Add the remote repository
+                     subprocess.run(["git", "remote", "add", "-f", "origin", repo_url], cwd=temp_path)
+
+                     # Enable sparse-checkout
+                     subprocess.run(["git", "config", "core.sparseCheckout", "true"], cwd=temp_path)
+
+                     # Specify the folder to checkout
+                     with open(temp_path / ".git" / "info" / "sparse-checkout", "w") as f:
+                         f.write(f"{folder}/\n")
+
+                     # Checkout the desired branch
+                     res = subprocess.run(["git", "checkout", 'main'], cwd=temp_path)
+                     if res.returncode == 1:
+                         res = subprocess.run(["git", "checkout", "master"], cwd=temp_path)
+                     res = subprocess.run(["cp", "-r", (temp_path / folder).as_posix(), '/'.join(destination.split('/')[:-1])])
+                 folders.append(destination)
+
+     for folder in folders:
+         local_repo_path_1 = folder
+         for root, dirs, files in os.walk(local_repo_path_1):
+             for file in files:
+                 file_path = os.path.join(root, file)
+                 rel_file_path = os.path.relpath(file_path, local_repo_path_1)
+                 ext = rel_file_path.split('.')[-1]
+                 try:
+                     if '.' not in [i[0] for i in rel_file_path.split('/')]:
+                         if paths_by_ext.get(rel_file_path.split('.')[-1]) is None:
+                             paths_by_ext["other"].append(rel_file_path)
+                             docs_by_ext["other"].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load()[0])
+                         else:
+                             paths_by_ext[ext].append(rel_file_path)
+                             docs_by_ext[ext].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load()[0])
+                 except Exception as e:
+                     continue
+     for ext in docs_by_ext.keys():
+         if ext == "py":
+             documents += py_splitter.split_documents(docs_by_ext[ext])
+         if ext == "md":
+             documents += md_splitter.split_documents(docs_by_ext[ext])
+         # else:
+         #     documents += text_splitter.split_documents(docs_by_ext[ext]
      embeddings = HuggingFaceEmbeddings()
      vectorstore = FAISS.from_documents(documents, embeddings)
+     # vectorstore = CachedChroma.from_documents_with_cache(".persisted_data", documents, embeddings)
      # Save vectorstore
      with open("vectorstore.pkl", "wb") as f:
          pickle.dump(vectorstore, f)
      return vectorstore
+

  if __name__ == "__main__":
      ingest_docs()
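
The commented-out CachedChroma line near the end of ingest_docs suggests the eventual plan is to persist embeddings with Chroma instead of pickling a FAISS index. A small sketch of what that swap might look like, with documents being the split documents built above (hypothetical; the collection name is illustrative):

    # Hypothetical swap, not part of this commit: on the first run the collection is
    # built and persisted under ".persisted_data"; on later runs CachedChroma finds
    # the existing collection and skips re-embedding.
    embeddings = HuggingFaceEmbeddings()
    vectorstore = CachedChroma.from_documents_with_cache(
        ".persisted_data",
        documents,
        embeddings,
        collection_name="ingested_docs",  # illustrative collection name
    )
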
 
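For reference, a usage sketch of the new ingest_docs signature. Each entry is a single-element list (as supplied by the Gradio table in app.py): GitHub sources are written as org/repo/subfolder and fetched via the sparse checkout above, while local: entries point at folders already on disk. The paths below are illustrative, not taken from the commit:

    from ingest import ingest_docs

    # Illustrative inputs only.
    vectorstore = ingest_docs(urls=[
        ["hwchase17/langchain/docs"],      # hypothetical org/repo/subfolder on GitHub
        ["local:downloaded/my_package"],   # hypothetical folder already on disk
    ])
    # The FAISS index is also pickled to vectorstore.pkl for later reuse.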