xangma committed on
Commit
a37484c
1 Parent(s): cd62201
Files changed (1)
  1. ingest.py +56 -51
ingest.py CHANGED
@@ -6,6 +6,8 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTex
6
  from langchain.vectorstores.faiss import FAISS
7
  import itertools
8
  import os
 
 
9
 
10
  def get_text(content):
11
  relevant_part = content.find("div", {"class": "markdown"})
@@ -16,62 +18,65 @@ def get_text(content):
16
 
17
  def ingest_docs(urls=[]):
18
  """Get documents from web pages."""
 
19
  documents = []
20
- text_splitter = RecursiveCharacterTextSplitter(
21
- chunk_size=1000,
22
- chunk_overlap=200,
23
- )
24
  for url in urls:
25
  try:
26
- url = url[0]
27
- if len([i for i in map(''.join, itertools.product(*zip('sitemap'.upper(), 'sitemap'.lower()))) if i in url]) > 0:
28
- loader = SitemapLoader(
29
- web_path=url, parsing_function=get_text
30
- )
31
- elif len([i for i in map(''.join, itertools.product(*zip('readthedocs'.upper(), 'readthedocs'.lower()))) if i in url]) > 0:
32
- loader = ReadTheDocsLoader(
33
- path=url
34
- )
35
- elif "local:" in url:
36
- local_repo_path_1 = url.split('local:')[1]
37
- loaders = []
38
- known_exts = [".py", ".md"]
39
- paths_by_ext = {}
40
- docs_by_ext = {}
41
- for ext in known_exts + ["other"]:
42
- docs_by_ext[ext] = []
43
- paths_by_ext[ext] = []
44
- for root, dirs, files in os.walk(local_repo_path_1):
45
- for file in files:
46
- file_path = os.path.join(root, file)
47
- rel_file_path = os.path.relpath(file_path, local_repo_path_1)
48
- for ext in paths_by_ext.keys():
49
- if '.' not in [i[0] for i in rel_file_path.split('/')]:
50
- if rel_file_path.endswith(ext):
51
- paths_by_ext[ext].append(rel_file_path)
52
- else:
53
- paths_by_ext["other"].append(rel_file_path)
54
 
55
- # for each extension, load the files and split them
56
- for ext in paths_by_ext.keys():
57
- for i in range(len(paths_by_ext[ext])):
58
- try:
59
- docs_by_ext[ext] += TextLoader(os.path.join(local_repo_path_1, paths_by_ext[ext][i])).load()
60
- except Exception as e:
61
- print(e)
62
- continue
63
- py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
64
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
65
- md_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
66
- for ext in docs_by_ext.keys():
67
- if ext == ".py":
68
- documents += py_splitter.split_documents(docs_by_ext[ext])
69
- elif ext == ".md":
70
- documents += md_splitter.split_documents(docs_by_ext[ext])
71
- else:
72
- documents += text_splitter.split_documents(docs_by_ext[ext])
73
  else:
74
- raise ValueError("No loader found for this url")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  except Exception as e:
76
  print(e)
77
  continue
 
6
  from langchain.vectorstores.faiss import FAISS
7
  import itertools
8
  import os
9
+ import fsspec
10
+ from pathlib import Path
11
 
12
  def get_text(content):
13
  relevant_part = content.find("div", {"class": "markdown"})
 
18
 
19
  def ingest_docs(urls=[]):
20
  """Get documents from web pages."""
21
+ folders=[]
22
  documents = []
 
 
 
 
23
  for url in urls:
24
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
+ if "local:" in url:
27
+ folders.append(url.split('local:')[1])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  else:
29
+ url = url[0]
30
+ if url[0] == '/':
31
+ url = url[1:]
32
+ if url[-1] != '/':
33
+ url += '/'
34
+ org = url.split('/')[0]
35
+ repo = url.split('/')[1]
36
+ # join all strings after 2nd slash
37
+ folder = '/'.join(url.split('/')[2:])
38
+ if folder[-1] != '/':
39
+ folder += '/'
40
+ fs = fsspec.filesystem("github", org=org, repo=repo)
41
+ # recursive copy
42
+ destination = url
43
+ destination.mkdir(exist_ok=True, parents=True)
44
+ fs.get(fs.ls(folder), destination.as_posix(), recursive=True)
45
+ folders.append(destination)
46
+ except Exception as e:
47
+ print(e)
48
+ for folder in folders:
49
+ try:
50
+ py_splitter = PythonCodeTextSplitter(chunk_size=1000, chunk_overlap=0)
51
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
52
+ md_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=0)
53
+ local_repo_path_1 = folder
54
+ known_exts = [".py", ".md", ".rst"]
55
+ paths_by_ext = {}
56
+ docs_by_ext = {}
57
+ for ext in known_exts + ["other"]:
58
+ docs_by_ext[ext] = []
59
+ paths_by_ext[ext] = []
60
+ for root, dirs, files in os.walk(local_repo_path_1):
61
+ for file in files:
62
+ file_path = os.path.join(root, file)
63
+ rel_file_path = os.path.relpath(file_path, local_repo_path_1)
64
+ for ext in paths_by_ext.keys():
65
+ if '.' not in [i[0] for i in rel_file_path.split('/')]:
66
+ if rel_file_path.endswith(ext):
67
+ paths_by_ext[ext].append(rel_file_path)
68
+ docs_by_ext[ext].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load())
69
+ else:
70
+ paths_by_ext["other"].append(rel_file_path)
71
+ docs_by_ext["other"].append(TextLoader(os.path.join(local_repo_path_1, rel_file_path)).load())
72
+
73
+ for ext in docs_by_ext.keys():
74
+ if ext == ".py":
75
+ documents += py_splitter.split_documents(docs_by_ext[ext])
76
+ elif ext == ".md" or ext == ".rst":
77
+ documents += md_splitter.split_documents(docs_by_ext[ext])
78
+ else:
79
+ documents += text_splitter.split_documents(docs_by_ext[ext])
80
  except Exception as e:
81
  print(e)
82
  continue