orion support (#11)
Browse files
* better tables
* black
* orion support
* black
- .gitignore +3 -0
- buster/data/document_embeddings.csv +0 -0
- buster/data/documents.csv +0 -0
- buster/docparser.py +7 -6
- requirements.txt +5 -6
.gitignore
CHANGED
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
1 |
# Byte-compiled / optimized / DLL files
|
2 |
__pycache__/
|
3 |
*.py[cod]
|
|
|
1 |
+
# Project specific stuff
|
2 |
+
buster/data/
|
3 |
+
|
4 |
# Byte-compiled / optimized / DLL files
|
5 |
__pycache__/
|
6 |
*.py[cod]
|
buster/data/document_embeddings.csv
DELETED
The diff for this file is too large to render.
See raw diff
|
|
buster/data/documents.csv
DELETED
The diff for this file is too large to render.
See raw diff
|
|
buster/docparser.py
CHANGED
@@ -12,7 +12,8 @@ EMBEDDING_MODEL = "text-embedding-ada-002"
|
|
12 |
EMBEDDING_ENCODING = "cl100k_base" # this the encoding for text-embedding-ada-002
|
13 |
|
14 |
|
15 |
-
|
|
|
16 |
|
17 |
|
18 |
def parse_section(nodes: list[bs4.element.NavigableString]) -> str:
|
@@ -28,13 +29,13 @@ def parse_section(nodes: list[bs4.element.NavigableString]) -> str:
|
|
28 |
return section
|
29 |
|
30 |
|
31 |
-
def get_all_documents(root_dir: str, max_section_length: int = 2000) -> pd.DataFrame:
|
32 |
"""Parse all HTML files in `root_dir`, and extract all sections.
|
33 |
|
34 |
Sections are broken into subsections if they are longer than `max_section_length`.
|
35 |
-
Sections correspond to
|
36 |
"""
|
37 |
-
files = glob.glob("
|
38 |
|
39 |
def get_all_subsections(soup: BeautifulSoup) -> tuple[list[str], list[str], list[str]]:
|
40 |
found = soup.find_all("a", href=True, class_="headerlink")
|
@@ -47,7 +48,7 @@ def get_all_documents(root_dir: str, max_section_length: int = 2000) -> pd.DataF
|
|
47 |
section_href = section_soup.find_all("a", href=True, class_="headerlink")
|
48 |
|
49 |
# If sections has subsections, keep only the part before the first subsection
|
50 |
-
if len(section_href) > 1:
|
51 |
section_siblings = list(section_soup.section.previous_siblings)[::-1]
|
52 |
section = parse_section(section_siblings)
|
53 |
else:
|
@@ -87,7 +88,7 @@ def get_all_documents(root_dir: str, max_section_length: int = 2000) -> pd.DataF
|
|
87 |
sections_file, urls_file, names_file = get_all_subsections(soup)
|
88 |
sections.extend(sections_file)
|
89 |
|
90 |
-
urls_file = [
|
91 |
urls.extend(urls_file)
|
92 |
|
93 |
names.extend(names_file)
|
|
|
12 |
EMBEDDING_ENCODING = "cl100k_base" # this the encoding for text-embedding-ada-002
|
13 |
|
14 |
|
15 |
+
BASE_URL_MILA = "https://docs.mila.quebec/"
|
16 |
+
BASE_URL_ORION = "https://orion.readthedocs.io/en/stable/"
|
17 |
|
18 |
|
19 |
def parse_section(nodes: list[bs4.element.NavigableString]) -> str:
|
|
|
29 |
return section
|
30 |
|
31 |
|
32 |
+
def get_all_documents(root_dir: str, base_url: str, max_section_length: int = 2000) -> pd.DataFrame:
|
33 |
"""Parse all HTML files in `root_dir`, and extract all sections.
|
34 |
|
35 |
Sections are broken into subsections if they are longer than `max_section_length`.
|
36 |
+
Sections correspond to `section` HTML tags that have a headerlink attached.
|
37 |
"""
|
38 |
+
files = glob.glob("**/*.html", root_dir=root_dir, recursive=True)
|
39 |
|
40 |
def get_all_subsections(soup: BeautifulSoup) -> tuple[list[str], list[str], list[str]]:
|
41 |
found = soup.find_all("a", href=True, class_="headerlink")
|
|
|
48 |
section_href = section_soup.find_all("a", href=True, class_="headerlink")
|
49 |
|
50 |
# If sections has subsections, keep only the part before the first subsection
|
51 |
+
if len(section_href) > 1 and section_soup.section is not None:
|
52 |
section_siblings = list(section_soup.section.previous_siblings)[::-1]
|
53 |
section = parse_section(section_siblings)
|
54 |
else:
|
|
|
88 |
sections_file, urls_file, names_file = get_all_subsections(soup)
|
89 |
sections.extend(sections_file)
|
90 |
|
91 |
+
urls_file = [base_url + os.path.basename(file.name) + url for url in urls_file]
|
92 |
urls.extend(urls_file)
|
93 |
|
94 |
names.extend(names_file)
|
requirements.txt
CHANGED
@@ -1,11 +1,10 @@
|
|
1 |
-
pandas
|
2 |
-
openai[embeddings]
|
3 |
bs4
|
|
|
4 |
numpy
|
5 |
-
tiktoken
|
6 |
-
openai
|
7 |
pandas
|
|
|
8 |
scikit-learn
|
|
|
9 |
tenacity
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
1 |
bs4
|
2 |
+
matplotlib
|
3 |
numpy
|
|
|
|
|
4 |
pandas
|
5 |
+
plotly
|
6 |
scikit-learn
|
7 |
+
tabulate
|
8 |
tenacity
|
9 |
+
tiktoken
|
10 |
+
openai[embeddings]
|