Spaces:
Runtime error
Runtime error
Pytorch (#17)
Browse files- buster/docparser.py +6 -2
buster/docparser.py
CHANGED
@@ -14,6 +14,7 @@ EMBEDDING_ENCODING = "cl100k_base" # this is the encoding for text-embedding-ada-002
|
|
14 |
|
15 |
BASE_URL_MILA = "https://docs.mila.quebec/"
|
16 |
BASE_URL_ORION = "https://orion.readthedocs.io/en/stable/"
|
|
|
17 |
|
18 |
|
19 |
PICKLE_EXTENSIONS = [".gz", ".bz2", ".zip", ".xz", ".zst", ".tar", ".tar.gz", ".tar.xz", ".tar.bz2"]
|
@@ -32,7 +33,9 @@ def parse_section(nodes: list[bs4.element.NavigableString]) -> str:
|
|
32 |
return section
|
33 |
|
34 |
|
35 |
-
def get_all_documents(
|
|
|
|
|
36 |
"""Parse all HTML files in `root_dir`, and extract all sections.
|
37 |
|
38 |
Sections are broken into subsections if they are longer than `max_section_length`.
|
@@ -74,7 +77,8 @@ def get_all_documents(root_dir: str, base_url: str, max_section_length: int = 20
|
|
74 |
sections.extend(section_chunks)
|
75 |
urls.extend(url_chunks)
|
76 |
names.extend(name_chunks)
|
77 |
-
|
|
|
78 |
sections.append(section)
|
79 |
urls.append(url)
|
80 |
names.append(name)
|
|
|
14 |
|
15 |
BASE_URL_MILA = "https://docs.mila.quebec/"
|
16 |
BASE_URL_ORION = "https://orion.readthedocs.io/en/stable/"
|
17 |
+
BASE_URL_PYTORCH = "https://pytorch.org/docs/stable/"
|
18 |
|
19 |
|
20 |
PICKLE_EXTENSIONS = [".gz", ".bz2", ".zip", ".xz", ".zst", ".tar", ".tar.gz", ".tar.xz", ".tar.bz2"]
|
|
|
33 |
return section
|
34 |
|
35 |
|
36 |
+
def get_all_documents(
|
37 |
+
root_dir: str, base_url: str, min_section_length: int = 100, max_section_length: int = 2000
|
38 |
+
) -> pd.DataFrame:
|
39 |
"""Parse all HTML files in `root_dir`, and extract all sections.
|
40 |
|
41 |
Sections are broken into subsections if they are longer than `max_section_length`.
|
|
|
77 |
sections.extend(section_chunks)
|
78 |
urls.extend(url_chunks)
|
79 |
names.extend(name_chunks)
|
80 |
+
# If text is not too short, add in 1 chunk
|
81 |
+
elif len(section) > min_section_length:
|
82 |
sections.append(section)
|
83 |
urls.append(url)
|
84 |
names.append(name)
|