hbertrand committed on
Commit
8ef8e62
1 Parent(s): 4dcc0d8

Pytorch (#17)

Browse files
Files changed (1) hide show
  1. buster/docparser.py +6 -2
buster/docparser.py CHANGED
@@ -14,6 +14,7 @@ EMBEDDING_ENCODING = "cl100k_base" # this the encoding for text-embedding-ada-0
14
 
15
  BASE_URL_MILA = "https://docs.mila.quebec/"
16
  BASE_URL_ORION = "https://orion.readthedocs.io/en/stable/"
 
17
 
18
 
19
  PICKLE_EXTENSIONS = [".gz", ".bz2", ".zip", ".xz", ".zst", ".tar", ".tar.gz", ".tar.xz", ".tar.bz2"]
@@ -32,7 +33,9 @@ def parse_section(nodes: list[bs4.element.NavigableString]) -> str:
32
  return section
33
 
34
 
35
- def get_all_documents(root_dir: str, base_url: str, max_section_length: int = 2000) -> pd.DataFrame:
 
 
36
  """Parse all HTML files in `root_dir`, and extract all sections.
37
 
38
  Sections are broken into subsections if they are longer than `max_section_length`.
@@ -74,7 +77,8 @@ def get_all_documents(root_dir: str, base_url: str, max_section_length: int = 20
74
  sections.extend(section_chunks)
75
  urls.extend(url_chunks)
76
  names.extend(name_chunks)
77
- else:
 
78
  sections.append(section)
79
  urls.append(url)
80
  names.append(name)
14
 
15
  BASE_URL_MILA = "https://docs.mila.quebec/"
16
  BASE_URL_ORION = "https://orion.readthedocs.io/en/stable/"
17
+ BASE_URL_PYTORCH = "https://pytorch.org/docs/stable/"
18
 
19
 
20
  PICKLE_EXTENSIONS = [".gz", ".bz2", ".zip", ".xz", ".zst", ".tar", ".tar.gz", ".tar.xz", ".tar.bz2"]
33
  return section
34
 
35
 
36
+ def get_all_documents(
37
+ root_dir: str, base_url: str, min_section_length: int = 100, max_section_length: int = 2000
38
+ ) -> pd.DataFrame:
39
  """Parse all HTML files in `root_dir`, and extract all sections.
40
 
41
  Sections are broken into subsections if they are longer than `max_section_length`.
77
  sections.extend(section_chunks)
78
  urls.extend(url_chunks)
79
  names.extend(name_chunks)
80
+ # If text is not too short, add in 1 chunk
81
+ elif len(section) > min_section_length:
82
  sections.append(section)
83
  urls.append(url)
84
  names.append(name)