hbertrand committed
Commit 8756061
1 Parent(s): 413b78d

orion support (#11)


* better tables

* black

* orion support

* black

.gitignore CHANGED
@@ -1,3 +1,6 @@
+# Project specific stuff
+buster/data/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
buster/data/document_embeddings.csv DELETED
The diff for this file is too large to render.
 
buster/data/documents.csv DELETED
The diff for this file is too large to render.
 
buster/docparser.py CHANGED
@@ -12,7 +12,8 @@ EMBEDDING_MODEL = "text-embedding-ada-002"
 EMBEDDING_ENCODING = "cl100k_base"  # this the encoding for text-embedding-ada-002
 
 
-BASE_URL = "https://docs.mila.quebec/"
+BASE_URL_MILA = "https://docs.mila.quebec/"
+BASE_URL_ORION = "https://orion.readthedocs.io/en/stable/"
 
 
 def parse_section(nodes: list[bs4.element.NavigableString]) -> str:
@@ -28,13 +29,13 @@ def parse_section(nodes: list[bs4.element.NavigableString]) -> str:
     return section
 
 
-def get_all_documents(root_dir: str, max_section_length: int = 2000) -> pd.DataFrame:
+def get_all_documents(root_dir: str, base_url: str, max_section_length: int = 2000) -> pd.DataFrame:
     """Parse all HTML files in `root_dir`, and extract all sections.
 
     Sections are broken into subsections if they are longer than `max_section_length`.
-    Sections correspond to h2 HTML tags, and move on to h3 then h4 if needed.
+    Sections correspond to `section` HTML tags that have a headerlink attached.
     """
-    files = glob.glob("*.html", root_dir=root_dir)
+    files = glob.glob("**/*.html", root_dir=root_dir, recursive=True)
 
     def get_all_subsections(soup: BeautifulSoup) -> tuple[list[str], list[str], list[str]]:
         found = soup.find_all("a", href=True, class_="headerlink")
@@ -47,7 +48,7 @@ def get_all_documents(root_dir: str, max_section_length: int = 2000) -> pd.DataF
             section_href = section_soup.find_all("a", href=True, class_="headerlink")
 
             # If sections has subsections, keep only the part before the first subsection
-            if len(section_href) > 1:
+            if len(section_href) > 1 and section_soup.section is not None:
                 section_siblings = list(section_soup.section.previous_siblings)[::-1]
                 section = parse_section(section_siblings)
             else:
@@ -87,7 +88,7 @@ def get_all_documents(root_dir: str, max_section_length: int = 2000) -> pd.DataF
         sections_file, urls_file, names_file = get_all_subsections(soup)
         sections.extend(sections_file)
 
-        urls_file = [BASE_URL + os.path.basename(file.name) + url for url in urls_file]
+        urls_file = [base_url + os.path.basename(file.name) + url for url in urls_file]
         urls.extend(urls_file)
 
         names.extend(names_file)
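
Taken together, these changes let the same parser build documents for either documentation set. A minimal usage sketch follows, assuming the module is importable as buster.docparser and that the HTML trees have already been downloaded into the hypothetical outputs/ directories; embedding and saving steps are omitted.

# Sketch only: the directory names and the Mila/Orion split below are
# illustrative assumptions, not part of this commit.
from buster.docparser import BASE_URL_MILA, BASE_URL_ORION, get_all_documents

# glob("**/*.html", root_dir=..., recursive=True) now also picks up HTML files
# in subdirectories (root_dir requires Python 3.10+), and `base_url` is
# prepended to each file name and headerlink anchor to rebuild full section URLs.
mila_docs = get_all_documents("outputs/mila_docs/", BASE_URL_MILA)
orion_docs = get_all_documents("outputs/orion_docs/", BASE_URL_ORION)

print(f"Parsed {len(mila_docs)} Mila sections and {len(orion_docs)} Orion sections.")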
requirements.txt CHANGED
@@ -1,11 +1,10 @@
-pandas
-openai[embeddings]
 bs4
+matplotlib
 numpy
-tiktoken
-openai
 pandas
+plotly
 scikit-learn
+tabulate
 tenacity
-matplotlib
-plotly
+tiktoken
+openai[embeddings]
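
The list is now deduplicated and alphabetized, with tabulate added as a new dependency. One plausible reading of the "better tables" item, offered here only as an assumption, is that tables are rendered from pandas DataFrames, since DataFrame.to_markdown() requires tabulate to be installed:

# Assumption: "better tables" refers to markdown-formatted DataFrames; the
# tabulate package added above is what DataFrame.to_markdown() relies on.
import pandas as pd

df = pd.DataFrame({"name": ["Example section"], "url": ["https://docs.mila.quebec/"]})
print(df.to_markdown(index=False))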