hbertrand committed
Commit 0b4f7e4
1 Parent(s): 90ae9dd

parsing names

Files changed (2)
  1. buster/docparser.py +49 -35
  2. requirements.txt +0 -3
buster/docparser.py CHANGED
@@ -1,12 +1,13 @@
 import glob
+import math
 import os
-import pickle

 import pandas as pd
 import tiktoken
 from bs4 import BeautifulSoup
 from openai.embeddings_utils import cosine_similarity, get_embedding

+
 EMBEDDING_MODEL = "text-embedding-ada-002"
 EMBEDDING_ENCODING = "cl100k_base" # this the encoding for text-embedding-ada-002

@@ -14,7 +15,7 @@ EMBEDDING_ENCODING = "cl100k_base" # this the encoding for text-embedding-ada-0
 BASE_URL = "https://docs.mila.quebec/"


-def get_all_sections(root_dir: str, max_section_length: int = 3000) -> tuple[list[str], list[str]]:
+def get_all_documents(root_dir: str, max_section_length: int = 3000) -> pd.DataFrame:
     """Parse all HTML files in `root_dir`, and extract all sections.

     Sections are broken into subsections if they are longer than `max_section_length`.
@@ -22,66 +23,78 @@ def get_all_sections(root_dir: str, max_section_length: int = 3000) -> tuple[lis
     """
     files = glob.glob("*.html", root_dir=root_dir)

-    # Recurse until sections are small enough
-    def get_all_subsections(soup: BeautifulSoup, level: int) -> tuple[list[str], list[str]]:
-        if level >= 5:
-            return [], []
-
+    def get_all_subsections(soup: BeautifulSoup) -> tuple[list[str], list[str], list[str]]:
         found = soup.find_all('a', href=True, class_="headerlink")

         sections = []
         urls = []
+        names = []
         for section_found in found:
             section_soup = section_found.parent.parent
-            section = section_soup.text
+            section_href = section_soup.find_all('a', href=True, class_="headerlink")
+
+            # If sections has subsections, keep only the part before the first subsection
+            if len(section_href) > 1:
+                section_siblings = section_soup.section.previous_siblings
+                section = [sibling.text for sibling in section_siblings]
+                section = ''.join(section[::-1])[1:]
+            else:
+                section = section_soup.text[1:]
+
             url = section_found['href']
+            name = section_found.parent.text[:-1]

+            # If text is too long, split into chunks of equal sizes
             if len(section) > max_section_length:
-                s, u = get_all_subsections(section_soup, level + 1)
-                sections.extend(s)
-                urls.extend(u)
+                n_chunks = math.ceil(len(section) / float(max_section_length))
+                separator_index = math.floor(len(section) / n_chunks)
+
+                section_chunks = [section[separator_index * i: separator_index * (i + 1)] for i in range(n_chunks)]
+                url_chunks = [url] * n_chunks
+                name_chunks = [name] * n_chunks
+
+                sections.extend(section_chunks)
+                urls.extend(url_chunks)
+                names.extend(name_chunks)
             else:
                 sections.append(section)
                 urls.append(url)
+                names.append(name)

-        return sections, urls
+        return sections, urls, names

     sections = []
     urls = []
+    names = []
     for file in files:
         filepath = os.path.join(root_dir, file)
         with open(filepath, "r") as file:
             source = file.read()

         soup = BeautifulSoup(source, "html.parser")
-        sections_file, urls_file = get_all_subsections(soup, 2)
+        sections_file, urls_file, names_file = get_all_subsections(soup)
         sections.extend(sections_file)

         urls_file = [BASE_URL + os.path.basename(file.name) + url for url in urls_file]
         urls.extend(urls_file)

-    return sections, urls
-
+        names.extend(names_file)

-def write_sections(filepath: str, sections: list[str]):
-    with open(filepath, "wb") as f:
-        pickle.dump(sections, f)
+    documents_df = pd.DataFrame.from_dict({
+        'name': names,
+        'url': urls,
+        'text': sections
+    })

+    return documents_df

-def read_sections(filepath: str) -> list[str]:
-    with open(filepath, "rb") as fp:
-        sections = pickle.load(fp)

-    return sections
+def write_documents(filepath: str, documents_df: pd.DataFrame):
+    documents_df.to_csv(filepath)


-def load_documents(fname: str) -> pd.DataFrame:
-    df = pd.DataFrame()
-
-    with open(fname, "rb") as fp:
-        documents = pickle.load(fp)
-    df["documents"] = documents
-    return df
+def read_documents(filepath: str) -> pd.DataFrame:
+    return pd.read_csv(filepath)


 def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
@@ -97,7 +110,7 @@ def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:

 def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:
     # Get all documents and precompute their embeddings
-    df = load_documents(filepath)
+    df = read_documents(filepath)['text']
     df = compute_n_tokens(df)
     df = precompute_embeddings(df)
     df.to_csv(output_csv)
@@ -106,14 +119,15 @@ def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:

 if __name__ == "__main__":
     root_dir = "/home/hadrien/perso/mila-docs/output/"
-    save_filepath = os.path.join(root_dir, "sections.pkl")
+    save_filepath = os.path.join(root_dir, "documents.csv")

     # How to write
-    sections = get_all_sections(root_dir)
-    write_sections(save_filepath, sections)
+    documents_df = get_all_documents(root_dir)
+    write_documents(save_filepath, documents_df)

     # How to load
-    sections = read_sections(save_filepath)
+    documents_df = read_documents(save_filepath)

-    # precopmute the document embeddings
+    # precompute the document embeddings
     df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")
+
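
The new `get_all_subsections` splits any section longer than `max_section_length` into roughly equal chunks instead of recursing into sub-headings. A minimal standalone sketch of that arithmetic (the `split_section` helper name is hypothetical, not part of the commit):

import math

def split_section(section: str, max_section_length: int = 3000) -> list[str]:
    # Same arithmetic as the diff above: pick the number of chunks,
    # then slice the text into pieces of that size.
    n_chunks = math.ceil(len(section) / float(max_section_length))
    separator_index = math.floor(len(section) / n_chunks)
    return [section[separator_index * i: separator_index * (i + 1)] for i in range(n_chunks)]

# A 7000-character section with the default limit yields 3 chunks of 2333 characters each.
print([len(chunk) for chunk in split_section("x" * 7000)])  # [2333, 2333, 2333]
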
requirements.txt CHANGED
@@ -1,8 +1,5 @@
 bs4
 numpy
-<<<<<<< HEAD
 tiktoken
-=======
 openai
 pandas
->>>>>>> fe2ece9 (parsing urls)
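
The commit replaces the pickled list of sections with a CSV holding one row per document ('name', 'url', 'text'). A minimal round-trip sketch of that format, using made-up example values and a throwaway path:

import pandas as pd

# Example rows only; the real frame comes from get_all_documents().
documents_df = pd.DataFrame.from_dict({
    'name': ['Quick Start'],
    'url': ['https://docs.mila.quebec/Userguide.html#quick-start'],
    'text': ['Users can connect to the cluster ...'],
})

documents_df.to_csv("documents.csv")   # what write_documents() does
loaded = pd.read_csv("documents.csv")  # what read_documents() does
print(loaded[['name', 'url', 'text']])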