jerpint committed
Commit f988598
2 parents: b140ffc eec81fa

Merge pull request #5 from jerpint/parse_docs

.gitignore ADDED
@@ -0,0 +1,137 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+albenchmark/data/
+
+# Ignore notebooks by default
+*.ipynb
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# VSCode
+.vscode/
buster/chatbot.py CHANGED
@@ -1,15 +1,16 @@
 import logging
-import pickle
 
 import numpy as np
 import openai
 import pandas as pd
-from docparser import EMBEDDING_MODEL
 from openai.embeddings_utils import cosine_similarity, get_embedding
 
+from buster.docparser import EMBEDDING_MODEL
+
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
+
 # search through the reviews for a specific product
 def rank_documents(df: pd.DataFrame, query: str, top_k: int = 3) -> pd.DataFrame:
     product_embedding = get_embedding(
@@ -33,7 +34,7 @@ def engineer_prompt(question: str, documents: list[str]) -> str:
 def get_gpt_response(question: str, df) -> str:
     # rank the documents, get the highest scoring doc and generate the prompt
     candidates = rank_documents(df, query=question, top_k=1)
-    documents = candidates.documents.to_list()
+    documents = candidates.text.to_list()
     prompt = engineer_prompt(question, documents)
 
     logger.info(f"querying GPT...")
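
For context on why `get_gpt_response` now reads `candidates.text`: the ranking step presumably scores each row of the documents DataFrame against the query embedding and keeps the best matches, and the renamed `text` column (formerly `documents`) holds the raw section text that feeds the prompt. A minimal sketch of such a ranking step, assuming a precomputed `embedding` column and the `openai.embeddings_utils` helpers already imported above (`rank_documents_sketch` is a hypothetical stand-in, not the function in this diff):

import pandas as pd
from openai.embeddings_utils import cosine_similarity, get_embedding

EMBEDDING_MODEL = "text-embedding-ada-002"  # same value as buster.docparser.EMBEDDING_MODEL


def rank_documents_sketch(df: pd.DataFrame, query: str, top_k: int = 3) -> pd.DataFrame:
    # Embed the query with the same model used to embed the documents.
    query_embedding = get_embedding(query, engine=EMBEDDING_MODEL)

    # Score every document against the query and keep the top_k best matches.
    scored = df.assign(similarity=df.embedding.apply(lambda e: cosine_similarity(e, query_embedding)))
    return scored.sort_values("similarity", ascending=False).head(top_k)


# Downstream use mirrors the diff: the top candidate's text feeds the prompt.
# candidates = rank_documents_sketch(df, query=question, top_k=1)
# documents = candidates.text.to_list()
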
buster/data/document_embeddings.csv CHANGED
The diff for this file is too large to render. See raw diff
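
As a hedged aside (this loader is not part of the commit; the path and the `embedding` column name are taken from `docparser.py` below), embeddings written out with `to_csv` come back as strings on reload, so any consumer of `document_embeddings.csv` would typically have to parse them before computing similarities:

import ast

import numpy as np
import pandas as pd


def load_embeddings_sketch(path: str = "buster/data/document_embeddings.csv") -> pd.DataFrame:
    # read_csv returns the embedding column as strings like "[0.01, -0.02, ...]";
    # convert each one back into a numeric array before using cosine similarity.
    df = pd.read_csv(path)
    df["embedding"] = df.embedding.apply(lambda s: np.array(ast.literal_eval(s)))
    return df
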
 
buster/data/{sections.pkl → documents.csv} RENAMED
Binary files a/buster/data/sections.pkl and b/buster/data/documents.csv differ
 
buster/docparser.py CHANGED
@@ -1,17 +1,20 @@
 import glob
+import math
 import os
-import pickle
 
 import pandas as pd
 import tiktoken
 from bs4 import BeautifulSoup
-from openai.embeddings_utils import cosine_similarity, get_embedding
+from openai.embeddings_utils import get_embedding
 
 EMBEDDING_MODEL = "text-embedding-ada-002"
 EMBEDDING_ENCODING = "cl100k_base"  # this the encoding for text-embedding-ada-002
 
 
-def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
+BASE_URL = "https://docs.mila.quebec/"
+
+
+def get_all_documents(root_dir: str, max_section_length: int = 3000) -> pd.DataFrame:
     """Parse all HTML files in `root_dir`, and extract all sections.
 
     Sections are broken into subsections if they are longer than `max_section_length`.
@@ -19,85 +22,106 @@ def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]
     """
     files = glob.glob("*.html", root_dir=root_dir)
 
-    selector = "section > section"
-
-    # Recurse until sections are small enough
-    def get_all_subsections(soup, selector: str) -> list[str]:
-        found = soup.select(selector)
-        data = [x.text.split(";")[-1].strip() for x in found]
+    def get_all_subsections(soup: BeautifulSoup) -> tuple[list[str], list[str], list[str]]:
+        found = soup.find_all("a", href=True, class_="headerlink")
 
         sections = []
-        for i, section in enumerate(data):
+        urls = []
+        names = []
+        for section_found in found:
+            section_soup = section_found.parent.parent
+            section_href = section_soup.find_all("a", href=True, class_="headerlink")
+
+            # If sections has subsections, keep only the part before the first subsection
+            if len(section_href) > 1:
+                section_siblings = section_soup.section.previous_siblings
+                section = [sibling.text for sibling in section_siblings]
+                section = "".join(section[::-1])[1:]
+            else:
+                section = section_soup.text[1:]
+
+            url = section_found["href"]
+            name = section_found.parent.text[:-1]
+
+            # If text is too long, split into chunks of equal sizes
             if len(section) > max_section_length:
-                sections.extend(get_all_subsections(found[i], selector + " > section"))
+                n_chunks = math.ceil(len(section) / float(max_section_length))
+                separator_index = math.floor(len(section) / n_chunks)
+
+                section_chunks = [section[separator_index * i : separator_index * (i + 1)] for i in range(n_chunks)]
+                url_chunks = [url] * n_chunks
+                name_chunks = [name] * n_chunks
+
+                sections.extend(section_chunks)
+                urls.extend(url_chunks)
+                names.extend(name_chunks)
             else:
                 sections.append(section)
+                urls.append(url)
+                names.append(name)
 
-        return sections
+        return sections, urls, names
 
     sections = []
+    urls = []
+    names = []
    for file in files:
         filepath = os.path.join(root_dir, file)
         with open(filepath, "r") as file:
             source = file.read()
 
         soup = BeautifulSoup(source, "html.parser")
-        sections.extend(get_all_subsections(soup, selector))
+        sections_file, urls_file, names_file = get_all_subsections(soup)
+        sections.extend(sections_file)
 
-    return sections
+        urls_file = [BASE_URL + os.path.basename(file.name) + url for url in urls_file]
+        urls.extend(urls_file)
+
+        names.extend(names_file)
+
+    documents_df = pd.DataFrame.from_dict({"name": names, "url": urls, "text": sections})
 
+    return documents_df
 
-def write_sections(filepath: str, sections: list[str]):
-    with open(filepath, "wb") as f:
-        pickle.dump(sections, f)
 
+def write_documents(filepath: str, documents_df: pd.DataFrame):
+    documents_df.to_csv(filepath, index=False)
 
-def read_sections(filepath: str) -> list[str]:
-    with open(filepath, "rb") as fp:
-        sections = pickle.load(fp)
 
-    return sections
-
-
-def load_documents(fname: str) -> pd.DataFrame:
-    df = pd.DataFrame()
-
-    with open(fname, "rb") as fp:
-        documents = pickle.load(fp)
-    df["documents"] = documents
-    return df
+def read_documents(filepath: str) -> pd.DataFrame:
+    return pd.read_csv(filepath)
 
 
 def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
     encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
-    df["n_tokens"] = df.documents.apply(lambda x: len(encoding.encode(x)))
+    df["n_tokens"] = df.text.apply(lambda x: len(encoding.encode(x)))
     return df
 
 
 def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
-    df["embedding"] = df.documents.apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
+    df["embedding"] = df.text.apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
     return df
 
 
 def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:
     # Get all documents and precompute their embeddings
-    df = load_documents(filepath)
+    df = read_documents(filepath)
     df = compute_n_tokens(df)
     df = precompute_embeddings(df)
-    df.to_csv(output_csv)
+    write_documents(output_csv, df)
     return df
 
 
 if __name__ == "__main__":
     root_dir = "/home/hadrien/perso/mila-docs/output/"
-    save_filepath = os.path.join(root_dir, "sections.pkl")
+    save_filepath = "data/documents.csv"
 
     # How to write
-    sections = get_all_sections(root_dir)
-    write_sections(save_filepath, sections)
+    documents_df = get_all_documents(root_dir)
+    write_documents(save_filepath, documents_df)
 
     # How to load
-    sections = read_sections(save_filepath)
+    documents_df = read_documents(save_filepath)
 
-    # precopmute the document embeddings
+    # precompute the document embeddings
     df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")
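
The main new behaviour in `get_all_documents` is the equal-size chunking of sections that exceed `max_section_length`. A minimal standalone sketch of that arithmetic, extracted for illustration only (`split_section` is a hypothetical helper, not a function in this commit):

import math


def split_section(section: str, max_section_length: int = 3000) -> list[str]:
    # Short sections pass through untouched.
    if len(section) <= max_section_length:
        return [section]

    # Same arithmetic as the diff: choose the number of chunks, then slice the
    # text into spans of (roughly) equal length.
    n_chunks = math.ceil(len(section) / float(max_section_length))
    separator_index = math.floor(len(section) / n_chunks)
    return [section[separator_index * i : separator_index * (i + 1)] for i in range(n_chunks)]


# Example: a 7000-character section with the default limit yields 3 chunks of
# 2333 characters each; the floor-based slicing drops the final character here,
# a small loss the original code shares.
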
requirements.txt CHANGED
@@ -1,4 +1,5 @@
-pandas
-openai
+bs4
 numpy
 tiktoken
+openai
+pandas