Spaces:

jerpint
/

buster

Running

App Files Files Community

jerpint committed on Jan 25, 2023

Commit

f988598

•

2 Parent(s): b140ffc eec81fa

Merge pull request #5 from jerpint/parse_docs

Browse files

Files changed (6) hide show

.gitignore +137 -0
buster/chatbot.py +4 -3
buster/data/document_embeddings.csv +0 -0
buster/data/{sections.pkl → documents.csv} +0 -0
buster/docparser.py +61 -37
requirements.txt +3 -2

.gitignore ADDED Viewed

	@@ -0,0 +1,137 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+albenchmark/data/
+# Ignore notebooks by default
+*.ipynb
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# VSCode
+.vscode/

buster/chatbot.py CHANGED Viewed

@@ -1,15 +1,16 @@
 import logging
-import pickle
 import numpy as np
 import openai
 import pandas as pd
-from docparser import EMBEDDING_MODEL
 from openai.embeddings_utils import cosine_similarity, get_embedding
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 # search through the reviews for a specific product
 def rank_documents(df: pd.DataFrame, query: str, top_k: int = 3) -> pd.DataFrame:
     product_embedding = get_embedding(
@@ -33,7 +34,7 @@ def engineer_prompt(question: str, documents: list[str]) -> str:
 def get_gpt_response(question: str, df) -> str:
     # rank the documents, get the highest scoring doc and generate the prompt
     candidates = rank_documents(df, query=question, top_k=1)
-    documents = candidates.documents.to_list()
     prompt = engineer_prompt(question, documents)
     logger.info(f"querying GPT...")

 import logging
 import numpy as np
 import openai
 import pandas as pd
 from openai.embeddings_utils import cosine_similarity, get_embedding
+from buster.docparser import EMBEDDING_MODEL
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 # search through the reviews for a specific product
 def rank_documents(df: pd.DataFrame, query: str, top_k: int = 3) -> pd.DataFrame:
     product_embedding = get_embedding(
 def get_gpt_response(question: str, df) -> str:
     # rank the documents, get the highest scoring doc and generate the prompt
     candidates = rank_documents(df, query=question, top_k=1)
+    documents = candidates.text.to_list()
     prompt = engineer_prompt(question, documents)
     logger.info(f"querying GPT...")

buster/data/document_embeddings.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

buster/data/{sections.pkl → documents.csv} RENAMED Viewed

Binary files a/buster/data/sections.pkl and b/buster/data/documents.csv differ

buster/docparser.py CHANGED Viewed

@@ -1,17 +1,20 @@
 import glob
 import os
-import pickle
 import pandas as pd
 import tiktoken
 from bs4 import BeautifulSoup
-from openai.embeddings_utils import cosine_similarity, get_embedding
 EMBEDDING_MODEL = "text-embedding-ada-002"
 EMBEDDING_ENCODING = "cl100k_base"  # this the encoding for text-embedding-ada-002
-def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
     """Parse all HTML files in `root_dir`, and extract all sections.
     Sections are broken into subsections if they are longer than `max_section_length`.
@@ -19,85 +22,106 @@ def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]
     """
     files = glob.glob("*.html", root_dir=root_dir)
-    selector = "section > section"
-    # Recurse until sections are small enough
-    def get_all_subsections(soup, selector: str) -> list[str]:
-        found = soup.select(selector)
-        data = [x.text.split(";")[-1].strip() for x in found]
         sections = []
-        for i, section in enumerate(data):
             if len(section) > max_section_length:
-                sections.extend(get_all_subsections(found[i], selector + " > section"))
             else:
                 sections.append(section)
-        return sections
     sections = []
     for file in files:
         filepath = os.path.join(root_dir, file)
         with open(filepath, "r") as file:
             source = file.read()
         soup = BeautifulSoup(source, "html.parser")
-        sections.extend(get_all_subsections(soup, selector))
-    return sections
-def write_sections(filepath: str, sections: list[str]):
-    with open(filepath, "wb") as f:
-        pickle.dump(sections, f)
-def read_sections(filepath: str) -> list[str]:
-    with open(filepath, "rb") as fp:
-        sections = pickle.load(fp)
-    return sections
-def load_documents(fname: str) -> pd.DataFrame:
-    df = pd.DataFrame()
-    with open(fname, "rb") as fp:
-        documents = pickle.load(fp)
-    df["documents"] = documents
-    return df
 def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
     encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
-    df["n_tokens"] = df.documents.apply(lambda x: len(encoding.encode(x)))
     return df
 def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
-    df["embedding"] = df.documents.apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
     return df
 def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:
     # Get all documents and precompute their embeddings
-    df = load_documents(filepath)
     df = compute_n_tokens(df)
     df = precompute_embeddings(df)
-    df.to_csv(output_csv)
     return df
 if __name__ == "__main__":
     root_dir = "/home/hadrien/perso/mila-docs/output/"
-    save_filepath = os.path.join(root_dir, "sections.pkl")
     # How to write
-    sections = get_all_sections(root_dir)
-    write_sections(save_filepath, sections)
     # How to load
-    sections = read_sections(save_filepath)
-    # precopmute the document embeddings
     df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")

 import glob
+import math
 import os
 import pandas as pd
 import tiktoken
 from bs4 import BeautifulSoup
+from openai.embeddings_utils import get_embedding
 EMBEDDING_MODEL = "text-embedding-ada-002"  # passed as `engine` to get_embedding
 EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-ada-002
+BASE_URL = "https://docs.mila.quebec/"  # prefix joined with the page filename and anchor to build section URLs
def _split_into_chunks(text: str, max_length: int) -> list[str]:
    """Split `text` into near-equal chunks that are each at most `max_length` long.

    Text that already fits is returned as a single-element list. Every
    character lands in exactly one chunk, so joining the chunks gives back
    `text` unchanged.
    """
    if len(text) <= max_length:
        return [text]
    n_chunks = math.ceil(len(text) / max_length)
    # Round the chunk size *up*: rounding down would leave a remainder at the
    # end of the text that no chunk covers.
    chunk_size = math.ceil(len(text) / n_chunks)
    return [text[start : start + chunk_size] for start in range(0, len(text), chunk_size)]


def _extract_sections(soup: "BeautifulSoup", max_section_length: int) -> tuple[list[str], list[str], list[str]]:
    """Return parallel lists (texts, anchors, names), one entry per chunk of each section in `soup`."""
    sections = []
    urls = []
    names = []
    for header_link in soup.find_all("a", href=True, class_="headerlink"):
        # The header link sits inside a heading, which sits inside its section.
        section_soup = header_link.parent.parent
        nested_links = section_soup.find_all("a", href=True, class_="headerlink")
        first_subsection = section_soup.section

        # If the section has subsections, keep only the part before the first
        # subsection. Without a nested <section> tag there is nothing to cut
        # at, so fall back to the whole text instead of failing on None.
        if len(nested_links) > 1 and first_subsection is not None:
            # previous_siblings iterates backwards, hence the reversal.
            preceding = [sibling.text for sibling in first_subsection.previous_siblings]
            text = "".join(reversed(preceding))[1:]
        else:
            # NOTE(review): [1:] presumably drops a leading newline — confirm.
            text = section_soup.text[1:]

        url = header_link["href"]
        # NOTE(review): [:-1] presumably drops the header-link symbol — confirm.
        name = header_link.parent.text[:-1]

        # Long sections become several rows sharing the same url and name.
        chunks = _split_into_chunks(text, max_section_length)
        sections.extend(chunks)
        urls.extend([url] * len(chunks))
        names.extend([name] * len(chunks))
    return sections, urls, names


def get_all_documents(root_dir: str, max_section_length: int = 3000) -> pd.DataFrame:
    """Parse all HTML files in `root_dir`, and extract all sections.

    Sections longer than `max_section_length` characters are split into
    near-equal chunks; no text is dropped by the split.

    Args:
        root_dir: Directory whose `*.html` files are parsed (not recursive).
        max_section_length: Maximum number of characters per returned text.

    Returns:
        A DataFrame with one row per section chunk and the columns `name`
        (section title), `url` (BASE_URL + file name + anchor) and `text`.

    Raises:
        ValueError: If `max_section_length` is not positive.
    """
    if max_section_length <= 0:
        raise ValueError("max_section_length must be a positive integer")

    sections = []
    urls = []
    names = []
    # Escape root_dir so glob metacharacters in the directory name stay
    # literal; sort so the row order is reproducible across runs.
    pattern = os.path.join(glob.escape(root_dir), "*.html")
    for filepath in sorted(glob.glob(pattern)):
        # NOTE(review): assumes the HTML pages are UTF-8 encoded — confirm.
        with open(filepath, "r", encoding="utf-8") as html_file:
            source = html_file.read()

        soup = BeautifulSoup(source, "html.parser")
        sections_file, anchors_file, names_file = _extract_sections(soup, max_section_length)

        page_url = BASE_URL + os.path.basename(filepath)
        sections.extend(sections_file)
        urls.extend(page_url + anchor for anchor in anchors_file)
        names.extend(names_file)

    return pd.DataFrame.from_dict({"name": names, "url": urls, "text": sections})
def write_documents(filepath: str, documents_df: pd.DataFrame):
    """Save `documents_df` as a CSV file at `filepath`, leaving out the index column."""
    documents_df.to_csv(path_or_buf=filepath, index=False)
def read_documents(filepath: str) -> pd.DataFrame:
    """Load a documents CSV from `filepath` into a DataFrame.

    String cells are returned exactly as they appear in the file: an empty
    `text` stays `""`, and section names or texts such as "NA", "null" or
    "None" stay strings rather than becoming float NaN.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The parsed DataFrame.
    """
    # keep_default_na=False turns off pandas' default NA-string conversion, so
    # downstream per-row string processing never receives a float NaN.
    return pd.read_csv(filepath, keep_default_na=False)
 def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
     encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
+    df["n_tokens"] = df.text.apply(lambda x: len(encoding.encode(x)))
     return df
 def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:
+    df["embedding"] = df.text.apply(lambda x: get_embedding(x, engine=EMBEDDING_MODEL))
     return df
 def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:
     # Get all documents and precompute their embeddings
+    df = read_documents(filepath)
     df = compute_n_tokens(df)
     df = precompute_embeddings(df)
+    write_documents(output_csv, df)
     return df
 if __name__ == "__main__":
     root_dir = "/home/hadrien/perso/mila-docs/output/"
+    save_filepath = "data/documents.csv"
     # How to write
+    documents_df = get_all_documents(root_dir)
+    write_documents(save_filepath, documents_df)
     # How to load
+    documents_df = read_documents(save_filepath)
+    # precompute the document embeddings
     df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
-pandas
-openai
 numpy
 tiktoken

+bs4
 numpy
 tiktoken
+openai
+pandas