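"""Parse HTML documentation into sections, embed each section with OpenAI's
text-embedding-ada-002 model, and save the results to a documents database."""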
import glob
import logging
import os
from pathlib import Path
from typing import Optional, Type

import click
import numpy as np
import pandas as pd
import tiktoken
from bs4 import BeautifulSoup
from openai.embeddings_utils import get_embedding

from buster.parser import HuggingfaceParser, Parser, SphinxParser
from buster.utils import get_documents_manager_from_extension

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-ada-002
supported_docs = {
    "mila": {
        "base_url": "https://docs.mila.quebec/",
        "filename": "documents_mila.csv",
        "parser": SphinxParser,
    },
    "orion": {
        "base_url": "https://orion.readthedocs.io/en/stable/",
        "filename": "documents_orion.csv",
        "parser": SphinxParser,
    },
    "pytorch": {
        "base_url": "https://pytorch.org/docs/stable/",
        "filename": "documents_pytorch.csv",
        "parser": SphinxParser,
    },
    "huggingface": {
        "base_url": "https://huggingface.co/docs/transformers/",
        "filename": "documents_huggingface.csv",
        "parser": HuggingfaceParser,
    },
    "lightning": {
        "base_url": "https://pytorch-lightning.readthedocs.io/en/stable/",
        "filename": "documents_lightning.csv",
        "parser": SphinxParser,
    },
    "godot": {
        "base_url": "https://docs.godotengine.org/en/stable/",
        "filename": "documents_godot.csv",
        "parser": SphinxParser,
    },
}
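
# Illustrative sketch (kept commented out so nothing runs at import time):
# supporting a new documentation set only needs another `supported_docs` entry.
# The "numpy" source below is hypothetical, not one of the supported sources.
#
#   supported_docs["numpy"] = {
#       "base_url": "https://numpy.org/doc/stable/",
#       "filename": "documents_numpy.csv",
#       "parser": SphinxParser,
#   }
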
def get_document(
    filepath: str,
    base_url: str,
    parser_cls: Type[Parser],
    min_section_length: int = 100,
    max_section_length: int = 2000,
) -> pd.DataFrame:
    """Extract all sections from one file.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to `section` HTML tags that have a headerlink attached.
    """
    with open(filepath, "r") as f:
        source = f.read()

    filename = Path(filepath).name
    soup = BeautifulSoup(source, "html.parser")
    parser = parser_cls(soup, base_url, filename, min_section_length, max_section_length)

    sections = []
    urls = []
    names = []
    for section in parser.parse():
        sections.append(section.text)
        urls.append(section.url)
        names.append(section.name)

    documents_df = pd.DataFrame.from_dict({"title": names, "url": urls, "content": sections})

    return documents_df

def get_all_documents(
    root_dir: str,
    base_url: str,
    parser_cls: Type[Parser],
    min_section_length: int = 100,
    max_section_length: int = 2000,
) -> pd.DataFrame:
    """Parse all HTML files in `root_dir`, and extract all sections.

    Sections are broken into subsections if they are longer than `max_section_length`.
    Sections correspond to `section` HTML tags that have a headerlink attached.
    """
    files = glob.glob("**/*.html", root_dir=root_dir, recursive=True)

    dfs = []
    for file in files:
        filepath = os.path.join(root_dir, file)
        df = get_document(filepath, base_url, parser_cls, min_section_length, max_section_length)
        dfs.append(df)

    documents_df = pd.concat(dfs, ignore_index=True)

    return documents_df
def compute_n_tokens(
    df: pd.DataFrame, embedding_encoding: str = EMBEDDING_ENCODING, col: str = "content"
) -> pd.DataFrame:
    """Count the tokens in `col` and add the counts to an `n_tokens` column."""
    logger.info("Computing token counts...")
    encoding = tiktoken.get_encoding(encoding_name=embedding_encoding)
    # TODO are there unexpected consequences of allowing endoftext?
    df["n_tokens"] = df[col].apply(lambda x: len(encoding.encode(x, allowed_special={"<|endoftext|>"})))
    return df
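
# Illustrative sketch (commented out): what the token count means for a single
# string, using the same cl100k_base encoding as above.
#
#   encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
#   n_tokens = len(encoding.encode("How do I submit a job on the cluster?"))
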
def max_word_count(df: pd.DataFrame, max_words: int, col: str = "content") -> pd.DataFrame:
    """Trim each entry of `col` to at most `max_words` words."""
    assert df[col].apply(lambda s: isinstance(s, str)).all(), f"Column {col} must contain only strings"
    word_counts_before = df[col].apply(lambda x: len(x.split()))
    df[col] = df[col].apply(lambda x: " ".join(x.split()[:max_words]))
    word_counts_after = df[col].apply(lambda x: len(x.split()))

    # Entries whose word count changed are the ones that were actually trimmed.
    trimmed = df[word_counts_before != word_counts_after]
    logger.info(f"Trimmed {len(trimmed)} documents to {max_words} words.")

    return df

def compute_embeddings(df: pd.DataFrame, engine: str = EMBEDDING_MODEL, col="embedding") -> pd.DataFrame:
    """Embed each document's content with `engine` and store the vector in `col`."""
    logger.info(f"Computing embeddings for {len(df)} documents...")
    df[col] = df.content.apply(lambda x: np.asarray(get_embedding(x, engine=engine), dtype=np.float32))
    logger.info(f"Done computing embeddings for {len(df)} documents.")
    return df
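
# Note: `get_embedding` comes from the legacy (pre-1.0) `openai` package's
# `embeddings_utils` module and requires an OpenAI API key to be configured.
# text-embedding-ada-002 returns 1536-dimensional vectors, so each row of the
# embedding column ends up holding a float32 array of shape (1536,).
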
def generate_embeddings_parser(root_dir: str, output_filepath: str, source: str) -> pd.DataFrame:
    """Parse all HTML files of a supported `source` under `root_dir`, then embed and save them."""
    documents = get_all_documents(root_dir, supported_docs[source]["base_url"], supported_docs[source]["parser"])
    return generate_embeddings(documents, output_filepath)

def documents_to_db(documents: pd.DataFrame, output_filepath: str):
    """Save the documents to the database at `output_filepath`, grouped by source."""
    logger.info("Preparing database...")
    documents_manager = get_documents_manager_from_extension(output_filepath)(output_filepath)
    sources = documents["source"].unique()
    for source in sources:
        # Add only the rows belonging to the current source.
        documents_manager.add(source, documents[documents["source"] == source])
    logger.info(f"Documents saved to: {output_filepath}")

def update_source(source: str, output_filepath: str, display_name: Optional[str] = None, note: Optional[str] = None):
    """Update a source's display name and note in the database at `output_filepath`."""
    documents_manager = get_documents_manager_from_extension(output_filepath)(output_filepath)
    documents_manager.update_source(source, display_name, note)

def generate_embeddings(
    documents: pd.DataFrame,
    output_filepath: str = "documents.db",
    max_words=500,
    embedding_engine: str = EMBEDDING_MODEL,
) -> pd.DataFrame:
    # check that we have the appropriate columns in our dataframe
    assert set(required_cols := ["content", "title", "url"]).issubset(
        set(documents.columns)
    ), f"Your dataframe must contain {required_cols}."

    # Get all documents and precompute their embeddings
    documents = max_word_count(documents, max_words=max_words)
    documents = compute_n_tokens(documents)
    documents = compute_embeddings(documents, engine=embedding_engine)

    # save the documents to a db for later use
    documents_to_db(documents, output_filepath)

    return documents

# The click decorators below are a reconstruction (the argument and option names
# are assumed); they let `main()` be invoked without arguments from the
# `__main__` block, since `click` is imported but no CLI wiring survived here.
@click.command()
@click.argument("documents_csv")
@click.option("--output-filepath", default="documents.db")
@click.option("--max-words", default=500)
@click.option("--embeddings-engine", default=EMBEDDING_MODEL)
def main(documents_csv: str, output_filepath: str, max_words: int, embeddings_engine: str):
    documents = pd.read_csv(documents_csv)
    documents = generate_embeddings(documents, output_filepath, max_words, embeddings_engine)


if __name__ == "__main__":
    main()
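
# Illustrative invocation (commented out; the script name and the CLI options
# above are assumptions, not confirmed against the original project):
#
#   python generate_embeddings.py documents_mila.csv \
#       --output-filepath documents.db \
#       --max-words 500 \
#       --embeddings-engine text-embedding-ada-002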