hbertrand committed
Commit 0b4f7e4
1 Parent(s): 90ae9dd

parsing names

Files changed (2)
  1. buster/docparser.py +49 -35
  2. requirements.txt +0 -3
buster/docparser.py CHANGED
@@ -1,12 +1,13 @@
 import glob
+import math
 import os
-import pickle

 import pandas as pd
 import tiktoken
 from bs4 import BeautifulSoup
 from openai.embeddings_utils import cosine_similarity, get_embedding

+
 EMBEDDING_MODEL = "text-embedding-ada-002"
 EMBEDDING_ENCODING = "cl100k_base" # this the encoding for text-embedding-ada-002

@@ -14,7 +15,7 @@ EMBEDDING_ENCODING = "cl100k_base" # this the encoding for text-embedding-ada-0
 BASE_URL = "https://docs.mila.quebec/"


-def get_all_sections(root_dir: str, max_section_length: int = 3000) -> tuple[list[str], list[str]]:
+def get_all_documents(root_dir: str, max_section_length: int = 3000) -> pd.DataFrame:
     """Parse all HTML files in `root_dir`, and extract all sections.

     Sections are broken into subsections if they are longer than `max_section_length`.
@@ -22,66 +23,78 @@ def get_all_sections(root_dir: str, max_section_length: int = 3000) -> tuple[lis
     """
     files = glob.glob("*.html", root_dir=root_dir)

-    # Recurse until sections are small enough
-    def get_all_subsections(soup: BeautifulSoup, level: int) -> tuple[list[str], list[str]]:
-        if level >= 5:
-            return [], []
-
+    def get_all_subsections(soup: BeautifulSoup) -> tuple[list[str], list[str], list[str]]:
         found = soup.find_all('a', href=True, class_="headerlink")

         sections = []
         urls = []
+        names = []
         for section_found in found:
             section_soup = section_found.parent.parent
-            section = section_soup.text
+            section_href = section_soup.find_all('a', href=True, class_="headerlink")
+
+            # If sections has subsections, keep only the part before the first subsection
+            if len(section_href) > 1:
+                section_siblings = section_soup.section.previous_siblings
+                section = [sibling.text for sibling in section_siblings]
+                section = ''.join(section[::-1])[1:]
+            else:
+                section = section_soup.text[1:]
+
             url = section_found['href']
+            name = section_found.parent.text[:-1]

+            # If text is too long, split into chunks of equal sizes
             if len(section) > max_section_length:
-                s, u = get_all_subsections(section_soup, level + 1)
-                sections.extend(s)
-                urls.extend(u)
+                n_chunks = math.ceil(len(section) / float(max_section_length))
+                separator_index = math.floor(len(section) / n_chunks)
+
+                section_chunks = [section[separator_index * i: separator_index * (i + 1)] for i in range(n_chunks)]
+                url_chunks = [url] * n_chunks
+                name_chunks = [name] * n_chunks
+
+                sections.extend(section_chunks)
+                urls.extend(url_chunks)
+                names.extend(name_chunks)
             else:
                 sections.append(section)
                 urls.append(url)
+                names.append(name)

-        return sections, urls
+        return sections, urls, names

     sections = []
     urls = []
+    names = []
     for file in files:
         filepath = os.path.join(root_dir, file)
         with open(filepath, "r") as file:
             source = file.read()

         soup = BeautifulSoup(source, "html.parser")
-        sections_file, urls_file = get_all_subsections(soup, 2)
+        sections_file, urls_file, names_file = get_all_subsections(soup)
         sections.extend(sections_file)

         urls_file = [BASE_URL + os.path.basename(file.name) + url for url in urls_file]
         urls.extend(urls_file)

-    return sections, urls
-
+        names.extend(names_file)

-def write_sections(filepath: str, sections: list[str]):
-    with open(filepath, "wb") as f:
-        pickle.dump(sections, f)
+    documents_df = pd.DataFrame.from_dict({
+        'name': names,
+        'url': urls,
+        'text': sections
+    })

+    return documents_df

-def read_sections(filepath: str) -> list[str]:
-    with open(filepath, "rb") as fp:
-        sections = pickle.load(fp)

-    return sections
+def write_documents(filepath: str, documents_df: pd.DataFrame):
+    documents_df.to_csv(filepath)


-def load_documents(fname: str) -> pd.DataFrame:
-    df = pd.DataFrame()
-
-    with open(fname, "rb") as fp:
-        documents = pickle.load(fp)
-    df["documents"] = documents
-    return df
+def read_documents(filepath: str) -> pd.DataFrame:
+    return pd.read_csv(filepath)


 def compute_n_tokens(df: pd.DataFrame) -> pd.DataFrame:
@@ -97,7 +110,7 @@ def precompute_embeddings(df: pd.DataFrame) -> pd.DataFrame:

 def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:
     # Get all documents and precompute their embeddings
-    df = load_documents(filepath)
+    df = read_documents(filepath)['text']
     df = compute_n_tokens(df)
     df = precompute_embeddings(df)
     df.to_csv(output_csv)
@@ -106,14 +119,15 @@ def generate_embeddings(filepath: str, output_csv: str) -> pd.DataFrame:

 if __name__ == "__main__":
     root_dir = "/home/hadrien/perso/mila-docs/output/"
-    save_filepath = os.path.join(root_dir, "sections.pkl")
+    save_filepath = os.path.join(root_dir, "documents.csv")

     # How to write
-    sections = get_all_sections(root_dir)
-    write_sections(save_filepath, sections)
+    documents_df = get_all_documents(root_dir)
+    write_documents(save_filepath, documents_df)

     # How to load
-    sections = read_sections(save_filepath)
+    documents_df = read_documents(save_filepath)

-    # precopmute the document embeddings
+    # precompute the document embeddings
     df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")
+
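
The new `get_all_subsections` splits any section longer than `max_section_length` into roughly equal chunks instead of recursing into sub-headings. A minimal standalone sketch of that arithmetic (the `split_section` helper name is hypothetical, not part of the commit):

import math

def split_section(section: str, max_section_length: int = 3000) -> list[str]:
    # Same arithmetic as the diff above: pick the number of chunks,
    # then slice the text into pieces of that size.
    n_chunks = math.ceil(len(section) / float(max_section_length))
    separator_index = math.floor(len(section) / n_chunks)
    return [section[separator_index * i: separator_index * (i + 1)] for i in range(n_chunks)]

# A 7000-character section with the default limit yields 3 chunks of 2333 characters each.
print([len(chunk) for chunk in split_section("x" * 7000)])  # [2333, 2333, 2333]
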
requirements.txt CHANGED
@@ -1,8 +1,5 @@
 bs4
 numpy
-<<<<<<< HEAD
 tiktoken
-=======
 openai
 pandas
->>>>>>> fe2ece9 (parsing urls)
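
The commit replaces the pickled list of sections with a CSV holding one row per document ('name', 'url', 'text'). A minimal round-trip sketch of that format, using made-up example values and a throwaway path:

import pandas as pd

# Example rows only; the real frame comes from get_all_documents().
documents_df = pd.DataFrame.from_dict({
    'name': ['Quick Start'],
    'url': ['https://docs.mila.quebec/Userguide.html#quick-start'],
    'text': ['Users can connect to the cluster ...'],
})

documents_df.to_csv("documents.csv")   # what write_documents() does
loaded = pd.read_csv("documents.csv")  # what read_documents() does
print(loaded[['name', 'url', 'text']])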