formatting

Files changed:
- .gitignore +137 -0
- buster/chatbot.py +1 -2
- buster/docparser.py +6 -12
.gitignore
ADDED
@@ -0,0 +1,137 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+albenchmark/data/
+
+# Ignore notebooks by default
+*.ipynb
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# VSCode
+.vscode/
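This appears to be the stock GitHub .gitignore template for Python, with two project-specific additions near the top: the albenchmark/data/ directory and a blanket *.ipynb rule that keeps notebooks out of version control by default.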
buster/chatbot.py
CHANGED
@@ -1,12 +1,11 @@
 import logging
-import pickle
 
 import numpy as np
 import openai
 import pandas as pd
-from buster.docparser import EMBEDDING_MODEL
 from openai.embeddings_utils import cosine_similarity, get_embedding
 
+from buster.docparser import EMBEDDING_MODEL
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
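The chatbot.py hunk drops the (apparently unused) pickle import and regroups the remaining imports isort-style: standard library first, then third-party packages, then the first-party buster import in its own block.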
buster/docparser.py
CHANGED
@@ -7,7 +7,6 @@ import tiktoken
 from bs4 import BeautifulSoup
 from openai.embeddings_utils import get_embedding
 
-
 EMBEDDING_MODEL = "text-embedding-ada-002"
 EMBEDDING_ENCODING = "cl100k_base"  # this the encoding for text-embedding-ada-002
 
@@ -24,24 +23,24 @@ def get_all_documents(root_dir: str, max_section_length: int = 3000) -> pd.DataFrame:
     files = glob.glob("*.html", root_dir=root_dir)
 
     def get_all_subsections(soup: BeautifulSoup) -> tuple[list[str], list[str], list[str]]:
-        found = soup.find_all(
+        found = soup.find_all("a", href=True, class_="headerlink")
 
         sections = []
         urls = []
         names = []
         for section_found in found:
             section_soup = section_found.parent.parent
-            section_href = section_soup.find_all(
+            section_href = section_soup.find_all("a", href=True, class_="headerlink")
 
             # If sections has subsections, keep only the part before the first subsection
             if len(section_href) > 1:
                 section_siblings = section_soup.section.previous_siblings
                 section = [sibling.text for sibling in section_siblings]
-                section =
+                section = "".join(section[::-1])[1:]
             else:
                 section = section_soup.text[1:]
 
-            url = section_found[
+            url = section_found["href"]
             name = section_found.parent.text[:-1]
 
             # If text is too long, split into chunks of equal sizes
@@ -49,7 +48,7 @@ def get_all_documents(root_dir: str, max_section_length: int = 3000) -> pd.DataFrame:
             n_chunks = math.ceil(len(section) / float(max_section_length))
             separator_index = math.floor(len(section) / n_chunks)
 
-            section_chunks = [section[separator_index * i: separator_index * (i + 1)] for i in range(n_chunks)]
+            section_chunks = [section[separator_index * i : separator_index * (i + 1)] for i in range(n_chunks)]
             url_chunks = [url] * n_chunks
             name_chunks = [name] * n_chunks
 
@@ -80,11 +79,7 @@ def get_all_documents(root_dir: str, max_section_length: int = 3000) -> pd.DataFrame:
 
         names.extend(names_file)
 
-    documents_df = pd.DataFrame.from_dict({
-        'name': names,
-        'url': urls,
-        'text': sections
-    })
+    documents_df = pd.DataFrame.from_dict({"name": names, "url": urls, "text": sections})
 
     return documents_df
 
@@ -130,4 +125,3 @@ if __name__ == "__main__":
 
     # precompute the document embeddings
     df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")
-
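Most of the docparser.py hunks are mechanical black-style reformatting: spaces added around the slice colon in section_chunks, and the pd.DataFrame.from_dict call collapsed onto one line with double-quoted keys. The one non-obvious line in get_all_subsections is section = "".join(section[::-1])[1:]. A minimal sketch with hypothetical Sphinx-like markup shows why the reversal is needed: previous_siblings walks backwards from the first nested <section>, so the text fragments arrive in reverse document order.

from bs4 import BeautifulSoup

# Hypothetical markup: section body text followed by a nested subsection.
html = "<section><p>alpha </p><p>beta </p><section><p>sub</p></section></section>"
soup = BeautifulSoup(html, "html.parser")

outer = soup.section
# previous_siblings iterates backwards from the first nested <section>.
section = [sibling.text for sibling in outer.section.previous_siblings]
print(section)                 # ['beta ', 'alpha '] -- reverse document order
print("".join(section[::-1]))  # 'alpha beta ' -- restored to document order

The trailing [1:] in the committed line, like section_soup.text[1:] in the else branch, strips one leading character, presumably the newline that opens each section in the parsed HTML. As for the chunking context lines: a 7000-character section with max_section_length=3000 gives n_chunks = ceil(7000/3000) = 3 and separator_index = floor(7000/3) = 2333, so the slices cover [0:2333], [2333:4666], and [4666:6999]; this behavior predates the commit, which only changes the spacing.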