hbertrand committed
Commit eec81fa
1 Parent(s): 05dabf4

formatting

Files changed (3):
  1. .gitignore +137 -0
  2. buster/chatbot.py +1 -2
  3. buster/docparser.py +6 -12
.gitignore ADDED
@@ -0,0 +1,137 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+albenchmark/data/
+
+# Ignore notebooks by default
+*.ipynb
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# VSCode
+.vscode/
buster/chatbot.py CHANGED
@@ -1,12 +1,11 @@
 import logging
-import pickle

 import numpy as np
 import openai
 import pandas as pd
-from buster.docparser import EMBEDDING_MODEL
 from openai.embeddings_utils import cosine_similarity, get_embedding

+from buster.docparser import EMBEDDING_MODEL

 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
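Note: the chatbot.py hunk drops `import pickle` and moves the first-party import `from buster.docparser import EMBEDDING_MODEL` into its own block after the third-party imports, consistent with isort-style grouping. For orientation, here is a minimal sketch of how these imports typically combine in a retrieval step. This is an assumed illustration, not code from the commit: `rank_documents` and the DataFrame columns are hypothetical.

# Hypothetical sketch only: the diff shows chatbot.py's imports, not its body.
# Assumed: documents live in a pandas DataFrame with an "embedding" column.
import numpy as np
import pandas as pd
from openai.embeddings_utils import cosine_similarity, get_embedding

from buster.docparser import EMBEDDING_MODEL


def rank_documents(df: pd.DataFrame, query: str) -> pd.DataFrame:
    # Embed the query with the same model used for the documents, then
    # sort the documents by cosine similarity to the query embedding.
    query_embedding = get_embedding(query, engine=EMBEDDING_MODEL)
    df = df.copy()
    df["similarity"] = df["embedding"].apply(
        lambda emb: cosine_similarity(np.asarray(emb), query_embedding)
    )
    return df.sort_values("similarity", ascending=False)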
buster/docparser.py CHANGED
@@ -7,7 +7,6 @@ import tiktoken
 from bs4 import BeautifulSoup
 from openai.embeddings_utils import get_embedding

-
 EMBEDDING_MODEL = "text-embedding-ada-002"
 EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-ada-002

@@ -24,24 +23,24 @@ def get_all_documents(root_dir: str, max_section_length: int = 3000) -> pd.DataF
     files = glob.glob("*.html", root_dir=root_dir)

     def get_all_subsections(soup: BeautifulSoup) -> tuple[list[str], list[str], list[str]]:
-        found = soup.find_all('a', href=True, class_="headerlink")
+        found = soup.find_all("a", href=True, class_="headerlink")

         sections = []
         urls = []
         names = []
         for section_found in found:
             section_soup = section_found.parent.parent
-            section_href = section_soup.find_all('a', href=True, class_="headerlink")
+            section_href = section_soup.find_all("a", href=True, class_="headerlink")

             # If the section has subsections, keep only the part before the first subsection
             if len(section_href) > 1:
                 section_siblings = section_soup.section.previous_siblings
                 section = [sibling.text for sibling in section_siblings]
-                section = ''.join(section[::-1])[1:]
+                section = "".join(section[::-1])[1:]
             else:
                 section = section_soup.text[1:]

-            url = section_found['href']
+            url = section_found["href"]
             name = section_found.parent.text[:-1]

             # If text is too long, split into chunks of equal size
@@ -49,7 +48,7 @@ def get_all_documents(root_dir: str, max_section_length: int = 3000) -> pd.DataF
             n_chunks = math.ceil(len(section) / float(max_section_length))
             separator_index = math.floor(len(section) / n_chunks)

-            section_chunks = [section[separator_index * i: separator_index * (i + 1)] for i in range(n_chunks)]
+            section_chunks = [section[separator_index * i : separator_index * (i + 1)] for i in range(n_chunks)]
             url_chunks = [url] * n_chunks
             name_chunks = [name] * n_chunks

@@ -80,11 +79,7 @@ def get_all_documents(root_dir: str, max_section_length: int = 3000) -> pd.DataF

         names.extend(names_file)

-    documents_df = pd.DataFrame.from_dict({
-        'name': names,
-        'url': urls,
-        'text': sections
-    })
+    documents_df = pd.DataFrame.from_dict({"name": names, "url": urls, "text": sections})

     return documents_df

@@ -130,4 +125,3 @@ if __name__ == "__main__":

     # precompute the document embeddings
     df = generate_embeddings(filepath=save_filepath, output_csv="data/document_embeddings.csv")
-
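The quote-style hunks in get_all_subsections all touch the same pattern: Sphinx-built HTML attaches an <a class="headerlink"> anchor to each section heading, so collecting those anchors enumerates a page's sections. A self-contained sketch of that lookup, with made-up HTML for illustration:

# Illustration of the headerlink pattern the parser relies on; the HTML
# below is invented, but it mirrors what Sphinx generates for headings.
from bs4 import BeautifulSoup

html = """
<section id="installation">
  <h2>Installation<a class="headerlink" href="#installation">¶</a></h2>
  <p>Some body text.</p>
</section>
"""

soup = BeautifulSoup(html, "html.parser")
for link in soup.find_all("a", href=True, class_="headerlink"):
    name = link.parent.text[:-1]  # heading text, minus the trailing pilcrow
    url = link["href"]            # in-page anchor, e.g. "#installation"
    print(name, url)              # -> Installation #installation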
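The slice hunk (`separator_index * i : separator_index * (i + 1)`) is only Black's spacing, but the chunking arithmetic it reformats is worth spelling out: a section longer than max_section_length is cut into the smallest number of equal-width pieces that fit. A standalone sketch of the same logic, extracted for illustration (not part of the commit):

import math


def split_section(section: str, max_section_length: int = 3000) -> list[str]:
    # Same arithmetic as the hunk above: pick the smallest chunk count that
    # keeps each piece under max_section_length, then slice at equal offsets.
    if len(section) <= max_section_length:
        return [section]
    n_chunks = math.ceil(len(section) / float(max_section_length))
    separator_index = math.floor(len(section) / n_chunks)
    # Note: because separator_index is floored, the last slice ends at
    # separator_index * n_chunks, which can fall up to n_chunks - 1
    # characters short of len(section).
    return [section[separator_index * i : separator_index * (i + 1)] for i in range(n_chunks)]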