caldervf committed on
Commit 74c716c
1 Parent(s): 8af67d3

Adding files from GitHub repository.

Files changed (50)
  1. .envrc +15 -0
  2. .gitattributes +3 -0
  3. .github/workflows/code-linting.yml +22 -0
  4. .gitignore +168 -0
  5. .isort.cfg +5 -0
  6. .pre-commit-config.yaml +57 -0
  7. .python-version +1 -0
  8. .tmuxgo +32 -0
  9. Dockerfile +104 -0
  10. LICENSE +21 -0
  11. LICENSE.rst +21 -0
  12. Makefile +485 -0
  13. README.md +282 -11
  14. data/cicero_faiss_index.faiss +3 -0
  15. data/clean_dataset.csv +3 -0
  16. data/raw_dataset.csv +3 -0
  17. docker/aliases.sh +20 -0
  18. docker/docker-compose.yaml +71 -0
  19. pyproject.toml +34 -0
  20. requirements-deploy.txt +1 -0
  21. requirements-dev.txt +12 -0
  22. requirements.txt +14 -0
  23. src/.DS_Store +0 -0
  24. src/api/__init__.py +21 -0
  25. src/api/index.py +182 -0
  26. src/app_service/__init__.py +21 -0
  27. src/app_service/app.py +167 -0
  28. src/classes/__init__.py +21 -0
  29. src/classes/__pycache__/__init__.cpython-39.pyc +0 -0
  30. src/classes/__pycache__/hugging_face_utils.cpython-39.pyc +0 -0
  31. src/classes/__pycache__/semantic_search_engine.cpython-39.pyc +0 -0
  32. src/classes/data_preparation.py +403 -0
  33. src/classes/hugging_face_utils.py +223 -0
  34. src/classes/semantic_search_engine.py +249 -0
  35. src/data_processing/__init__.py +21 -0
  36. src/data_processing/prepare_dataset.py +196 -0
  37. src/focused_summary_example.py +20 -0
  38. src/training/__init__.py +21 -0
  39. src/training/create_faiss_corpus_index.py +209 -0
  40. src/utils/.DS_Store +0 -0
  41. src/utils/__init__.py +21 -0
  42. src/utils/__pycache__/__init__.cpython-39.pyc +0 -0
  43. src/utils/__pycache__/default_variables.cpython-39.pyc +0 -0
  44. src/utils/default_variables.py +76 -0
  45. src/utils/general_utilities.py +181 -0
  46. src/utils/gpt35_summaries/__init__.py +0 -0
  47. src/utils/gpt35_summaries/cleanup_and_summarize.py +107 -0
  48. src/utils/gpt35_summaries/html_tags.txt +109 -0
  49. src/utils/gpt35_summaries/summarizer.py +157 -0
  50. template.envrc +15 -0
.envrc ADDED
@@ -0,0 +1,15 @@
+ # -------------------- Defining default environment ---------------------------
+
+ # --- Docker BuildKit
+ export DOCKER_BUILDKIT_VALUE=1
+
+ # --- Project variables
+ export INPUT_APP_PORT=8501
+ export OUTPUT_APP_PORT=8501
+ export APP_SERVER_PORT=7860
+
+ export HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
+ export HUGGING_FACE_USERNAME=${HUGGING_FACE_USERNAME}
+
+ export PATH="${PWD}:${PATH}"
+ export PYTHONPATH="${PWD}:${PYTHONPATH}"
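The variables above only take effect once `direnv` approves the file; a minimal sketch of the approval step, assuming `direnv` is hooked into the shell:

```bash
# Approve the freshly created .envrc so direnv exports its variables
direnv allow .

# Verify that the variables reached the current shell
echo "${INPUT_APP_PORT} ${APP_SERVER_PORT}"   # expected: 8501 7860
```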
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/cicero_faiss_index.faiss filter=lfs diff=lfs merge=lfs -text
+ data/clean_dataset.csv filter=lfs diff=lfs merge=lfs -text
+ data/raw_dataset.csv filter=lfs diff=lfs merge=lfs -text
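The three new attribute lines route the large data artifacts through Git LFS; they are the kind of entries `git lfs track` generates. A sketch of the equivalent workflow, assuming Git LFS is installed:

```bash
# Equivalent to the three attribute lines added above;
# `git lfs track` appends matching patterns to .gitattributes
git lfs track "data/cicero_faiss_index.faiss" "data/clean_dataset.csv" "data/raw_dataset.csv"

# After cloning, fetch the actual file contents behind the LFS pointers
git lfs pull
```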
.github/workflows/code-linting.yml ADDED
@@ -0,0 +1,22 @@
+ name: Project CICD
+ run-name: ${{ github.actor }} - CICD
+ on: [push]
+
+ jobs:
+   #
+   # --- Code-linting
+   lint-code:
+     runs-on: ubuntu-latest
+     steps:
+       # Checkout repository
+       - uses: actions/checkout@v3
+       # Install Python
+       - uses: actions/setup-python@v4
+         with:
+           python-version: "3.9"
+       # Install python dependencies
+       - name: Install dependencies
+         run: |
+           make requirements
+           make pre-commit-install
+           make lint
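The workflow only invokes Make targets that also exist locally, so a failing lint job can be reproduced before pushing; a sketch, assuming a Python 3.9 environment is active:

```bash
# Reproduce the CI lint job locally (same targets the workflow runs)
make requirements
make pre-commit-install
make lint
```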
.gitignore ADDED
@@ -0,0 +1,168 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ .envrc
+ .python-version
+ data/
+
+ src/utils/gpt35_summaries/df_embed.csv
+ src/utils/gpt35_summaries/df_embed_out2.csv
+ src/utils/gpt35_summaries/words_alpha.txt
.isort.cfg ADDED
@@ -0,0 +1,5 @@
+ [settings]
+ line_length = 79
+ multi_line_output = 3
+ include_trailing_comma = True
+ known_third_party = datasets,fastapi,gradio,huggingface_hub,numpy,pandas,pydantic,requests,sentence_transformers,spacy,tiktoken,torch,utils
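The `known_third_party` list is generated by the `seed-isort-config` hook configured in `.pre-commit-config.yaml` below, rather than maintained by hand; a sketch of refreshing it after adding a dependency, assuming `pre-commit` is installed:

```bash
# Regenerate the known_third_party list; runs only that hook across all files
pre-commit run seed-isort-config --all-files
```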
.pre-commit-config.yaml ADDED
@@ -0,0 +1,57 @@
+ # Created       : 2023-03-01
+ # Last Modified : 2023-03-04
+ #
+ # Description
+ # This file summarizes the set of checks that pre-commit will perform
+ # prior to any commit.
+
+ default_stages: [commit, manual]
+
+ # Repositories to use
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v3.4.0
+     hooks:
+       - id: trailing-whitespace
+       - id: end-of-file-fixer
+       - id: check-yaml # Checks yaml files for parseable syntax.
+       - id: check-json # Checks json files for parseable syntax.
+       - id: check-added-large-files
+       - id: check-toml
+       - id: check-docstring-first
+       - id: check-case-conflict # Check for files that would conflict in case-insensitive filesystems
+       - id: check-merge-conflict # Check for files that contain merge conflict strings.
+       - id: debug-statements # Check for debugger imports and py37+ `breakpoint()` calls in python source.
+   - repo: https://github.com/pycqa/flake8
+     rev: 5.0.4
+     hooks:
+       - id: flake8
+         language_version: python3.9
+         exclude: >
+           (?x)^(
+               src/focused_summary_example.py
+           )
+
+   - repo: https://github.com/ambv/black
+     rev: 22.3.0
+     hooks:
+       - id: black
+         language_version: python3.9
+
+   - repo: https://github.com/asottile/seed-isort-config
+     rev: v2.2.0
+     hooks:
+       - id: seed-isort-config
+
+   - repo: https://github.com/pycqa/isort
+     rev: 5.11.5
+     hooks:
+       - id: isort
+         name: isort (python)
+         exclude: hooks.py
+       - id: isort
+         name: isort (cython)
+         types: [cython]
+       - id: isort
+         name: isort (pyi)
+         types: [pyi]
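Because `default_stages` includes `manual`, the full suite can be invoked outside of a commit; this is exactly what the `make lint` target in the Makefile below wraps:

```bash
# Run every configured hook against the whole repository,
# including hooks gated behind the `manual` stage
pre-commit run -a --hook-stage manual
```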
.python-version ADDED
@@ -0,0 +1 @@
+ Cicero_LLM_Synthesizer
.tmuxgo ADDED
@@ -0,0 +1,32 @@
+ #!/bin/bash
+ SESSION=`basename $PWD`
+ ENVNAME='ml'
+
+ # Creating new session
+ tmux -2 new-session -d -s $SESSION `cd $PWD`
+
+ # ------ Main window ------
+ # Renaming window
+ tmux rename-window -t $SESSION:0 main
+ # Splitting panes and windows
+ tmux split-window -v
+ tmux select-pane -t 0
+ tmux resize-pane -D 15
+ tmux select-pane -t 1
+ # Sending commands
+ tmux send -t $SESSION:0.1 "cd $PWD; conda activate $ENVNAME; source $SHELL; clear; htop" ENTER
+ #
+ # ------ Miscellaneous window ------
+ tmux new-window -t $SESSION:1 -n 'misc'
+ tmux send -t $SESSION:1.0 "cd $PWD; conda activate $ENVNAME; source $SHELL; clear;" ENTER
+ # ------ Extras window ------
+ tmux new-window -t $SESSION:2 -n 'extras'
+ tmux send -t $SESSION:2.0 "cd $PWD; conda activate $ENVNAME; source $SHELL; clear;" ENTER
+ # ------ Jupyter window ------
+ tmux new-window -t $SESSION:3 -n 'jupyter'
+ tmux send -t $SESSION:3.0 "cd $PWD; conda activate $ENVNAME; source $SHELL; clear;" ENTER
+ #
+ # Selecting which window to start at
+ tmux select-window -t $SESSION:0
+ tmux select-pane -t 0
+ tmux -2 attach -t $SESSION
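The script names the session after the current directory and assumes a conda environment called `ml` exists; a sketch of a typical invocation from the project root:

```bash
# Make the script executable once, then spin up the four-window session
chmod +x .tmuxgo
./.tmuxgo

# Detach with Ctrl-b d; reattach later by session name
tmux attach -t "$(basename "$PWD")"
```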
Dockerfile ADDED
@@ -0,0 +1,104 @@
+ ARG PYTHON_VERSION="3.9.13"
+ ARG PLATFORM_NAME="linux/amd64"
+
+ FROM --platform=${PLATFORM_NAME} python:${PYTHON_VERSION}
+
+ # --- SYSTEM ARCHITECTURE
+ ARG TARGETPLATFORM
+ ARG TARGETARCH
+ ARG TARGETVARIANT
+
+ RUN printf "I'm building for TARGETPLATFORM=${TARGETPLATFORM}" \
+     && printf ", TARGETARCH=${TARGETARCH}" \
+     && printf ", TARGETVARIANT=${TARGETVARIANT} \n" \
+     && printf "With uname -s : " && uname -s \
+     && printf "and uname -m : " && uname -m
+
+ # --- Environment variables
+ ENV REQUIREMENTS_FILE="requirements.txt"
+ ENV OUTDIR="/root"
+ ENV PROJECT_DIR="/opt/ml"
+ ENV PROGRAM_DIR="/opt/program"
+ ENV HOME_DIR="/root/ml"
+ ENV LOCAL_DEV_DIR="docker"
+ ENV ALIASES_FILE="/root/aliases.sh"
+ ENV DEBIAN_FRONTEND=noninteractive
+
+ # --- Dockerfile Metadata
+ LABEL Maintainer="Victor Calderon"
+
+ # ------------------------- COPYING AND DIRECTORIES ---------------------------
+
+ RUN mkdir -p ${HOME_DIR}
+
+ COPY ./src ${PROJECT_DIR}/src
+ COPY ${LOCAL_DEV_DIR}/aliases.sh ${ALIASES_FILE}
+
+ COPY ${REQUIREMENTS_FILE} "${HOME_DIR}/${REQUIREMENTS_FILE}"
+
+ # ---------------------- EXPOSING PORTS FOR APP -------------------------------
+
+ EXPOSE 7860
+ EXPOSE 8501
+
+ # --------------------- INSTALLING EXTRA PACKAGES -----------------------------
+ # --- Updating packages and installing packages at the system-level
+
+ RUN apt-get -y update && \
+     apt-get upgrade -y && \
+     apt-get clean && \
+     # Installing system-level packages
+     apt-get install -y \
+         git \
+         ssh \
+         tree \
+         git-flow \
+         tmux \
+         direnv \
+         bash-completion \
+         zsh \
+         htop \
+         vim \
+     && \
+     # Cleaning out
+     rm -rf /var/lib/apt/lists/* && \
+     # Cleaning installs
+     apt-get clean && \
+     # Installing ZSH and Oh My Zsh
+     sh -c "$(curl -fsSL https://raw.github.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" && \
+     echo "source /etc/profile.d/bash_completion.sh" >> /root/.bashrc && \
+     echo "source /etc/profile.d/bash_completion.sh" >> /root/.zshrc && \
+     echo "source /root/aliases.sh" >> "${OUTDIR}/.zshrc" && \
+     echo "source /root/aliases.sh" >> "${OUTDIR}/.bashrc" && \
+     # Install direnv
+     echo 'eval "$(direnv hook zsh)"' >> "${OUTDIR}/.zshrc" && \
+     echo 'eval "$(direnv hook bash)"' >> "${OUTDIR}/.bashrc"
+
+ # -------------------------- DOCKER-SPECIFIC ----------------------------------
+
+ RUN apt-get update -y && \
+     cd ${OUTDIR} && \
+     curl -fsSL https://get.docker.com -o get-docker.sh && sh get-docker.sh
+
+ # --------------------------- PYTHON-RELATED-LOCAL ----------------------------
+
+ RUN pip install --upgrade pip && \
+     python -m pip install -r "${HOME_DIR}/${REQUIREMENTS_FILE}"
+
+ # ----------------------------- PYTHON-SPECIFIC -------------------------------
+
+ # Set some environment variables. PYTHONUNBUFFERED keeps Python from
+ # buffering our standard output stream, which means that logs can be
+ # delivered to the user quickly. PYTHONDONTWRITEBYTECODE keeps Python
+ # from writing the .pyc files which are unnecessary in this case. We also
+ # update PATH so that the train and serve programs are found when the
+ # container is invoked.
+
+ ENV PYTHONUNBUFFERED=TRUE
+ ENV PYTHONDONTWRITEBYTECODE=TRUE
+ ENV PATH="${PROGRAM_DIR}:${PATH}"
+ ENV PYTHONPATH="${PROGRAM_DIR}:${PYTHONPATH}"
+
+ WORKDIR ${PROJECT_DIR}
+
+ CMD ["uvicorn", "src.api.index:app", "--host", "0.0.0.0", "--port", "7860"]
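The image is normally built through the compose file under `docker/` (see below), but a direct build is a useful sanity check. A minimal sketch from the repository root; the `cicero-synthesizer:dev` tag is an arbitrary example, not a name used by the project:

```bash
# Build with BuildKit, matching DOCKER_BUILDKIT_VALUE=1 from .envrc
DOCKER_BUILDKIT=1 docker build --platform linux/amd64 -t cicero-synthesizer:dev .

# Run the image's default CMD (uvicorn on port 7860); secrets such as
# HUGGING_FACE_HUB_TOKEN may still need to be passed with -e
docker run --rm -p 7860:7860 -e HUGGING_FACE_HUB_TOKEN cicero-synthesizer:dev
```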
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Maven-Building-LLMS-into-Production
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
LICENSE.rst ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Victor Calderon
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
Makefile ADDED
@@ -0,0 +1,485 @@
+ .PHONY: show-params
+
+ ###############################################################################
+ #                                   GLOBALS                                   #
+ ###############################################################################
+
+ PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+ PROJECT_NAME := $(shell basename $(subst -,_,$(PROJECT_DIR)))
+ PROJECT_NAME_LOWER := $(shell echo $(PROJECT_NAME) | tr '[:upper:]' '[:lower:]')
+ ENVIRONMENT_NAME = $(PROJECT_NAME)
+ PYTHON_INTERPRETER = python3
+ PIP_INTERPRETER = pip
+ PYTHON_VERSION = 3.9
+ PIP_VERSION = 22.3
+
+ # --- REQUIREMENTS-RELATED
+ REQUIREMENTS_FILE = $(PROJECT_DIR)/requirements.txt
+ REQUIREMENTS_FILE_TEMP = $(PROJECT_DIR)/requirements.tmp
+ REQUIREMENTS_DEV_FILE = $(PROJECT_DIR)/requirements-dev.txt
+ REQUIREMENTS_DEV_FILE_TEMP = $(PROJECT_DIR)/requirements-dev.tmp
+ REQUIREMENTS_DEPLOYMENT_FILE = $(PROJECT_DIR)/requirements-deploy.txt
+ REQUIREMENTS_DEPLOYMENT_FILE_TEMP = $(PROJECT_DIR)/requirements-deploy.tmp
+
+ # --- PATHS TO PROJECT DIRECTORIES
+ DATA_DIRECTORY = $(PROJECT_DIR)/data
+ SRC_DIRECTORY = $(PROJECT_DIR)/src
+ API_DIRECTORY = $(SRC_DIRECTORY)/api
+ DATA_PROCESSING_DIRECTORY = $(SRC_DIRECTORY)/data_processing
+ TRAINING_DIRECTORY = $(SRC_DIRECTORY)/training
+
+ # -- Docker-related
+ # Variable used for turning on/off Docker Buildkit
+ DOCKER_BUILDKIT_VALUE=1
+ LOCAL_DEVELOPMENT_DIR_PATH="$(PROJECT_DIR)/docker"
+
+ # -- API-related
+ INPUT_APP_PORT=8501
+ OUTPUT_APP_PORT=8501
+ API_WEBSERVER_URL="http://localhost:$(INPUT_APP_PORT)"
+
+ # -- App-related
+ APP_SERVER_PORT=7860
+ APP_WEBSERVER_URL="http://localhost:$(APP_SERVER_PORT)"
+
+ # ----------------------------- Python-specific -------------------------------
+ # - Checking what type of python one is using
+ # Anaconda
+ ifeq (,$(shell which conda))
+ HAS_CONDA=False
+ else
+ HAS_CONDA=True
+ # We need to specify the following commands in order to properly activate the
+ # Anaconda environment.
+ SHELL=/bin/bash
+ # Note that the extra activate is needed to ensure that the activate floats env to the front of PATH
+ CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate ; conda activate
+ CONDA_DEACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda deactivate ; conda deactivate
+ endif
+
+ # - Pyenv
+ ifeq (,$(shell which pyenv))
+ HAS_PYENV=False
+ else
+ HAS_PYENV=True
+ endif
+
+ ###############################################################################
+ #                          VARIABLES FOR COMMANDS                             #
+ ###############################################################################
+
+ ## Show the set of input parameters
+ show-params:
+ 	@ printf "\n-------- GENERAL ---------------\n"
+ 	@ echo "PROJECT_DIR: $(PROJECT_DIR)"
+ 	@ echo "PROJECT_NAME: $(PROJECT_NAME)"
+ 	@ echo "LOCAL_DEVELOPMENT_DIR_PATH: $(LOCAL_DEVELOPMENT_DIR_PATH)"
+ 	@ echo "ENVIRONMENT_NAME: $(ENVIRONMENT_NAME)"
+ 	@ echo "PYTHON_INTERPRETER: $(PYTHON_INTERPRETER)"
+ 	@ echo "PYTHON_VERSION: $(PYTHON_VERSION)"
+ 	@ echo "PIP_VERSION: $(PIP_VERSION)"
+ 	@ echo "REQUIREMENTS_FILE: $(REQUIREMENTS_FILE)"
+ 	@ echo "REQUIREMENTS_FILE_TEMP: $(REQUIREMENTS_FILE_TEMP)"
+ 	@ echo "REQUIREMENTS_DEV_FILE: $(REQUIREMENTS_DEV_FILE)"
+ 	@ echo "REQUIREMENTS_DEV_FILE_TEMP: $(REQUIREMENTS_DEV_FILE_TEMP)"
+ 	@ echo "REQUIREMENTS_DEPLOYMENT_FILE: $(REQUIREMENTS_DEPLOYMENT_FILE)"
+ 	@ echo "REQUIREMENTS_DEPLOYMENT_FILE_TEMP: $(REQUIREMENTS_DEPLOYMENT_FILE_TEMP)"
+ 	@ printf "\n-------- DOCKER ---------------\n"
+ 	@ echo "DOCKER_BUILDKIT_VALUE: $(DOCKER_BUILDKIT_VALUE)"
+ 	@ printf "\n-------- PYTHON ---------------\n"
+ 	@ echo "HAS_CONDA: $(HAS_CONDA)"
+ 	@ echo "HAS_PYENV: $(HAS_PYENV)"
+ 	@ printf "\n-------- LOCAL DEVELOPMENT ---------------\n"
+ 	@ echo "LOCAL_DEV_DOCKER_PROJECT_NAME: $(LOCAL_DEV_DOCKER_PROJECT_NAME)"
+ 	@ echo "LOCAL_DEV_SERVICE_NAME: $(LOCAL_DEV_SERVICE_NAME)"
+ 	@ printf "\n-------- API ---------------\n"
+ 	@ echo "APP_PORT: $(APP_PORT)"
+ 	@ echo "APP_WEBSERVER_URL: $(APP_WEBSERVER_URL)"
+ 	@ echo "API_SERVICE_NAME: $(API_SERVICE_NAME)"
+ 	@ echo "API_DOCKER_PROJECT_NAME: $(API_DOCKER_PROJECT_NAME)"
+ 	@ printf "\n-----------------------\n"
+
+ ## Initialize the repository for code development
+ init: clean create-envrc delete-environment create-environment
+ ifeq (True,$(HAS_CONDA))
+ 	@ ($(CONDA_ACTIVATE) $(ENVIRONMENT_NAME) ; $(MAKE) requirements)
+ 	@ printf "\n\n>>> New Conda environment created. Activate with: \n\t: conda activate $(ENVIRONMENT_NAME)"
+ 	@ $(MAKE) show-params
+ 	@ printf "\n\n>>> Project initialized!"
+ 	@ ($(CONDA_ACTIVATE) $(ENVIRONMENT_NAME) ; $(MAKE) pre-commit-install )
+ 	@ ($(CONDA_ACTIVATE) $(ENVIRONMENT_NAME) ; $(MAKE) lint )
+ else
+ 	@ direnv allow || echo ""
+ 	@ echo ">>> Continuing installation ..."
+ 	@ $(MAKE) requirements
+ 	@ $(MAKE) show-params
+ 	@ printf "\n\n>>> Project initialized!\n"
+ 	@ $(MAKE) pre-commit-install
+ 	@ $(MAKE) lint
+ endif
+
+ ## Remove ALL of the artifacts + Python environments
+ destroy: clean pre-commit-uninstall delete-environment
+ 	@ echo ">>> Deleted all artifacts and environments!"
+
+ ###############################################################################
+ #                          MISCELLANEOUS COMMANDS                             #
+ ###############################################################################
+
+ # -------------------- Functions for cleaning repository ----------------------
+
+ ## Removes artifacts from the build stage, and other common Python artifacts.
+ clean: clean-build clean-pyc clean-test clean-secrets clean-model-files clean-images
+
+ ## Removes Python file artifacts
+ clean-pyc:
+ 	find . -name '*.pyc' -exec rm -f {} +
+ 	find . -name '*.pyo' -exec rm -f {} +
+ 	find . -name '*~' -exec rm -f {} +
+ 	find . -name '__pycache__' -exec rm -fr {} +
+
+ ## Remove build artifacts
+ clean-build:
+ 	rm -fr build/
+ 	rm -fr dist/
+ 	rm -fr .eggs/
+ 	find . -name '*.egg-info' -exec rm -fr {} +
+ 	find . -name '*.egg' -exec rm -f {} +
+
+ ## Remove test and coverage artifacts
+ clean-test:
+ 	rm -fr .tox/
+ 	rm -f .coverage
+ 	rm -fr htmlcov/
+ 	rm -fr .pytest_cache
+
+ ## Remove files related to pre-trained models
+ clean-model-files:
+ 	find . -name '*.pt' -exec rm -fr {} +
+ 	find . -name "runs" -type d -exec rm -rf {} + || echo ""
+
+ ## Clean left-over images
+ clean-images:
+ 	find . -name '*.png' -exec rm -fr {} +
+ 	find . -name '*.jpg' -exec rm -fr {} +
+
+ ## Removes secret artifacts - Serverless
+ clean-secrets:
+ 	find . -name "node_modules" -type d -exec rm -rf {} + || echo ""
+ 	find . -name ".serverless" -type d -exec rm -rf {} + || echo ""
+
+ # ---------------------- Functions for local environment ----------------------
+
+ ## Set up the envrc file for the project.
+ create-envrc:
+ 	@ echo "cat $(PROJECT_DIR)/template.envrc > $(PROJECT_DIR)/.envrc"
+ 	@ cat $(PROJECT_DIR)/template.envrc > $(PROJECT_DIR)/.envrc
+
+ ## Delete the local envrc file of the project
+ delete-envrc:
+ 	@ rm -rf $(PROJECT_DIR)/.envrc || echo ""
+
+ ## Install git-flow
+ git-flow-install:
+ 	@ (( if [[ ! -f "`which git-flow`" ]]; then \
+ 		echo "No Git-flow installed"! ; \
+ 		if [[ -f "`which brew`" ]]; then \
+ 			echo "Homebrew installed"; \
+ 			HOMEBREW_NO_AUTO_UPDATE=1 brew install git-flow; \
+ 		elif [[ -f "`which apt-get`" ]]; then \
+ 			echo "Apt-get installed"; \
+ 			apt-get install git-flow; \
+ 		else \
+ 			echo "Could not locate package manager! (brew or apt-get)"; \
+ 		fi; \
+ 	fi ) && git flow init -f -d) || echo "Git-Flow setup could not be completed"
+
+
+ # ---------------------- Functions for Python environment ---------------------
+
+ ## Creates the Python environment
+ create-environment:
+ ifeq (True,$(HAS_CONDA))
+ 	@ echo ">>> Detected CONDA ... Creating new conda environment!"
+ 	@ echo ">>> \tCreating environment: \t $(ENVIRONMENT_NAME)"
+ 	@ conda create --name $(ENVIRONMENT_NAME) python=$(PYTHON_VERSION) -y || echo ""
+ 	@ echo ">>> New conda environment created. Activate with: \n conda activate $(ENVIRONMENT_NAME)"
+ else ifeq (True,$(HAS_PYENV))
+ 	@ echo ">>> Detected PYENV ... Creating new Pyenv environment!"
+ 	@ echo ">>> \tCreating environment: \t $(ENVIRONMENT_NAME)"
+ 	@ pyenv virtualenv $(PYTHON_VERSION) $(ENVIRONMENT_NAME) || echo ""
+ 	@ pyenv local $(ENVIRONMENT_NAME)
+ 	@ echo ">>> New Pyenv environment created: '$(ENVIRONMENT_NAME)'"
+ 	@ pyenv virtualenvs
+ 	@ echo
+ endif
+
+ ## Deletes the Python environment
+ delete-environment:
+ ifeq (True,$(HAS_CONDA))
+ 	@ echo ">>> Detected CONDA ... Deleting Conda environment, if applicable!"
+ 	@ echo ">>> Deleting environment: '$(ENVIRONMENT_NAME)'"
+ 	@ ($(CONDA_DEACTIVATE) ; conda env remove --name $(ENVIRONMENT_NAME) -y) || echo ""
+ 	@ echo ">>> Conda environment deleted: '$(ENVIRONMENT_NAME)'"
+ else ifeq (True,$(HAS_PYENV))
+ 	@ echo ">>> Detected PYENV ... Deleting Pyenv environment!"
+ 	@ echo ">>> Deleting environment: '$(ENVIRONMENT_NAME)'"
+ 	@ pyenv uninstall -f $(ENVIRONMENT_NAME) || echo ""
+ 	@ rm $(PROJECT_DIR)/.python-version || echo ""
+ 	@ echo ">>> Pyenv environment deleted: '$(ENVIRONMENT_NAME)'"
+ 	@ pyenv virtualenvs
+ 	@ echo
+ endif
+
+ ## Upgrade the version of the 'pip' package
+ pip-upgrade:
+ 	@ $(PYTHON_INTERPRETER) -m pip install --no-cache-dir -q --upgrade pip==$(PIP_VERSION)
+
+ ## Sort the project packages requirements file
+ sort-requirements:
+ 	@ sort $(REQUIREMENTS_FILE) | grep "\S" > $(REQUIREMENTS_FILE_TEMP) && \
+ 		mv $(REQUIREMENTS_FILE_TEMP) $(REQUIREMENTS_FILE)
+ 	@ sort $(REQUIREMENTS_DEV_FILE) | grep "\S" > $(REQUIREMENTS_DEV_FILE_TEMP) && \
+ 		mv $(REQUIREMENTS_DEV_FILE_TEMP) $(REQUIREMENTS_DEV_FILE)
+ 	@ sort $(REQUIREMENTS_DEPLOYMENT_FILE) | grep "\S" > $(REQUIREMENTS_DEPLOYMENT_FILE_TEMP) && \
+ 		mv $(REQUIREMENTS_DEPLOYMENT_FILE_TEMP) $(REQUIREMENTS_DEPLOYMENT_FILE)
+
+
+ ## Install Python dependencies into the Python environment
+ requirements: pip-upgrade sort-requirements
+ 	@ $(PYTHON_INTERPRETER) -m pip install --no-cache-dir -q -r $(REQUIREMENTS_DEV_FILE)
+
+ # -------------------------- Functions for Code Linting -----------------------
+
+ ## Installing the pre-commit Git hook
+ pre-commit-install:
+ 	@ pre-commit install
+
+ ## Uninstall the pre-commit Git hook
+ pre-commit-uninstall:
+ 	@ pre-commit uninstall
+
+ ## Run the 'pre-commit' linting step manually
+ lint:
+ 	@ pre-commit run -a --hook-stage manual
+
+
+ ###############################################################################
+ #                  Docker Commands - Local development                        #
+ ###############################################################################
+
+ LOCAL_DEV_DOCKER_PROJECT_NAME="$(PROJECT_NAME_LOWER)_localdev_dind"
+ LOCAL_DEV_SERVICE_NAME="local-dev"
+
+ ## Clean Docker images
+ docker-prune:
+ 	@ docker system prune -f
+
+ ## Stops both the API service and the local development service
+ all-stop: api-stop docker-local-dev-stop app-app-stop
+ 	@ echo "All services are down"
+
+ ## Starts both the API service and the local development service
+ all-start: api-start docker-local-dev-start app-app-start
+ 	@ echo "All services are up!"
+
+ ## Build local development Docker image
+ docker-local-dev-build: docker-prune
+ 	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+ 		docker compose \
+ 		--project-name $(LOCAL_DEV_DOCKER_PROJECT_NAME) \
+ 		build $(LOCAL_DEV_SERVICE_NAME)
+
+ ## Start service for local development
+ docker-local-dev-start: docker-local-dev-build docker-local-dev-stop
+ 	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+ 		docker compose \
+ 		--project-name $(LOCAL_DEV_DOCKER_PROJECT_NAME) \
+ 		up -d $(LOCAL_DEV_SERVICE_NAME)
+
+ ## Stop service for local development
+ docker-local-dev-stop:
+ 	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+ 		docker compose \
+ 		--project-name $(LOCAL_DEV_DOCKER_PROJECT_NAME) \
+ 		stop $(LOCAL_DEV_SERVICE_NAME)
+ 	@ $(MAKE) docker-prune
+
+ ## Start a shell session into the docker container
+ docker-local-dev-login:
+ 	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+ 		docker compose \
+ 		--project-name $(LOCAL_DEV_DOCKER_PROJECT_NAME) \
+ 		exec \
+ 		$(LOCAL_DEV_SERVICE_NAME) /bin/zsh
+
+ ###############################################################################
+ #                      Docker Commands - API-related                          #
+ ###############################################################################
+
+ API_DOCKER_PROJECT_NAME="$(PROJECT_NAME_LOWER)_api"
+ API_SERVICE_NAME="api"
+
+ ## Build API Docker image
+ api-build: docker-prune
+ 	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+ 		docker compose \
+ 		--project-name $(API_DOCKER_PROJECT_NAME) \
+ 		build $(API_SERVICE_NAME)
+
+ ## Start API Docker image container
+ api-start: api-stop api-build
+ 	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+ 		docker compose \
+ 		--project-name $(API_DOCKER_PROJECT_NAME) \
+ 		up -d $(API_SERVICE_NAME)
+
+ ## Stop API Docker image container
+ api-stop:
+ 	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+ 		docker compose \
+ 		--project-name $(API_DOCKER_PROJECT_NAME) \
+ 		stop $(API_SERVICE_NAME)
+ 	@ $(MAKE) docker-prune
+
+ ## Open API in web browser
+ api-web:
+ 	@ python -m webbrowser "$(API_WEBSERVER_URL)/docs"
+
+ ###############################################################################
+ #                      Docker Commands - App-related                          #
+ ###############################################################################
+
+ APP_DOCKER_PROJECT_NAME="$(PROJECT_NAME_LOWER)_app"
+ APP_SERVICE_NAME="app"
+
+ ## Build App Docker image
+ app-app-build: docker-prune
+ 	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+ 		docker compose \
+ 		--project-name $(APP_DOCKER_PROJECT_NAME) \
+ 		build $(APP_SERVICE_NAME)
+
+ ## Start App Docker image container
+ app-app-start: app-app-stop app-app-build
+ 	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+ 		docker compose \
+ 		--project-name $(APP_DOCKER_PROJECT_NAME) \
+ 		up -d $(APP_SERVICE_NAME)
+
+ ## Stop App Docker image container
+ app-app-stop:
+ 	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+ 		docker compose \
+ 		--project-name $(APP_DOCKER_PROJECT_NAME) \
+ 		stop $(APP_SERVICE_NAME)
+ 	@ $(MAKE) docker-prune
+
+ ## Open App in web browser
+ app-app-web:
+ 	@ python -m webbrowser "$(APP_WEBSERVER_URL)"
+
+ ###############################################################################
+ #                      Unit Tests and Code checking                           #
+ ###############################################################################
+
+ # See: https://github.com/google/addlicense for more information
+ ## Add licenses to Python files
+ add-licenses:
+ 	@ docker run -it \
+ 		-v ${PWD}:/src \
+ 		ghcr.io/google/addlicense \
+ 		-f ./LICENSE.rst \
+ 		./src/**/*.py
+
+ ## Open up all web endpoints
+ all-web: api-web app-app-web
+ 	@ echo "All web endpoints opened!"
+
+ ###############################################################################
+ #                      PROJECT AND DATA FUNCTIONS                             #
+ ###############################################################################
+
+ DATASET_PATH="https://raw.githubusercontent.com/hamzafarooq/maven-mlsystem-design-cohort-1/main/data/df_embed.csv"
+ DATASET_WITH_SUMMARIES_NAME="cicero_dataset_with_summaries"
+ DATASET_WITH_FAISS_AND_EMBEDDINGS_NAME="cicero_dataset_with_embeddings_and_faiss_index"
+ HUGGING_FACE_REPOSITORY_NAME="cicero_synthesizer"
+ FAISS_OUTPUT_FILENAME="cicero_faiss_index"
+
+ ## Run the data preparation on the input dataset
+ prepare_data:
+ 	@ $(PYTHON_INTERPRETER) \
+ 		$(DATA_PROCESSING_DIRECTORY)/prepare_dataset.py \
+ 		--dataset-path $(DATASET_PATH)
+
+
+ ## Run the script for creating a FAISS index and text embeddings of the dataset
+ run_faiss_and_embeddings:
+ 	@ $(PYTHON_INTERPRETER) \
+ 		$(TRAINING_DIRECTORY)/create_faiss_corpus_index.py \
+ 		--dataset-name $(DATASET_WITH_SUMMARIES_NAME) \
+ 		--output-dataset-name $(DATASET_WITH_FAISS_AND_EMBEDDINGS_NAME) \
+ 		--repository-name $(HUGGING_FACE_REPOSITORY_NAME) \
+ 		--faiss-index-name $(FAISS_OUTPUT_FILENAME)
+
+
+ ###############################################################################
+ #                        Self Documenting Commands                            #
+ ###############################################################################
+
+ .DEFAULT_GOAL := help
+
+ # Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
+ # sed script explained:
+ # /^##/:
+ # 	* save line in hold space
+ # 	* purge line
+ # 	* Loop:
+ # 		* append newline + line to hold space
+ # 		* go to next line
+ # 		* if line starts with doc comment, strip comment character off and loop
+ # 	* remove target prerequisites
+ # 	* append hold space (+ newline) to line
+ # 	* replace newline plus comments by `---`
+ # 	* print line
+ # Separate expressions are necessary because labels cannot be delimited by
+ # semicolon; see <http://stackoverflow.com/a/11799865/1968>
+ help:
+ 	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
+ 	@echo
+ 	@sed -n -e "/^## / { \
+ 		h; \
+ 		s/.*//; \
+ 		:doc" \
+ 		-e "H; \
+ 		n; \
+ 		s/^## //; \
+ 		t doc" \
+ 		-e "s/:.*//; \
+ 		G; \
+ 		s/\\n## /---/; \
+ 		s/\\n/ /g; \
+ 		p; \
+ 	}" ${MAKEFILE_LIST} \
+ 	| LC_ALL='C' sort --ignore-case \
+ 	| awk -F '---' \
+ 		-v ncol=$$(tput cols) \
+ 		-v indent=25 \
+ 		-v col_on="$$(tput setaf 6)" \
+ 		-v col_off="$$(tput sgr0)" \
+ 	'{ \
+ 		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
+ 		n = split($$2, words, " "); \
+ 		line_length = ncol - indent; \
+ 		for (i = 1; i <= n; i++) { \
+ 			line_length -= length(words[i]) + 1; \
+ 			if (line_length <= 0) { \
+ 				line_length = ncol - indent - length(words[i]) - 1; \
+ 				printf "\n%*s ", -indent, " "; \
+ 			} \
+ 			printf "%s ", words[i]; \
+ 		} \
+ 		printf "\n"; \
+ 	}' \
+ 	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
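Because `.DEFAULT_GOAL := help`, running plain `make` falls through to the self-documenting `help` target above. A typical first run, assuming either conda or pyenv is installed:

```bash
# List every documented target (the default goal is 'help')
make

# Inspect the resolved configuration, then bootstrap the project
make show-params
make init
```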
README.md CHANGED
@@ -1,11 +1,282 @@
- ---
- title: Cicero Synthesizer Space
- emoji:
- colorFrom: purple
- colorTo: pink
- sdk: docker
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/Maven-Building-LLMS-into-Production/Cicero-LLM-Synthesizer/code-linting.yml)
+
+ # Cicero LLM Synthesizer
+
+ ## Contents
+
+ - [Setup](#setup)
+ - [Setup for local code development](#setup-for-local-code-development)
+ - [Makefile](#makefile)
+ - [Starting up the Docker container and initializing the repository](#starting-up-the-docker-container-and-initializing-the-repository)
+ - [Starting the API service](#starting-the-api-service)
+ - [Starting up all the services](#starting-up-all-the-services)
+ - [Tests](#tests)
+ - [Helpful Commands](#helpful-commands)
+ - [VS Code Extensions](#vs-code-extensions)
+ - [GPT3.5 summaries](#gpt35-summaries)
+ - [Resources](#resources)
+
+ ## Setup
+
+ Ensure you have Python and pip installed:
+
+ ```shell
+ python --version
+ pip --version
+ ```
+
+ From the root directory, run the following command to install the
+ dependencies: `pip install -r requirements.txt`
+
+ You can run the app using this command: `python -m uvicorn src.api.index:app --reload`
+
+ Once running, you can navigate to `http://127.0.0.1:8000/docs` to view the
+ interactive API documentation.
+
+ ## Setup for local code development
+
+ A few steps need to be completed before you can properly run and develop
+ the code in this repository.
+
+ The following steps have to happen before you start working on or testing
+ the pipelines of this repository:
+
+ ### Makefile
+
+ The project comes with a `Makefile` (**not supported on Windows!**)
+ that can be used for executing commands that will make interacting
+ with this project much smoother. Keep in mind that folders with spaces
+ in their names may cause issues.
+
+ One can see all of the available options by running:
+
+ ```bash
+ $: make
+
+ Available rules:
+
+ add-licenses              Add licenses to Python files
+ all-start                 Starts both the API service and the local development service
+ all-stop                  Stops both the API service and the local development service
+ all-web                   Open up all web endpoints
+ api-build                 Build API Docker image
+ api-start                 Start API Docker image container
+ api-stop                  Stop API Docker image container
+ api-web                   Open API in web browser
+ app-app-build             Build App Docker image
+ app-app-start             Start App Docker image container
+ app-app-stop              Stop App Docker image container
+ app-app-web               Open App in web browser
+ clean                     Removes artifacts from the build stage, and other common Python artifacts.
+ clean-build               Remove build artifacts
+ clean-images              Clean left-over images
+ clean-model-files         Remove files related to pre-trained models
+ clean-pyc                 Removes Python file artifacts
+ clean-secrets             Removes secret artifacts - Serverless
+ clean-test                Remove test and coverage artifacts
+ create-environment        Creates the Python environment
+ create-envrc              Set up the envrc file for the project.
+ delete-environment        Deletes the Python environment
+ delete-envrc              Delete the local envrc file of the project
+ destroy                   Remove ALL of the artifacts + Python environments
+ docker-local-dev-build    Build local development Docker image
+ docker-local-dev-login    Start a shell session into the docker container
+ docker-local-dev-start    Start service for local development
+ docker-local-dev-stop     Stop service for local development
+ docker-prune              Clean Docker images
+ git-flow-install          Install git-flow
+ init                      Initialize the repository for code development
+ lint                      Run the 'pre-commit' linting step manually
+ pip-upgrade               Upgrade the version of the 'pip' package
+ pre-commit-install        Installing the pre-commit Git hook
+ pre-commit-uninstall      Uninstall the pre-commit Git hook
+ prepare_data              Run the data preparation on the input dataset
+ requirements              Install Python dependencies into the Python environment
+ run_faiss_and_embeddings  Run the script for creating a FAISS index and text embeddings of the dataset
+ show-params               Show the set of input parameters
+ sort-requirements         Sort the project packages requirements file
+ ```
+
+ > **NOTE**: If you're using `Windows`, you may have to copy and, to some
+ > extent, modify the commands that are part of the `Makefile` for some tasks.
+
+ ### Starting up the Docker container and initializing the repository
+
+ In order to work on current / new features, one can use *Docker* to
+ start a new container and begin the local development process.
+
+ To build the Docker image, one must follow these steps:
+
+ 1. Start the Docker daemon. If you're on a Mac, you can use the
+    Docker Desktop app.
+ 2. Go to the project's directory and run the following command using the `Makefile`:
+
+    ```bash
+    # Go to the project's directory
+    cd /path/to/directory
+
+    # Build the Docker image and start a container
+    make docker-local-dev-start
+    ```
+
+ 3. Log into the container:
+
+    ```bash
+    # Log into the container
+    make docker-local-dev-login
+    ```
+
+ 4. Once you're inside the container, you'll see the following prompt:
+
+    ```bash
+    # Log into the container
+    ???$: make docker-local-dev-login
+    direnv: error /opt/program/.envrc is blocked. Run `direnv allow` to approve its content
+    ```
+
+    > One will see the `direnv` error because `direnv` is installed and one must
+    > *allow* the changes to take effect.
+
+ 5. Allow the `direnv` changes:
+
+    ```bash
+    # Accept the changes
+    $: direnv allow
+    direnv: loading /opt/program/.envrc
+    ```
+
+ 6. The last thing is to initialize the repository. This can easily be done
+    with the `init` command:
+
+    ```bash
+    $: make init
+    ```
+
+    This will do the following tasks:
+    - Clean Python files
+    - Initialize the `.envrc` file used by `direnv`.
+    - Delete an existing Python environment for the project, if it exists.
+    - Create a new environment, if applicable.
+    - Apply `direnv allow` to allow for `direnv` modifications.
+    - Install package requirements via `pip`.
+    - Install `pre-commit` for code-linting and code-checking.
+    - Install `git-flow`, whenever possible.
+
+ These steps allow the user to develop new features within Docker, which
+ makes it easier for developers to have the exact same set of tools available.
+
+ ## Starting the API service
+
+ The project comes with an out-of-the-box solution for starting and stopping
+ the API endpoint via Docker.
+
+ To start the container with the API endpoint, one must run the following
+ command:
+
+ ```bash
+ # Start API service
+ make api-start
+ ```
+
+ This service will start a Docker container that exposes the internal port
+ `7860` to the local host's port `7860`. Once the image has been built and
+ a container has started, one can go to the service's main page by using
+ the following command:
+
+ ```bash
+ # Go to the URL of the API endpoint
+ make api-web
+ ```
+
+ > This will direct the user to the following URL:
+ > [http://localhost:7860/docs](http://localhost:7860/docs)
+
+ In order to *stop* the API service, one can run the following command:
+
+ ```bash
+ # Stop the API service
+ make api-stop
+ ```
+
+ As one customizes the FastAPI app with new features and more, these changes
+ will automatically be reflected at the URL above.
+
+ ### Starting up all the services
+
+ Similar to the sections above, one can spin up or spin down all the
+ services at once with the help of two commands, i.e. `all-start` and `all-stop`.
+
+ In order to spin up both the *api* service and the one for *local development*,
+ one can run:
+
+ ```bash
+ make all-start
+ ```
+
+ This command will start both services, and one will be able to log into the
+ container for local development, as well as to connect to the API via the
+ browser.
+
+ Similarly, in order to spin down all of the services, one can simply run:
+
+ ```bash
+ make all-stop
+ ```
+
+ This will stop both services and delete any unused Docker containers.
+
+ ## Tests
+
+ Unit tests can be found under the `src` folder alongside the source code.
+ Test files end with `_test`. The following command will run all of the tests:
+
+ ```shell
+ python -m pytest -v -s
+ ```
+
+ The `-v` argument is for verbose output. The `-s` argument turns off
+ capture mode so that print statements are printed to the console.
+
+ A Makefile command also exists to run these. See `make test`.
+
+ ## Helpful Commands
+
+ Here is a list of commands that may be helpful when interacting with this project.
+
+ ### Docker
+
+ List all Docker containers:
+
+ ```shell
+ docker ps -a
+ ```
+
+ ## VS Code Extensions
+
+ To help facilitate local development, you can install
+ the [Visual Studio Code Dev Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
+ extension for VS Code. This will allow you to connect to the local development Docker container and more easily develop features.
+
+ ## GPT3.5 summaries
+
+ To generate the GPT3.5 summaries for all articles, use the following commands:
+
+ ```
+ cd src
+ python3 -m utils.gpt35_summaries.cleanup_and_summarize
+ ```
+
+ The output CSV file is placed in `src/utils/gpt35_summaries/df_embed_out.csv`.
+ The pre-generated summaries for all articles are in `df_embed_out2.csv` in the same directory.
+
+ For an example of a focused summary, please see `src/focused_summary_example.py`.
+
+ ## Resources
+
+ - [direnv](https://github.com/direnv/direnv)
+ - [Docker](https://docs.docker.com/reference/)
+ - [Docker Compose](https://docs.docker.com/compose/)
+ - [FastAPI](https://fastapi.tiangolo.com/)
+ - [flake8](https://flake8.pycqa.org/en/latest/)
+ - [git](https://git-scm.com/)
+ - [GitHub Actions](https://docs.github.com/en/actions)
+ - [isort](https://pycqa.github.io/isort/index.html)
+ - [Makefile](https://www.gnu.org/software/make/manual/make.html)
+ - [Markdown](https://www.markdownguide.org/)
+ - [pre-commit](https://pre-commit.com)
+ - [Python](https://www.python.org/)
+ - [tmux](https://github.com/tmux/tmux/wiki/Getting-Started)
data/cicero_faiss_index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2bf0b06752ccad29f09484b07bf65d429ec60a203e3697cc14661cee28447d37
+ size 3511341
data/clean_dataset.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea0aafebd94acbbe0dbd7370d01a34b49284d5012ffadf3fe95130e83f2f13fd
+ size 13751626
data/raw_dataset.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2bb32f8b3a313d28aa704e84335ae9d8aefa039f405677a5054aeee67a75cbeb
+ size 15756643
docker/aliases.sh ADDED
@@ -0,0 +1,20 @@
+ # This is a compilation of useful 'USER-DEFINED' aliases to use
+
+ alias tmux_create='tmux new -s'          # Creates new tmux session.
+ alias tmux_attach='tmux a -t'            # Attaches to an existing tmux session.
+ alias tmux_ls='tmux ls'                  # Lists all of the existing tmux sessions.
+ alias tmux_kill="tmux kill-session -t "  # Kill a specific tmux session
+ alias gadd='git add'                     # Adds a file / directory to repository
+ alias gcom='git commit -m'               # Commits any changes. Use as: gcom "Test"
+ alias gp='git push origin master'        # Pushes changes to 'master'
+ alias gst='git status'                   # Shows the status of the GIT repository.
+ alias sagent="eval $(ssh-agent -s)"      # Start SSH key agent
+ alias sa='conda activate'                # Activates an Anaconda environment
+ alias sd='conda deactivate'              # Deactivates an Anaconda environment
+ alias jl='jupyter lab --ip 0.0.0.0 --port 8890 --no-browser --allow-root'       # Opens 'Jupyter Lab'
+ alias jn='jupyter notebook --ip 0.0.0.0 --port 8890 --no-browser --allow-root'  # Opens 'Jupyter Notebook'
+ alias lll="ls -lah"
+ # Docker-related
+ alias dps="docker ps -a"
+ alias dprune='docker system prune -f'
+ alias dallow="direnv allow"
docker/docker-compose.yaml ADDED
@@ -0,0 +1,71 @@
+ version: '3'
+
+ services:
+   # --- Service used for local development
+   local-dev:
+     # Building the local image
+     build:
+       context: ../
+       dockerfile: ./Dockerfile
+     # Running the local image
+     image: "cicero-synthesizer-local-dev"
+     container_name: "cicero-synthesizer-local-dev"
+     environment:
+       DOCKER_BUILDKIT_VALUE: ${DOCKER_BUILDKIT_VALUE}
+       HUGGING_FACE_HUB_TOKEN: ${HUGGING_FACE_HUB_TOKEN}
+     volumes:
+       - ${HOME}/.ssh:/root/.ssh
+       - /var/run/docker.sock:/var/run/docker.sock
+       - ..:/opt/program
+     working_dir: /opt/program
+     command: [ "/bin/sleep", "365d" ]
+   #
+   # --- Service for running the API locally
+   api:
+     # Building the local image
+     build:
+       context: ../
+       dockerfile: ./Dockerfile
+     # Running the local image
+     image: "cicero-synthesizer-api"
+     container_name: "cicero-synthesizer-api"
+     environment:
+       HUGGING_FACE_HUB_TOKEN: ${HUGGING_FACE_HUB_TOKEN}
+       HUGGING_FACE_USERNAME: ${HUGGING_FACE_USERNAME}
+     volumes:
+       - ..:/opt/ml
+     ports:
+       - ${INPUT_APP_PORT:-8501}:${OUTPUT_APP_PORT:-8501}
+     working_dir: /opt/ml
+     command:
+       [
+         "uvicorn",
+         "src.api.index:app",
+         "--host",
+         "0.0.0.0",
+         "--port",
+         "8501",
+         "--reload",
+         "--reload-dir",
+         "/opt/ml"
+       ]
+   #
+   # --- Service for running the Gradio application locally
+   app:
+     # Building the local image
+     build:
+       context: ../
+       dockerfile: ./Dockerfile
+     # Running the local image
+     image: "cicero-synthesizer-app"
+     container_name: "cicero-synthesizer-app"
+     environment:
+       APP_SERVER_PORT: ${APP_SERVER_PORT}
+       HUGGING_FACE_HUB_TOKEN: ${HUGGING_FACE_HUB_TOKEN}
+       HUGGING_FACE_USERNAME: ${HUGGING_FACE_USERNAME}
+     volumes:
+       - ..:/opt/ml
+     ports:
+       - ${APP_SERVER_PORT:-7860}:${APP_SERVER_PORT:-7860}
+     working_dir: /opt/ml
+     command: [ "python", "src/app_service/app.py" ]
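The Makefile targets shown earlier wrap these services in `docker compose` invocations; a sketch of the underlying commands, assuming the `.envrc` variables are exported (the `--project-name` value is derived from the repository directory name, so it may differ):

```bash
# Roughly what `make api-start` runs under the hood
cd docker
docker compose --project-name cicero_llm_synthesizer_api up -d api

# Tail the API container logs, then shut the service down
docker compose --project-name cicero_llm_synthesizer_api logs -f api
docker compose --project-name cicero_llm_synthesizer_api stop api
```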
pyproject.toml ADDED
@@ -0,0 +1,34 @@
+ [tool.isort]
+ force_grid_wrap = 0
+ include_trailing_comma = true
+ line_length = 79
+ multi_line_output = 3
+ use_parentheses = true
+
+ [tool.pytest.ini_options]
+ addopts = """
+ --cov-report term-missing \
+ --cov src/ -ra"""
+
+ [tool.black]
+ exclude = '''
+ /(
+     \.eggs
+   | \.git
+   | \.hg
+   | \.mypy_cache
+   | \.tox
+   | \.venv
+   | _build
+   | buck-out
+   | build
+   | dist
+   # The following are specific to Black, you probably don't want those.
+   | blib2to3
+   | tests/data
+   | profiling
+ )/
+ '''
+ include = '\.pyi?$'
+ line-length = 79
+ target-version = ['py36', 'py37', 'py38', 'py39']
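The `addopts` above bake coverage reporting into every pytest run, which presumes `pytest` and `pytest-cov` are available; neither is pinned in the requirements files shown in this commit, so a sketch of a working invocation:

```bash
# The configured addopts imply pytest-cov; install both if missing
pip install pytest pytest-cov

# Runs with '--cov-report term-missing --cov src/ -ra' applied automatically
python -m pytest -v -s
```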
requirements-deploy.txt ADDED
@@ -0,0 +1 @@
+ docker==5.0.3
requirements-dev.txt ADDED
@@ -0,0 +1,12 @@
+ -r ./requirements.txt
+ black==22.3.0
+ click>=8.0.2
+ docker==5.0.3
+ flake8==5.0.4
+ ipython>=7.0.1
+ isort>=4.3.21, <5.0
+ jupyter_client>=5.1, <7.0
+ jupyterlab>=0.31.1
+ jupyter~=1.0
+ pre-commit==2.10.1
+ protobuf==3.20.1
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ datasets>=2.13.1
+ faiss-cpu>=1.7.4
+ fastapi==0.92.0
+ gradio>=3.35.2
+ huggingface_hub>=0.15.1
+ openai>=0.27.8
+ pandas>=2.0.2
+ sentence-transformers
+ setuptools==67.6.1
+ spacy>=3.5.3
+ tiktoken>=0.4.0
+ torch==2.0.0
+ tqdm==4.65.0
+ uvicorn==0.20.0
src/.DS_Store ADDED
Binary file (6.15 kB).
src/api/__init__.py ADDED
@@ -0,0 +1,21 @@
+ # MIT License
+ #
+ # Copyright (c) 2023 Victor Calderon
+ #
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
+ # of this software and associated documentation files (the "Software"), to deal
+ # in the Software without restriction, including without limitation the rights
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ # copies of the Software, and to permit persons to whom the Software is
+ # furnished to do so, subject to the following conditions:
+ #
+ # The above copyright notice and this permission notice shall be included in
+ # all copies or substantial portions of the Software.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ # SOFTWARE.
src/api/index.py ADDED
@@ -0,0 +1,182 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import logging
24
+ from typing import Dict, Optional
25
+
26
+ from datasets import Dataset
27
+ from fastapi import Depends, FastAPI
28
+ from fastapi.responses import RedirectResponse
29
+ from huggingface_hub import hf_hub_download
30
+ from pydantic import BaseModel
31
+
32
+ from src.classes import hugging_face_utils as hf
33
+ from src.classes import semantic_search_engine as ss
34
+ from src.utils import default_variables as dv
35
+
36
+ logger = logging.getLogger(__name__)
37
+ logging.basicConfig(
38
+ level=logging.INFO,
39
+ format="%(asctime)s [%(levelname)s]: %(message)s",
40
+ )
41
+ logger.setLevel(logging.INFO)
42
+
43
+
44
+ # ------------------------------- VARIABLES -----------------------------------
45
+
46
+ APP_TITLE = "Cicero LLM Synthesizer"
47
+ APP_DESCRIPTION = f"""
48
+ The '{APP_TITLE}' is an app that identifies the top-N articles from the
49
+ Cicero database that are most similar to the user's input query.
50
+ """
51
+ APP_VERSION = "0.1"
52
+
53
+
54
+ # ----------------------------- APP-SPECIFIC ----------------------------------
55
+
56
+ # Defining the FastAPI application object
57
+ app = FastAPI(
58
+ title=APP_TITLE,
59
+ description=APP_DESCRIPTION,
60
+ version=APP_VERSION,
61
+ )
62
+
63
+ # -------------------------------- CLASSES ------------------------------------
64
+
65
+
66
+ class QueryParams(BaseModel):
67
+ input_query: str
68
+ number_articles: Optional[int] = 5
69
+
70
+
71
+ # ------------------------------- FUNCTIONS -----------------------------------
72
+
73
+
74
+ def download_dataset_and_faiss_index() -> Dataset:
75
+ """
76
+ Function to download the corresponding dataset and the FAISS index
77
+ from HuggingFace.
78
+
79
+ Returns
80
+ -------------
81
+ dataset_with_faiss_index : datasets.Dataset
82
+ Dataset from HuggingFace with the FAISS index loaded.
83
+ """
84
+ # --- Initializing HuggingFace API
85
+ # Object for interacting with HuggingFace
86
+ hf_obj = hf.HuggingFaceHelper()
87
+
88
+ # Defining variable names for each of the objects
89
+ faiss_index_name = f"{dv.faiss_index_name}.faiss"
90
+ dataset_name = dv.dataset_faiss_embeddings_name
91
+ username = hf_obj.username
92
+ repository_name = dv.hugging_face_repository_name
93
+ repository_id = f"{username}/{repository_name}"
94
+ repository_type = "dataset"
95
+ split_type = "train"
96
+
97
+ # --- Downloading FAISS Index
98
+ faiss_index_local_path = hf_hub_download(
99
+ repo_id=repository_id,
100
+ filename=faiss_index_name,
101
+ repo_type=repository_type,
102
+ token=hf_obj.api.token,
103
+ )
104
+
105
+ # --- Downloading Dataset
106
+ dataset_obj = hf_obj.get_dataset_from_hub(
107
+ dataset_name=dataset_name,
108
+ username=username,
109
+ split=split_type,
110
+ )
111
+
112
+ # --- Adding FAISS index to the dataset
113
+ dataset_obj.load_faiss_index(
114
+ index_name=dv.embeddings_colname,
115
+ file=faiss_index_local_path,
116
+ )
117
+
118
+ return dataset_obj
119
+
120
+
121
+ def run_semantic_search_task(query: str, number_articles: int) -> Dict:
122
+ """
123
+ Function to run semantic search on an input query. It will return a
124
+ set of 'Top-N' articles that are most similar to the input query.
125
+
126
+ Parameters
127
+ ------------
128
+ query : str
129
+ Input query to use when running the Semantic Search Engine.
130
+
131
+ number_articles : int
132
+ Number of articles to return from the Semantic Search.
133
+
134
+ Returns
135
+ ----------
136
+ ranked_results : dict
137
+ Dictionary containing the ranked results from the Semantic
138
+ Search Engine.
139
+ """
140
+ # --- Extracting dataset with FAISS index
141
+ corpus_dataset_with_faiss_index = download_dataset_and_faiss_index()
142
+
143
+ # --- Initializing Semantic Search Engine
144
+ semantic_search_obj = ss.SemanticSearchEngine(
145
+ corpus_dataset_with_faiss_index=corpus_dataset_with_faiss_index
146
+ )
147
+
148
+ # --- Running search on Top-N results
149
+
150
+ return semantic_search_obj.run_semantic_search(
151
+ query=query,
152
+ top_n=number_articles,
153
+ )
154
+
155
+
156
+ # -------------------------------- ROUTES -------------------------------------
157
+
158
+
159
+ @app.get("/", include_in_schema=False)
160
+ async def docs_redirect():
161
+ return RedirectResponse(url="/docs")
162
+
163
+
164
+ # ---- Semantic Search
165
+ @app.post("/predict")
166
+ async def run_semantic_search(query_params: QueryParams = Depends()):
167
+ """
168
+ Function to run semantic search on an input query.
169
+
170
+ Parameters
171
+ --------------
172
+ query : str
173
+ Input query to use when running the Semantic Search Engine.
174
+
175
+ number_articles : int
176
+ Number of articles to return from the Semantic Search.
177
+ """
178
+
179
+ return run_semantic_search_task(
180
+ query=query_params.input_query,
181
+ number_articles=query_params.number_articles,
182
+ )
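
A quick way to exercise the /predict route above is to serve the module with uvicorn and POST a query against it. The sketch below is illustrative and not part of the commit; the port and the query values are assumptions.

# Serve the API first, e.g.: uvicorn src.api.index:app --port 8000
import requests

# QueryParams is consumed via Depends(), so the fields travel as query
# parameters rather than as a JSON body.
response = requests.post(
    "http://localhost:8000/predict",
    params={"input_query": "economic policy", "number_articles": 3},
)
print(response.json())
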
src/app_service/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
src/app_service/app.py ADDED
@@ -0,0 +1,167 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import logging
24
+ from typing import List
25
+
26
+ import gradio as gr
27
+ from datasets import Dataset
28
+ from huggingface_hub import hf_hub_download
29
+
30
+ from src.classes import hugging_face_utils as hf
31
+ from src.classes import semantic_search_engine as ss
32
+ from src.utils import default_variables as dv
33
+
34
+ logger = logging.getLogger(__name__)
35
+ logging.basicConfig(
36
+ level=logging.INFO,
37
+ format="%(asctime)s [%(levelname)s]: %(message)s",
38
+ )
39
+ logger.setLevel(logging.INFO)
40
+
41
+
42
+ # ------------------------------ VARIABLES ------------------------------------
43
+
44
+ APP_TITLE = "Cicero LLM Synthesizer"
45
+ APP_DESCRIPTION = f"""
46
+ The '{APP_TITLE}' is an app that identifies the top-N articles from the
47
+ Cicero database that are most similar to the user's input query.
48
+ """
49
+ APP_VERSION = "0.1"
50
+
51
+
52
+ # ------------------------------ FUNCTIONS ------------------------------------
53
+
54
+
55
+ def download_dataset_and_faiss_index() -> Dataset:
56
+ """
57
+ Function to download the corresponding dataset and the FAISS index
58
+ from HuggingFace.
59
+
60
+ Returns
61
+ -------------
62
+ dataset_with_faiss_index : datasets.Dataset
63
+ Dataset from HuggingFace with the FAISS index loaded.
64
+ """
65
+ # --- Initializing HuggingFace API
66
+ # Object for interacting with HuggingFace
67
+ hf_obj = hf.HuggingFaceHelper()
68
+
69
+ # Defining variable names for each of the objects
70
+ faiss_index_name = f"{dv.faiss_index_name}.faiss"
71
+ dataset_name = dv.dataset_faiss_embeddings_name
72
+ username = hf_obj.username
73
+ repository_name = dv.hugging_face_repository_name
74
+ repository_id = f"{username}/{repository_name}"
75
+ repository_type = "dataset"
76
+ split_type = "train"
77
+
78
+ # --- Downloading FAISS Index
79
+ faiss_index_local_path = hf_hub_download(
80
+ repo_id=repository_id,
81
+ filename=faiss_index_name,
82
+ repo_type=repository_type,
83
+ token=hf_obj.api.token,
84
+ )
85
+
86
+ # --- Downloading Dataset
87
+ dataset_obj = hf_obj.get_dataset_from_hub(
88
+ dataset_name=dataset_name,
89
+ username=username,
90
+ split=split_type,
91
+ )
92
+
93
+ # --- Adding FAISS index to the dataset
94
+ dataset_obj.load_faiss_index(
95
+ index_name=dv.embeddings_colname,
96
+ file=faiss_index_local_path,
97
+ )
98
+
99
+ return dataset_obj
100
+
101
+
102
+ def run_semantic_search_task(query: str, number_articles: int) -> List:
103
+ # sourcery skip: remove-unnecessary-cast
104
+ """
105
+ Function to run semantic search on an input query. It will return a
106
+ set of 'Top-N' articles that are most similar to the input query.
107
+
108
+ Parameters
109
+ ------------
110
+ query : str
111
+ Input query to use when running the Semantic Search Engine.
112
+
113
+ number_articles : int
114
+ Number of articles to return from the Semantic Search.
115
+
116
+ Returns
117
+ ----------
118
+ ranked_results : list
119
+ List containing the ranked results from the Semantic
120
+ Search Engine.
121
+ """
122
+ # --- Extracting dataset with FAISS index
123
+ corpus_dataset_with_faiss_index = download_dataset_and_faiss_index()
124
+
125
+ # --- Initializing Semantic Search Engine
126
+ semantic_search_obj = ss.SemanticSearchEngine(
127
+ corpus_dataset_with_faiss_index=corpus_dataset_with_faiss_index
128
+ )
129
+
130
+ # --- Running search on Top-N results
131
+ number_articles_mod = int(number_articles)
132
+
133
+ results = semantic_search_obj.run_semantic_search(
134
+ query=query,
135
+ top_n=number_articles_mod,
136
+ )
137
+
138
+ return list(results.values())
139
+
140
+
141
+ # --------------------------------- APP ---------------------------------------
142
+
143
+ # -- Semantic Search Engine
144
+ semantic_search_engine = gr.Interface(
145
+ fn=run_semantic_search_task,
146
+ inputs=[
147
+ gr.components.Textbox(label="Input Query"),
148
+ gr.Slider(
149
+ minimum=1,
150
+ label="Choose number of documents to retrieve",
151
+ step=1,
152
+ ),
153
+ ],
154
+ outputs="json",
155
+ title=APP_TITLE,
156
+ description=APP_DESCRIPTION,
157
+ )
158
+
159
+
160
+ # ----------------------------- RUNNING APP -----------------------------------
161
+
162
+ if __name__ == "__main__":
163
+ semantic_search_engine.launch(
164
+ debug=False,
165
+ share=False,
166
+ server_port=7860,
167
+ )
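
For reference, the task function wired into the Gradio interface can also be called directly. A minimal sketch, assuming the HuggingFace token and username are exported in the environment; the query string is illustrative.

from src.app_service.app import run_semantic_search_task

# Returns a list of row dictionaries (_id, title, relevance, content)
matches = run_semantic_search_task(query="immigration policy", number_articles=3)
for match in matches:
    print(match["title"], match["relevance"])
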
src/classes/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
src/classes/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (184 Bytes).
 
src/classes/__pycache__/hugging_face_utils.cpython-39.pyc ADDED
Binary file (5.01 kB).
 
src/classes/__pycache__/semantic_search_engine.cpython-39.pyc ADDED
Binary file (5.86 kB).
 
src/classes/data_preparation.py ADDED
@@ -0,0 +1,403 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ """
24
+ Module that contains the class definitions for the data preparation tasks.
25
+ """
26
+
27
+ import logging
28
+ import re
29
+ from datetime import datetime
30
+ from typing import List, Optional, Tuple, Union
31
+
32
+ import pandas as pd
33
+ from spacy.lang.en.stop_words import STOP_WORDS
34
+
35
+ from src.classes import hugging_face_utils as hf
36
+ from src.utils import default_variables as dv
37
+ from src.utils import general_utilities as gu
38
+
39
+ logger = logging.getLogger(__name__)
40
+ logging.basicConfig(level=logging.INFO)
41
+
42
+ # ---------------------------- CLASS DEFINITIONS ------------------------------
43
+
44
+
45
+ # -- Defining functions that can be used for cleaning up and preparing text
46
+ class NLPPrep(object):
47
+ """
48
+ Class object for handling the data processing of text.
49
+ """
50
+
51
+ def __init__(self):
52
+ # Defining the corresponding stop words
53
+ self.stop_words = list(STOP_WORDS)
54
+
55
+ def _lowercase_text(self, input_string: str) -> str:
56
+ """
57
+ Method for making the input text lowercase.
58
+
59
+ Parameters
60
+ ------------
61
+ input_string : str
62
+ Text variable to lowercase.
63
+
64
+ Returns
65
+ ----------
66
+ output_string : str
67
+ Lower-cased version of ``input_string``.
68
+ """
69
+
70
+ return input_string.lower()
71
+
72
+ def _only_keep_alphanumeric(self, input_string: str) -> str:
73
+ """
74
+ Method for only keeping alphanumerical characters in the text.
75
+
76
+ Parameters
77
+ ------------
78
+ input_string : str
79
+ Text variable to filter.
80
+
81
+ Returns
82
+ ----------
83
+ output_string : str
84
+ Filtered version of ``input_string`` that only contains
85
+ alphanumerical characters.
86
+ """
87
+ regex_pattern = r"[^a-zA-z0-9\s]"
88
+
89
+ return re.sub(regex_pattern, "", input_string)
90
+
91
+ def _remove_stopwords(self, input_string: str) -> str:
92
+ """
93
+ Method for removing stop words from the input text.
94
+
95
+ Parameters
96
+ ------------
97
+ input_string : str
98
+ Text variable to filter.
99
+
100
+ Returns
101
+ ----------
102
+ output_string : str
103
+ Filtered version of ``input_string`` without stop words in
104
+ the text.
105
+ """
106
+ # Splitting the text into 'tokens'
107
+ tokens = input_string.strip().split()
108
+
109
+ return " ".join(
110
+ [word for word in tokens if word not in self.stop_words]
111
+ )
112
+
113
+ def _remove_unicode(self, input_str: str) -> str:
114
+ """
115
+ Method for removing Unicode from the input text.
116
+
117
+ Parameters
118
+ ------------
119
+ input_str : str
120
+ Text variable, from which to remove Unicode characters.
121
+
122
+ Returns
123
+ ----------
124
+ string_decode : str
125
+ Filtered version of ``input_str`` without the Unicode characters.
126
+ """
127
+ string_encode = input_str.encode("ascii", "ignore")
128
+
129
+ return string_encode.decode()
130
+
131
+ def process_text(self, input_string: str) -> str:
132
+ """
133
+ Method for passing the input variable through NLP-based techniques
134
+ to process the text.
135
+
136
+ Parameters
137
+ ------------
138
+ input_string : str
139
+ Variable corresponding to the text that will be processed.
140
+
141
+ Returns
142
+ ------------
143
+ processed_string : str
144
+ Variable corresponding to the *processed* version of the input
145
+ string, after having gone through some NLP-based processing
146
+ techniques.
147
+
148
+ Notes
149
+ -----------
150
+ This function will perform the following NLP-based techniques:
151
+
152
+ 1. Make the text lowercase.
153
+ 2. Remove any non-alphanumeric character from the string.
154
+ 3. Remove any stop words from the text.
155
+ """
156
+ # Remove Unicode characters
157
+ processed_string = self._remove_unicode(input_string)
158
+ # Lower case the text
159
+ processed_string = self._lowercase_text(processed_string)
160
+ # Removing non-alphanumeric characters
161
+ processed_string = self._only_keep_alphanumeric(processed_string)
162
+ # Removing stop words
163
+ processed_string = self._remove_stopwords(processed_string)
164
+
165
+ return processed_string
166
+
167
+
168
+ class DatasetPrep(object):
169
+ """
170
+ Class object for the Data Processing of the input dataset.
171
+ """
172
+
173
+ def __init__(
174
+ self,
175
+ dataset_path: str,
176
+ **kwargs,
177
+ ):
178
+ """
179
+ Class object for the Data Processing of the input dataset.
180
+
181
+ Parameters
182
+ ------------
183
+ dataset_path : str
184
+ Path / URL to the input dataset.
185
+ """
186
+ # Path to the output dataset
187
+ self.datasets_dir = gu.get_project_paths()["data"]
188
+
189
+ # Other parameters
190
+ for colname in [
191
+ "save_to_disk",
192
+ "document_id_colname",
193
+ "title_colname",
194
+ "content_colname",
195
+ "clean_content_colname",
196
+ ]:
197
+ setattr(self, colname, kwargs.get(colname, getattr(dv, colname)))
198
+
199
+ # Initializing dataset
200
+ self.dataset_path = dataset_path
201
+ self.raw_dataset = self._get_dataset()
202
+
203
+ # Extracting the number of rows and columns, and column names
204
+ (
205
+ self.n_rows,
206
+ self.n_cols,
207
+ self.columns_names,
208
+ ) = self._get_columns_and_shape()
209
+
210
+ # Initializing NLP-Prep Object
211
+ self.nlp_obj = NLPPrep()
212
+
213
+ def show_params(self):
214
+ """
215
+ Method for displaying the set of input parameters of the class.
216
+ """
217
+
218
+ gu.show_params(
219
+ params_dict=self.__dict__,
220
+ logger=logger,
221
+ columns_to_omit=["raw_dataset"],
222
+ )
223
+
224
+ def _get_dataset(self) -> pd.DataFrame:
225
+ # sourcery skip: class-extract-method
226
+ """
227
+ Method for extracting the dataset from the input source.
228
+
229
+ Returns
230
+ ----------
231
+ raw_dataset : pandas.DataFrame
232
+ DataFrame containing the data from the input source.
233
+ """
234
+ logger.info(f">> Extracting dataset from `{self.dataset_path}`")
235
+
236
+ # Reading in dataset
237
+ raw_dataset = pd.read_csv(self.dataset_path)
238
+
239
+ # Saving to disk, if applicable
240
+ if self.save_to_disk:
241
+ dataset_filepath = self.datasets_dir.joinpath("raw_dataset.csv")
242
+ dataset_filepath.parent.mkdir(exist_ok=True, parents=True)
243
+ raw_dataset.to_csv(dataset_filepath, header=True, index=True)
244
+
245
+ logger.info(f">> Raw dataset saved to '{str(dataset_filepath)}'")
246
+
247
+ return raw_dataset
248
+
249
+ def _get_columns_and_shape(self) -> Tuple[int, int, List]:
250
+ # sourcery skip: use-fstring-for-formatting
251
+ """
252
+ Method for extracting the columns and information about the
253
+ raw dataset.
254
+
255
+ Returns
256
+ ----------
257
+ n_rows : int
258
+ Number of rows in the original dataset.
259
+
260
+ n_cols : int
261
+ Number of columns in the original dataset.
262
+
263
+ column_names_arr : list
264
+ List of columns from the original dataset.
265
+ """
266
+ # Number of rows and columns
267
+ n_rows, n_columns = self.raw_dataset.shape
268
+
269
+ logger.info(
270
+ ">> There are '{}' rows and '{}' columns in the dataset".format(
271
+ n_rows,
272
+ n_columns,
273
+ )
274
+ )
275
+
276
+ # Column names
277
+ column_names_arr = sorted(self.raw_dataset.columns)
278
+
279
+ logger.info(
280
+ ">> Columns in the dataset: \n\t{}".format(
281
+ "\n\t".join(column_names_arr)
282
+ )
283
+ )
284
+
285
+ return n_rows, n_columns, column_names_arr
286
+
287
+ def _process_text(self, input_text: str) -> str:
288
+ """
289
+ Method for applying NLP-based techniques on an input text in order
290
+ to prepare it to be used by the embedding algorithm.
291
+
292
+ Parameters
293
+ -----------
294
+ input_text : str
295
+ Variable corresponding to the input text.
296
+
297
+ Returns
298
+ -----------
299
+ processed_text : str
300
+ Processed version of the ``input_text``.
301
+
302
+ Notes
303
+ ----------
304
+ This function will perform the following NLP-based techniques:
305
+
306
+ 1. Make the text lowercase.
307
+ 2. Remove any non-alphanumeric character from the string.
308
+ 3. Remove any stop words from the text.
309
+ """
310
+
311
+ return self.nlp_obj.process_text(input_string=input_text)
312
+
313
+ def clean_dataset(self) -> pd.DataFrame:
314
+ """
315
+ Method for cleaning the raw dataset and creating a clean version
316
+ of the dataset.
317
+
318
+ Returns
319
+ ---------
320
+ dataset_clean : pandas.DataFrame
321
+ Clean version of the input dataset, after having gone through
322
+ data-cleaning techniques.
323
+ """
324
+ # --- Start time
325
+ logger.info(">> Data cleaning process ...")
326
+ start_time = datetime.now()
327
+ #
328
+
329
+ # --- Making a copy of the raw dataset
330
+ dataset_df = self.raw_dataset.copy()
331
+
332
+ # --- Data-cleaning techniques
333
+ # Removing duplicates
334
+ dataset_df.drop_duplicates(keep="first", inplace=True)
335
+
336
+ # Removing entries that have 'NaN' in the dataset
337
+ dataset_df.dropna(how="any", inplace=True)
338
+
339
+ # Casting proper data types
340
+ dataset_df = dataset_df.astype(str)
341
+
342
+ # Resetting the index of the dataset
343
+ dataset_df.reset_index(drop=True, inplace=True)
344
+
345
+ # Removing trailing whitespaces
346
+ for colname in [self.document_id_colname, self.title_colname]:
347
+ dataset_df.loc[:, colname] = dataset_df[colname].apply(
348
+ lambda x: x.strip()
349
+ )
350
+
351
+ # Processing content
352
+ dataset_df.loc[:, getattr(self, "clean_content_colname")] = dataset_df[
353
+ getattr(self, "content_colname")
354
+ ].apply(lambda text: self.nlp_obj.process_text(text))
355
+
356
+ # --- Saving to disk, if applicable
357
+ if self.save_to_disk:
358
+ dataset_filepath = self.datasets_dir.joinpath("clean_dataset.csv")
359
+ dataset_filepath.parent.mkdir(exist_ok=True, parents=True)
360
+ dataset_df.to_csv(dataset_filepath, header=True, index=True)
361
+
362
+ logger.info(f">> Clean dataset saved to '{str(dataset_filepath)}'")
363
+
364
+ # --- End time
365
+ end_time = datetime.now()
366
+ logger.info(f">> Finished at: {end_time}")
367
+ logger.info(f">> Took: {end_time - start_time}")
368
+ logger.info(">> Data cleaning process ... DONE")
369
+
370
+ return dataset_df
371
+
372
+ def push_dataset_to_hub(
373
+ self,
374
+ dataset: pd.DataFrame,
375
+ dataset_name: str,
376
+ username: Optional[Union[None, str]] = None,
377
+ ):
378
+ """
379
+ Method for pushing the ``dataset`` to the HuggingFace Hub.
380
+
381
+ Parameters
382
+ -------------
383
+ dataset : pandas.DataFrame
384
+ Dataset that will be pushed to HuggingFace.
385
+
386
+ dataset_name : str
387
+ Name of the dataset to use.
388
+
389
+ username : str, NoneType, optional
390
+ Username to use when pushing the dataset to the HuggingFace Hub.
391
+ """
392
+ # Initializing class object
393
+ hf_obj = hf.HuggingFaceHelper()
394
+
395
+ # Transforming dataset type
396
+ hf_dataset = hf_obj.convert_dataframe_to_dataset(input_df=dataset)
397
+
398
+ # Push dataset to hub
399
+ hf_obj.push_dataset(
400
+ dataset=hf_dataset,
401
+ dataset_name=dataset_name,
402
+ username=username,
403
+ )
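
A minimal sketch of the NLPPrep pipeline defined above, with an illustrative input sentence. process_text() strips Unicode, lowercases, drops non-alphanumeric characters, and removes spaCy stop words, so the output should be roughly "simple example punctuation".

from src.classes.data_preparation import NLPPrep

nlp_prep = NLPPrep()
# Stop words ("this", "is", "a", "with") and punctuation are removed
print(nlp_prep.process_text("This is a SIMPLE example, with punctuation!"))
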
src/classes/hugging_face_utils.py ADDED
@@ -0,0 +1,223 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ """
24
+ Module that includes utilities for interacting with HuggingFace
25
+ """
26
+
27
+ import logging
28
+ import os
29
+ from typing import Dict, Optional, Union
30
+
31
+ import pandas as pd
32
+ from datasets import Dataset, load_dataset
33
+ from huggingface_hub import HfApi
34
+
35
+ from src.utils import default_variables as dv
36
+
37
+ __all__ = ["HuggingFaceHelper"]
38
+
39
+
40
+ logger = logging.getLogger(__name__)
41
+ logging.basicConfig(level=logging.INFO)
42
+ logger.setLevel(level=logging.INFO)
43
+
44
+
45
+ class HuggingFaceHelper(object):
46
+ """
47
+ Class definition for creating, sharing, and interacting with Datasets.
48
+ """
49
+
50
+ def __init__(self, **kwargs: Dict) -> None:
51
+ """
52
+ Class definition for creating, sharing, and interacting with Datasets.
53
+ """
54
+ # Name of the HuggingFace token as stored in the user's environment
55
+ self.token_name = kwargs.get("token_name", dv.hugging_face_token_name)
56
+ self.username = kwargs.get(
57
+ "username",
58
+ os.environ.get(dv.hugging_face_username_name),
59
+ )
60
+
61
+ # HuggingFace endpoint
62
+ self.api_endpoint = "https://huggingface.co"
63
+ self.api = self._authenticate_api()
64
+
65
+ def _authenticate_api(self) -> HfApi:
66
+ """
67
+ Method for authenticating with HuggingFace using an authentication
68
+ token.
69
+
70
+ Returns
71
+ ---------
72
+ huggingface_api : huggingface_hub.hf_api.HfApi
73
+ Object corresponding to the HuggingFace API after authentication.
74
+ """
75
+ # Check that token is part of the user's environment
76
+ if not os.environ.get(self.token_name):
77
+ msg = f">>> HuggingFace API Token '{self.token_name}' not defined!"
78
+ logger.error(msg)
79
+ raise ValueError(msg)
80
+
81
+ # Initializing API object
82
+ return HfApi(
83
+ endpoint=self.api_endpoint,
84
+ token=os.environ.get(self.token_name),
85
+ )
86
+
87
+ def convert_dataframe_to_dataset(
88
+ self,
89
+ input_df: pd.DataFrame,
90
+ ) -> Dataset:
91
+ """
92
+ Function to convert an existing DataFrame into a ``Dataset`` object
93
+
94
+ Parameters
95
+ -------------
96
+ input_df : pandas.DataFrame
97
+ Variable corresponding to the DataFrame to convert.
98
+
99
+ Returns
100
+ -----------
101
+ dataset_obj : datasets.Dataset
102
+ Dataset object with the same data as ``input_df``.
103
+ """
104
+
105
+ return Dataset.from_pandas(df=input_df)
106
+
107
+ def get_dataset_from_hub(
108
+ self,
109
+ dataset_name: str,
110
+ username: Optional[Union[None, str]] = None,
111
+ split: Optional[Union[None, str]] = None,
112
+ ) -> Dataset:
113
+ # sourcery skip: extract-duplicate-method, use-fstring-for-formatting
114
+ """
115
+ Method for extracting the Dataset from HuggingFace.
116
+
117
+ Parameters
118
+ ------------
119
+ dataset_name : str
120
+ Name of the dataset to extract from HuggingFace's Hub.
121
+
122
+ username : str, NoneType, optional
123
+ Username to use when extracting the dataset from HuggingFace Hub.
124
+ This variable is set to ``None`` by default.
125
+
126
+ split : str, NoneType, optional
127
+ Type of ``split`` to load for the Dataset. If ``None``, the
128
+ method will extract all splits. This variable is set to
129
+ ``None`` by default.
130
+
131
+ Returns
132
+ --------
133
+ dataset_obj : datasets.Dataset
134
+ Variable corresponding to the dataset that was extracted
135
+ from the HuggingFace Hub.
136
+ """
137
+ # 'dataset_name' - Type
138
+ dataset_name_type_arr = (str,)
139
+ if not isinstance(dataset_name, dataset_name_type_arr):
140
+ msg = (
141
+ ">> 'dataset_name' ({}) is not a valid input type ({})".format(
142
+ type(dataset_name),
143
+ dataset_name_type_arr,
144
+ )
145
+ )
146
+ logger.error(msg)
147
+ raise TypeError(msg)
148
+ # 'username' - Type
149
+ username_type_arr = (str, type(None))
150
+ if not isinstance(username, username_type_arr):
151
+ msg = ">> 'username' ({}) is not a valid input type ({})".format(
152
+ type(username),
153
+ username_type_arr,
154
+ )
155
+ logger.error(msg)
156
+ raise TypeError(msg)
157
+ # 'split' - Type
158
+ split_type_arr = (str, type(None))
159
+ if not isinstance(split, split_type_arr):
160
+ msg = ">> 'split' ({}) is not a valid input type ({})".format(
161
+ type(split),
162
+ split_type_arr,
163
+ )
164
+ logger.error(msg)
165
+ raise TypeError(msg)
166
+
167
+ # Defining the path to the dataset in HF.
168
+ dataset_path = (
169
+ f"{username}/{dataset_name}" if username else dataset_name
170
+ )
171
+
172
+ return load_dataset(dataset_path, split=split)
173
+
174
+ def push_dataset(
175
+ self,
176
+ dataset: Dataset,
177
+ dataset_name: str,
178
+ username: Optional[Union[None, str]] = None,
179
+ ): # sourcery skip: extract-duplicate-method, use-fstring-for-formatting
180
+ """
181
+ Method for pushing an existing local Dataset to HuggingFace.
182
+ """
183
+ # --- Check input type
184
+ # 'dataset' - Type
185
+ dataset_type_arr = (Dataset,)
186
+ if not isinstance(dataset, dataset_type_arr):
187
+ msg = ">> 'dataset' ({}) is not a valid input type ({})".format(
188
+ type(dataset),
189
+ dataset_type_arr,
190
+ )
191
+ logger.error(msg)
192
+ raise TypeError(msg)
193
+ # 'dataset_name' - Type
194
+ dataset_name_type_arr = (str,)
195
+ if not isinstance(dataset_name, dataset_name_type_arr):
196
+ msg = (
197
+ ">> 'dataset_name' ({}) is not a valid input type ({})".format(
198
+ type(dataset_name),
199
+ dataset_name_type_arr,
200
+ )
201
+ )
202
+ logger.error(msg)
203
+ raise TypeError(msg)
204
+ # 'username' - Type
205
+ username_type_arr = (str, type(None))
206
+ if not isinstance(username, username_type_arr):
207
+ msg = ">> 'username' ({}) is not a valid input type ({})".format(
208
+ type(username),
209
+ username_type_arr,
210
+ )
211
+ logger.error(msg)
212
+ raise TypeError(msg)
213
+
214
+ # Defining the path to the dataset in HF.
215
+ dataset_path = (
216
+ f"{username}/{dataset_name}" if username else dataset_name
217
+ )
218
+
219
+ # Pushing dataset to HuggingFace
220
+ dataset.push_to_hub(
221
+ repo_id=dataset_path,
222
+ token=os.environ.get(self.token_name),
223
+ )
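
A hedged round-trip sketch for the helper above. It assumes the token and username environment variables referenced in default_variables are set; the dataset name "demo-dataset" is illustrative.

import pandas as pd
from src.classes.hugging_face_utils import HuggingFaceHelper

helper = HuggingFaceHelper()
demo_df = pd.DataFrame({"title": ["A title"], "content": ["Some text"]})

# DataFrame -> datasets.Dataset, then push to '<username>/demo-dataset'
demo_dataset = helper.convert_dataframe_to_dataset(input_df=demo_df)
helper.push_dataset(
    dataset=demo_dataset,
    dataset_name="demo-dataset",
    username=helper.username,
)
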
src/classes/semantic_search_engine.py ADDED
@@ -0,0 +1,249 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import logging
24
+ from typing import Dict, Optional
25
+
26
+ import numpy as np
27
+ import pandas as pd
28
+ import torch
29
+ from datasets import Dataset
30
+ from sentence_transformers import SentenceTransformer
31
+
32
+ from src.utils import default_variables as dv
33
+
34
+ __author__ = ["Victor Calderon"]
35
+ __copyright__ = ["Copyright 2023 Victor Calderon"]
36
+ __all__ = ["SemanticSearchEngine"]
37
+
38
+ logger = logging.getLogger(__name__)
39
+ logging.basicConfig(
40
+ level=logging.INFO,
41
+ format="%(asctime)s [%(levelname)s]: %(message)s",
42
+ )
43
+ logger.setLevel(logging.INFO)
44
+
45
+ # --------------------------- CLASS DEFINITIONS -------------------------------
46
+
47
+
48
+ class SemanticSearchEngine(object):
49
+ """
50
+ Class object for running Semantic Search on the input dataset.
51
+ """
52
+
53
+ def __init__(self, **kwargs):
54
+ """
55
+ Class object for running Semantic Search on the input dataset.
56
+ """
57
+ # --- Defining variables
58
+ # Device to use, i.e. CPU or GPU
59
+ self.device = self._get_device()
60
+ # Embedder model to use
61
+ self.model = "paraphrase-mpnet-base-v2"
62
+ # Defining the embedder
63
+ self.embedder = self._get_embedder()
64
+
65
+ # Corpus embeddings
66
+ self.source_colname = kwargs.get(
67
+ "source_colname",
68
+ "summary",
69
+ )
70
+ self.embeddings_colname = kwargs.get(
71
+ "embeddings_colname",
72
+ dv.embeddings_colname,
73
+ )
74
+
75
+ # Variables used for running semantic search
76
+ self.corpus_dataset_with_faiss_index = kwargs.get(
77
+ "corpus_dataset_with_faiss_index"
78
+ )
79
+
80
+ def _get_device(self) -> str:
81
+ """
82
+ Method for determining the device to use.
83
+
84
+ Returns
85
+ ----------
86
+ device_type : str
87
+ Type of device to use (e.g. 'cpu' or 'cuda').
88
+
89
+ Options:
90
+ - ``cpu`` : Uses a CPU.
91
+ - ``cuda`` : Uses a GPU.
92
+ """
93
+ # Determining the type of device to use
94
+ device_type = "cuda" if torch.cuda.is_available() else "cpu"
95
+
96
+ logger.info(f">> Running on a '{device_type.upper()}' device")
97
+
98
+ return device_type
99
+
100
+ def _get_embedder(self):
101
+ """
102
+ Method for extracting the Embedder model.
103
+
104
+ Returns
105
+ ---------
106
+ embedder : model
107
+ Variable corresponding to the embeddings model.
108
+ """
109
+ embedder = SentenceTransformer(self.model)
110
+ embedder.to(self.device)
111
+
112
+ return embedder
113
+
114
+ def generate_corpus_index_and_embeddings(
115
+ self,
116
+ corpus_dataset: Dataset,
117
+ ) -> Dataset:
118
+ """
119
+ Method for generating the Text Embeddings and FAISS indices from
120
+ the input dataset.
121
+
122
+ Parameters
123
+ ------------
124
+ corpus_dataset : datasets.Dataset
125
+ Dataset containing the text to use to create the text
126
+ embeddings and FAISS indices.
127
+
128
+ Returns
129
+ ----------
130
+ corpus_dataset_with_embeddings : datasets.Dataset
131
+ Dataset containing the original data from ``corpus_dataset``
132
+ plus the corresponding text embeddings of the ``source_colname``
133
+ column.
134
+ """
135
+ torch.set_grad_enabled(False)
136
+
137
+ # --- Generate text embeddings for the source column
138
+ corpus_dataset_with_embeddings = corpus_dataset.map(
139
+ lambda corpus: {
140
+ self.embeddings_colname: self.embedder.encode(
141
+ corpus[self.source_colname]
142
+ )
143
+ },
144
+ batched=True,
145
+ desc="Computing Semantic Search Embeddings",
146
+ )
147
+
148
+ # --- Adding FAISS index
149
+ corpus_dataset_with_embeddings.add_faiss_index(
150
+ column=self.embeddings_colname,
151
+ faiss_verbose=True,
152
+ device=None if self.device == "cpu" else 1,
153
+ )
154
+
155
+ return corpus_dataset_with_embeddings
156
+
157
+ def run_semantic_search(
158
+ self,
159
+ query: str,
160
+ top_n: Optional[int] = 5,
161
+ ) -> Dict: # sourcery skip: extract-duplicate-method
162
+ """
163
+ Method for running a semantic search on a query after having
164
+ created the corpus of the text embeddings.
165
+
166
+ Parameters
167
+ --------------
168
+ query : str
169
+ Text query to use for searching the database.
170
+
171
+ top_n : int, optional
172
+ Variable corresponding to the 'Top N' values to return based on the
173
+ similarity score between the input query and the corpus. This
174
+ variable is set to ``5`` by default.
175
+
176
+ Returns
177
+ ---------
178
+ match_results : dict
179
+ Dictionary containing the metadata of each of the articles
180
+ that were in the Top-N in terms of being most similar to the
181
+ input query ``query``.
182
+ """
183
+ # --- Checking input parameters
184
+ # 'query' - Type
185
+ query_type_arr = (str,)
186
+ if not isinstance(query, query_type_arr):
187
+ msg = ">> 'query' ({}) is not a valid input type ({})".format(
188
+ type(query), query_type_arr
189
+ )
190
+ logger.error(msg)
191
+ raise TypeError(msg)
192
+ # 'top_n' - Type
193
+ top_n_type_arr = (int,)
194
+ if not isinstance(top_n, top_n_type_arr):
195
+ msg = ">> 'top_n' ({}) is not a valid input type ({})".format(
196
+ type(top_n), top_n_type_arr
197
+ )
198
+ logger.error(msg)
199
+ raise TypeError(msg)
200
+
201
+ # 'top_n' - Value
202
+ if top_n <= 0:
203
+ msg = f">> 'top_n' ({top_n}) must be larger than '0'!"
204
+ logger.error(msg)
205
+ raise ValueError(msg)
206
+
207
+ # --- Checking that the encoder has been indexed correctly
208
+ if self.corpus_dataset_with_faiss_index is None:
209
+ msg = ">>> The FAISS index was not properly set!"
210
+ logger.error(msg)
211
+ raise ValueError(msg)
212
+
213
+ # --- Encode the input query and extract the embedding
214
+ query_embedding = self.embedder.encode(query)
215
+
216
+ # --- Extracting the top-N results
217
+ (
218
+ scores,
219
+ results,
220
+ ) = self.corpus_dataset_with_faiss_index.get_nearest_examples(
221
+ self.embeddings_colname,
222
+ query_embedding,
223
+ k=top_n,
224
+ )
225
+
226
+ # --- Sorting from highest to lowest
227
+ # NOTE: Convert 'results' into a DataFrame so that the matches can be
228
+ # ranked by their relevance scores
229
+ parsed_results = pd.DataFrame.from_dict(
230
+ data=results,
231
+ orient="columns",
232
+ )
233
+ parsed_results.loc[:, "relevance"] = scores
234
+
235
+ # Sorting in descending order
236
+ parsed_results = parsed_results.sort_values(
237
+ by=["relevance"],
238
+ ascending=False,
239
+ ).reset_index(drop=True)
240
+
241
+ # Casting data type for the 'relevance'
242
+ parsed_results.loc[:, "relevance"] = parsed_results["relevance"].apply(
243
+ lambda x: str(np.round(x, 5))
244
+ )
245
+
246
+ # Only keeping certain columns
247
+ columns_to_keep = ["_id", "title", "relevance", "content"]
248
+
249
+ return parsed_results[columns_to_keep].to_dict(orient="index")
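
End to end, the engine above expects a corpus whose "summary" column feeds the embedder and whose "_id", "title", and "content" columns come back as metadata. A minimal sketch with toy data (the rows are illustrative):

from datasets import Dataset
from src.classes.semantic_search_engine import SemanticSearchEngine

corpus = Dataset.from_dict(
    {
        "_id": ["1", "2"],
        "title": ["Tax policy", "Healthcare"],
        "content": ["Full article text ...", "Full article text ..."],
        "summary": [
            "An article about tax policy.",
            "An article about healthcare.",
        ],
    }
)

engine = SemanticSearchEngine()
# Embed the 'summary' column and attach a FAISS index to the embeddings
engine.corpus_dataset_with_faiss_index = (
    engine.generate_corpus_index_and_embeddings(corpus_dataset=corpus)
)
print(engine.run_semantic_search(query="income taxes", top_n=1))
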
src/data_processing/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
src/data_processing/prepare_dataset.py ADDED
@@ -0,0 +1,196 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ """
24
+ Module for preparing the input dataset.
25
+ """
26
+
27
+ import logging
28
+ from pathlib import Path
29
+ from typing import Dict
30
+
31
+ from src.classes import data_preparation as dp
32
+ from src.utils import default_variables as dv
33
+ from src.utils import general_utilities as gu
34
+
35
+ __author__ = ["Victor Calderon"]
36
+ __copyright__ = ["Copyright 2023 Victor Calderon"]
37
+ __all__ = []
38
+
39
+ logger = logging.getLogger(__name__)
40
+ logging.basicConfig(
41
+ level=logging.INFO,
42
+ format="%(asctime)s [%(levelname)s]: %(message)s",
43
+ )
44
+ logger.setLevel(logging.INFO)
45
+
46
+
47
+ # ---------------------------- PROJECT VARIABLES ------------------------------
48
+
49
+ MODULE_DESCRIPTION = "Module for data preparation"
50
+ MODULE_VERSION = "1.0"
51
+
52
+
53
+ # ----------------------------- INPUT PARAMETERS ------------------------------
54
+
55
+
56
+ def get_parser():
57
+ """
58
+ Function to get the input parameters to the script.
59
+ """
60
+ # Defining the 'parser' object to use
61
+ parser = gu._get_parser_obj(description=MODULE_DESCRIPTION)
62
+
63
+ # Path to the input dataset
64
+ parser.add_argument(
65
+ "--dataset-path",
66
+ dest="dataset_path",
67
+ default=dv.cicero_dataset_url,
68
+ type=str,
69
+ help="""
70
+ Path / URL to the input dataset.
71
+ [Default: '%(default)s']
72
+ """,
73
+ )
74
+
75
+ return parser.parse_args()
76
+
77
+
78
+ # ------------------------------- FUNCTIONS ----------------------------------
79
+
80
+
81
+ def _resolve_input_object_path(object_path: str) -> str:
82
+ """
83
+ Check whether or not the path corresponds to a local file or a URL.
84
+
85
+ Parameters
86
+ -------------
87
+ object_path : str
88
+ Path of the input object.
89
+
90
+ Returns
91
+ ----------
92
+ parsed_object_path : str
93
+ Modified / parsed version of the input object ``object_path``.
94
+
95
+ Raises
96
+ ------------
97
+ TypeError ; Error
98
+ This error gets raised whenever the input object is neither
99
+ a 'file' nor a valid 'url'.
100
+ """
101
+ object_type = gu.check_url_or_file_type(object_path=object_path)
102
+
103
+ if object_type == "unspecified":
104
+ msg = (
105
+ f">>> Unspecified data type for '{object_path}' or does not exist"
106
+ )
107
+ logger.error(msg)
108
+ raise TypeError(msg)
109
+
110
+ return (
111
+ object_path
112
+ if object_type == "url"
113
+ else str(Path(object_path).resolve())
114
+ )
115
+
116
+
117
+ def _temp_create_dataset_with_summaries():
118
+ """
119
+ Function to **temporarily** create the Dataset object in HuggingFace
120
+ using the dataset with summaries for each of the articles.
121
+
122
+ Notes
123
+ --------
124
+ This is a temporary solution UNTIL the ``Summarizer`` is put in place.
125
+ """
126
+ # Path to the dataset
127
+ dataset_filepath = str(
128
+ (
129
+ gu.get_project_paths()
130
+ .get("src")
131
+ .joinpath(
132
+ "utils",
133
+ "gpt35_summaries",
134
+ "df_embed_out2.csv",
135
+ )
136
+ ).resolve()
137
+ )
138
+
139
+ # Reading in dataset
140
+ data_prep_obj = dp.DatasetPrep(dataset_path=dataset_filepath)
141
+
142
+ # Uploading it to HuggingFace Hub
143
+ data_prep_obj.push_dataset_to_hub(
144
+ dataset=data_prep_obj.raw_dataset,
145
+ dataset_name=dv.summaries_dataset_name,
146
+ )
147
+
148
+ return
149
+
150
+
151
+ # ------------------------------ MAIN FUNCTIONS -------------------------------
152
+
153
+
154
+ def main(params_dict: Dict):
155
+ """
156
+ Main function to process the data.
157
+ """
158
+ # Determine if the path corresponds to a file or a URL
159
+ params_dict["object_path"] = _resolve_input_object_path(
160
+ params_dict["dataset_path"]
161
+ )
162
+
163
+ # Showing set of input parameters
164
+ gu.show_params(params_dict=params_dict, logger=logger)
165
+
166
+ # Initializing input parameters
167
+ data_prep_obj = dp.DatasetPrep(dataset_path=params_dict["object_path"])
168
+ data_prep_obj.show_params()
169
+ clean_dataset = data_prep_obj.clean_dataset()
170
+
171
+ logger.info(f"\n>>> Raw dataset: \n{data_prep_obj.raw_dataset}\n")
172
+ logger.info(f"\n>>> Clean dataset: \n{clean_dataset}\n")
173
+
174
+ # --- Pushing datasets to HuggingFace Hub
175
+ # 'Raw' dataset
176
+ data_prep_obj.push_dataset_to_hub(
177
+ dataset=data_prep_obj.raw_dataset,
178
+ dataset_name=dv.raw_dataset_name,
179
+ )
180
+ # 'Clean' dataset
181
+ data_prep_obj.push_dataset_to_hub(
182
+ dataset=clean_dataset,
183
+ dataset_name=dv.clean_dataset_name,
184
+ )
185
+
186
+ # Dataset with summaries
187
+ _temp_create_dataset_with_summaries()
188
+
189
+ return
190
+
191
+
192
+ if __name__ == "__main__":
193
+ # Getting input parameters
194
+ params_dict = vars(get_parser())
195
+ # Running main function
196
+ main(params_dict=params_dict)
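
The module above doubles as a script. A hedged sketch of driving main() programmatically instead of through the CLI defined in get_parser(); the CSV URL is a placeholder.

from src.data_processing.prepare_dataset import main

# Equivalent to:
#   python src/data_processing/prepare_dataset.py \
#       --dataset-path https://example.com/cicero.csv
main(params_dict={"dataset_path": "https://example.com/cicero.csv"})
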
src/focused_summary_example.py ADDED
@@ -0,0 +1,20 @@
1
+ from utils.gpt35_summaries.summarizer import Summarizer
2
+
3
+ s = Summarizer()
4
+ article_text = """
5
+ Margaret Hilda Thatcher, pictured above, was born on this day, October 13, in 1925, and today would have been her birthday. Unfortunately, she died on April 8, 2013, at the age of 87. To honor Baroness Thatcher on her birthday, here is my annual tribute to the Iron Lady, in recognition of her significant contributions during her political career, including serving as the Prime Minister of the UK from 1979 to 1990. Below are some videos, quotations, and related articles to celebrate Prime Minister Margaret Thatcher's birthday and her rich legacy defending liberty and freedom and fighting socialism.

1. The video above features Margaret Thatcher's address to the Conservative Party conference in 1983, when she said: "Let us never forget this fundamental truth: the state has no source of money other than the money people earn themselves. If the state wishes to spend more, it can do so only by borrowing your savings or by taxing you more. There is no such thing as public money; there is only taxpayers' money." A very fundamental truth that is frequently forgotten. Any time you see or hear the terms "public funding," "public funds," "government funding," or "government funds," be sure to substitute "taxpayer funding" and "taxpayer funds."

2. Here are the five reasons Margaret Thatcher is still an inspiration to women, via the Independent Women's Forum on Margaret Thatcher's birthday in 2017, summarized: she didn't use her sex to influence her rise; she challenged the status quo; she had to work for her success; she was a modern woman.

3. The video above is Margaret Thatcher's last House of Commons speech, on November 22, 1990, which is known as Thatcher's last stand against socialism. Here's the full transcript, and here's a highlight: "I think that the hon. Gentleman knows that I have the same contempt for his socialist policies as the people of east Europe, who have experienced them, have for theirs. I think that I must have hit the right nail on the head when I pointed out that the logic of those policies is that they would rather the poor were poorer. Once they start to talk about the gap, they would rather that the gap were [indicating] down here, not [indicating] but [indicating], so long as the gap is smaller. They would rather have the poor poorer. One does not create wealth and opportunity that way. One does not create a property-owning democracy that way."

4. Here are 10 great Margaret Thatcher quotes, which are just as relevant and timely for America today, if not more so, than they were for the UK more than a quarter-century ago. Thanks to Larry Reed for some of these:

1. "The problem with socialism is that you eventually run out of other people's money."

2. "Do you know that one of the great problems of our age is that we are governed by people who care more about feelings than they do about thoughts and ideas."

3. "I think we've been through a period where too many people have been given to understand that if they have a problem, it's the government's job to cope with it: 'I have a problem, I'll get a grant.' 'I'm homeless, the government must house me.' They're casting their problem on society. And, you know, there is no such thing as society. There are individual men and women, and there are families. And no government can do anything except through people, and people must look to themselves first. It's our duty to look after ourselves and then also to look after our neighbor. People have got the entitlements too much in mind, without the obligations. There's no such thing as entitlement, unless someone has first met an obligation."

4. "No one would remember the Good Samaritan if he'd only had good intentions; he had money as well."

5. "The philosophical reason for which we are against nationalization and for private enterprise is because we believe that economic progress comes from the inventiveness, ability, determination and the pioneering spirit of extraordinary men and women. If they cannot exercise that spirit here, they will go away to another free enterprise country, which will then make more economic progress than we do. We ought, in fact, to be encouraging small firms and small companies, because the extent to which innovation comes through these companies is tremendous."

6. "Our challenge is to create the kind of economic background which enables private initiative and private enterprise to flourish for the benefit of the consumer, employee, the pensioner, and society as a whole... I believe we should judge people on merit and not on background. I believe the person who is prepared to work hardest should get the greatest rewards and keep them after tax. That we should back the workers and not the shirkers: that it is not only permissible but praiseworthy to want to benefit your own family by your own efforts."

7. "I place a profound belief, indeed a fervent faith, in the virtues of self-reliance and personal independence. On these is founded the whole case for the free society, for the assertion that human progress is best achieved by offering the freest possible scope for the development of individual talents, qualified only by a respect for the qualities and the freedom of others... For many years there has been a subtle erosion of the essential virtues of the free society. Self-reliance has been sneered at as if it were an absurd suburban pretention. Thrift has been denigrated as if it were greed. The desire of parents to choose and to struggle for what they themselves regarded as the best possible education for their children has been scorned."

8. "What are the lessons, then, that we've learned from the last thirty years? First, that the pursuit of equality itself is a mirage. What's more desirable and more practicable than the pursuit of equality is the pursuit of equality of opportunity. And opportunity means nothing unless it includes the right to be unequal and the freedom to be different. One of the reasons that we value individuals is not because they're all the same, but because they're all different. I believe you have a saying in the Middle West: 'Don't cut down the tall poppies. Let them rather grow tall.' I would say, let our children grow tall, and some taller than others if they have the ability in them to do so. Because we must build a society in which each citizen can develop his full potential, both for his own benefit and for the community as a whole; a society in which originality, skill, energy and thrift are rewarded, in which we encourage rather than restrict the variety and richness of human nature."

9. "Some socialists seem to believe that people should be numbers in a state computer. We believe they should be individuals. We are all unequal. No one, thank heavens, is like anyone else, however much the socialists may pretend otherwise. We believe that everyone has the right to be unequal, but to us every human being is equally important."

10. "There is no such thing as safe socialism. If it's safe, it's not socialism. And if it's socialism, it's not safe. The signposts of socialism point downhill to less freedom, less prosperity, downhill to more muddle, more failure. If we follow them to their destination, they will lead this nation into bankruptcy."

Happy birthday, Margaret Thatcher!

The post "Happy 96th Birthday, Margaret Thatcher" appeared first on the American Enterprise Institute.
6
+ """
7
+ search_string = "most significant contribution of thatcher"
8
+
9
+ print("Focussed summary for search string", f'"{search_string}":')
10
+ print(s._run_model(article_text, search_string))
11
+
12
+ print("Vanilla summary:")
13
+ print(s._run_model(article_text))
14
+
15
+ # Example output:
16
+ # Focussed summary for search string "most significant contribution of thatcher":
17
+ # In commemoration of former British Prime Minister Margaret Thatcher's birthday, which falls on October 13, American Enterprise Institute pays tribute to her political career and significant contributions, including serving as the Prime Minister of the UK from 1979 to 1990. The article features various videos, quotations, and articles that celebrate Thatcher's rich legacy of defending freedom, liberty, and fighting socialism. The article lists several of Thatcher's speeches, quotes, and her last stand against socialism.
18
+ # Overall, the article does not explicitly state Thatcher's most significant contribution, but it heavily implies that Thatcher's significant contributions were her defense of liberty and freedom and her fight against socialism.
19
+ # Vanilla summary:
20
+ # To commemorate Margaret Thatcher's birthday, born on October 13 in 1925, AEI presents an annual tribute to the Iron Lady. Thatcher's contribution during her political career and her rich legacy defending liberty and freedom and fighting socialism are celebrated. The article provides several videos, quotations, and related articles to commemorate the Prime Minister's birthday and her lasting influence as a female leader in the UK. Among Thatcher's inspirational quotes, she advocated for private enterprise as a means of economic progress, and emphasized the importance of individual responsibility, merit, and personal independence in building a free society.
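A hedged sketch of the call pattern exercised above: s is the Summarizer instance presumably constructed at the top of this file (not visible in this excerpt), and _run_model is defined in src/utils/gpt35_summaries/summarizer.py later in this commit. Passing a search string makes _get_prompt_template append a fourth, focusing instruction to the system prompt; omitting it yields the base three-rule prompt.

    focused = s._run_model(article_text, search_string)  # system prompt gains the focus rule
    vanilla = s._run_model(article_text)                 # base three-rule prompt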
src/training/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
src/training/create_faiss_corpus_index.py ADDED
@@ -0,0 +1,209 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ import logging
24
+ from pathlib import Path
25
+ from typing import Dict
26
+
27
+ from src.classes import hugging_face_utils as hf
28
+ from src.classes import semantic_search_engine as ss
29
+ from src.utils import default_variables as dv
30
+ from src.utils import general_utilities as gu
31
+
32
+ __author__ = ["Victor Calderon"]
33
+ __copyright__ = ["Copyright 2023 Victor Calderon"]
34
+ __all__ = []
35
+
36
+ logger = logging.getLogger(__name__)
37
+ logging.basicConfig(
38
+ level=logging.INFO,
39
+ format="%(asctime)s [%(levelname)s]: %(message)s",
40
+ )
41
+ logger.setLevel(logging.INFO)
42
+
43
+ # ---------------------------- PROJECT VARIABLES ------------------------------
44
+
45
+ MODULE_DESCRIPTION = "Module for creating the corpus FAISS index and embeddings"
46
+ MODULE_VERSION = "1.0"
47
+
48
+
49
+ # ----------------------------- INPUT PARAMETERS ------------------------------
50
+
51
+
52
+ def get_parser():
53
+ """
54
+ Function to get the input parameters to the script.
55
+ """
56
+ # Defining the 'parser' object to use
57
+ parser = gu._get_parser_obj(description=MODULE_DESCRIPTION)
58
+
59
+ # Path to the input dataset
60
+ parser.add_argument(
61
+ "--dataset-name",
62
+ dest="dataset_name",
63
+ default=dv.summaries_dataset_name,
64
+ type=str,
65
+ help="""
66
+ Name of the HuggingFace dataset
67
+ [Default: '%(default)s']
68
+ """,
69
+ )
70
+ # Name of the output Dataset with FAISS index and embeddings
71
+ parser.add_argument(
72
+ "--output-dataset-name",
73
+ dest="output_dataset_name",
74
+ default=dv.dataset_faiss_embeddings_name,
75
+ type=str,
76
+ help="""
77
+ Name of the output dataset that will contain a FAISS index and the
78
+ text embeddings of the summaries.
79
+ [Default: '%(default)s']
80
+ """,
81
+ )
82
+ # Name of the HuggingFace repository
83
+ parser.add_argument(
84
+ "--repository-name",
85
+ dest="repository_name",
86
+ default=dv.hugging_face_repository_name,
87
+ type=str,
88
+ help="""
89
+ Name of the HuggingFace repository to use for storing artifacts.
90
+ [Default: '%(default)s']
91
+ """,
92
+ )
93
+ # Name of the FAISS Index
94
+ parser.add_argument(
95
+ "--faiss-index-name",
96
+ dest="faiss_index_name",
97
+ default=dv.faiss_index_name,
98
+ type=str,
99
+ help="""
100
+ Name of the FAISS Index of the output dataset.
101
+ [Default: '%(default)s']
102
+ """,
103
+ )
104
+
105
+ return parser.parse_args()
106
+
107
+
108
+ # ------------------------------- FUNCTIONS ----------------------------------
109
+
110
+
111
+ def create_faiss_index_and_embeddings_from_dataset(params_dict: Dict):
112
+ """
113
+ Function to create a Dataset object with a FAISS index and the
114
+ corresponding text embeddings.
115
+
116
+ Parameters
117
+ -----------
118
+ params_dict : dict
119
+ Dictionary with the set of parameters that are used throughout the project.
120
+ """
121
+ # --- Initializing object for interacting with Datasets
122
+ hf_obj = hf.HuggingFaceHelper()
123
+
124
+ # --- Download dataset from HuggingFace Hub
125
+ dataset_obj = hf_obj.get_dataset_from_hub(
126
+ dataset_name=params_dict["dataset_name"],
127
+ username=hf_obj.username,
128
+ split="train",
129
+ )
130
+
131
+ # --- Generate the FAISS index and Text embeddings
132
+ # Initialize Semantic Search engine
133
+ semantic_search_obj = ss.SemanticSearchEngine()
134
+
135
+ # Create FAISS index and the dataset with text embeddings
136
+ dataset_with_embeddings_obj = (
137
+ semantic_search_obj.generate_corpus_index_and_embeddings(
138
+ corpus_dataset=dataset_obj
139
+ )
140
+ )
141
+
142
+ # --- Extract FAISS index and upload it to HuggingFace Hub
143
+ # Path to the output file that will contain the FAISS index
144
+ faiss_index_local_path = str(
145
+ gu.get_project_paths()["data"].joinpath(
146
+ f'{params_dict["faiss_index_name"]}.faiss'
147
+ )
148
+ )
149
+
150
+ dataset_with_embeddings_obj.save_faiss_index(
151
+ index_name=semantic_search_obj.embeddings_colname,
152
+ file=faiss_index_local_path,
153
+ )
154
+
155
+ # Creating repository in HuggingFace
156
+ repo_name = f'{hf_obj.username}/{params_dict["repository_name"]}'
157
+ repo_type = "dataset"
158
+
159
+ _ = hf_obj.api.create_repo(
160
+ repo_id=repo_name,
161
+ repo_type=repo_type,
162
+ exist_ok=True,
163
+ )
164
+
165
+ # Uploading FAISS
166
+ hf_obj.api.upload_file(
167
+ path_or_fileobj=faiss_index_local_path,
168
+ path_in_repo=Path(faiss_index_local_path).name,
169
+ repo_id=repo_name,
170
+ repo_type=repo_type,
171
+ )
172
+
173
+ # --- Upload new Dataset to HuggingFace
174
+ # Dropping FAISS index
175
+ dataset_with_embeddings_obj.drop_index(
176
+ index_name=semantic_search_obj.embeddings_colname
177
+ )
178
+
179
+ # Pushing dataset to HuggingFace
180
+ hf_obj.push_dataset(
181
+ dataset=dataset_with_embeddings_obj,
182
+ dataset_name=params_dict["output_dataset_name"],
183
+ username=hf_obj.username,
184
+ )
185
+
186
+ return
187
+
188
+
189
+ # ------------------------------ MAIN FUNCTIONS -------------------------------
190
+
191
+
192
+ def main(params_dict: Dict):
193
+ """
194
+ Main function for creating a dataset with a FAISS index.
195
+ """
196
+ # Showing set of input parameters
197
+ gu.show_params(params_dict=params_dict, logger=logger)
198
+
199
+ # Create FAISS index and Text embeddings for the dataset.
200
+ create_faiss_index_and_embeddings_from_dataset(params_dict=params_dict)
201
+
202
+ return
203
+
204
+
205
+ if __name__ == "__main__":
206
+ # Getting input parameters
207
+ params_dict = vars(get_parser())
208
+ # Running main function
209
+ main(params_dict=params_dict)
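A hedged consumer sketch (not part of this commit) showing how the artifacts pushed above could be loaded back and queried with the standard datasets FAISS API. The username placeholder and the 384-dimensional stand-in query vector are assumptions (a real query must be embedded with the same model that SemanticSearchEngine uses); the dataset, column, and index-file names are the defaults from src/utils/default_variables.py.

    import numpy as np
    from datasets import load_dataset

    ds = load_dataset(
        "<username>/cicero_dataset_with_embeddings_and_faiss_index",
        split="train",
    )
    ds.load_faiss_index("embeddings", "cicero_faiss_index.faiss")

    # Stand-in for a real sentence embedding of the search query.
    query = np.random.rand(384).astype(np.float32)
    scores, examples = ds.get_nearest_examples("embeddings", query, k=5)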
src/utils/.DS_Store ADDED
Binary file (6.15 kB).
 
src/utils/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
src/utils/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (182 Bytes).
 
src/utils/__pycache__/default_variables.cpython-39.pyc ADDED
Binary file (1.05 kB).
 
src/utils/default_variables.py ADDED
@@ -0,0 +1,76 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ """
24
+ Module containing the set of default variables of the project.
25
+ """
26
+
27
+ # Option for saving the output data to disk
28
+ save_to_disk = True
29
+
30
+ # URL to the CICERO dataset
31
+ cicero_dataset_url = "https://raw.githubusercontent.com/hamzafarooq/maven-mlsystem-design-cohort-1/main/data/df_embed.csv" # noqa: E501
32
+
36
+ # Name of the column that corresponds to the Document ID
37
+ document_id_colname = "_id"
38
+
39
+ # Name of the column that corresponds to the title of the document.
40
+ title_colname = "title"
41
+
42
+ # Name of the column that contains the content of the document.
43
+ content_colname = "content"
44
+
45
+ # Name of the target column that will contain the parsed / clean version
46
+ # of the document's content.
47
+ clean_content_colname = "clean_content"
48
+
49
+ # Name of the 'raw' dataset
50
+ raw_dataset_name = "cicero_raw_dataset"
51
+
52
+ # Name of the 'clean' dataset
53
+ clean_dataset_name = "cicero_clean_dataset"
54
+
55
+ # Name of the dataset with summaries
56
+ summaries_dataset_name = "cicero_dataset_with_summaries"
57
+
58
+ # Name of the dataset with embeddings and FAISS index
59
+ dataset_faiss_embeddings_name = (
60
+ "cicero_dataset_with_embeddings_and_faiss_index"
61
+ )
62
+
63
+ # Name of the environment variable with the HuggingFace Token
64
+ hugging_face_token_name = "HUGGING_FACE_HUB_TOKEN"
65
+
66
+ # Name of the environment variable with the HuggingFace Username
67
+ hugging_face_username_name = "HUGGING_FACE_USERNAME"
68
+
69
+ # Name of the HuggingFace repository
70
+ hugging_face_repository_name = "cicero_synthesizer"
71
+
72
+ # Name of the FAISS Index
73
+ faiss_index_name = "cicero_faiss_index"
74
+
75
+ # Name of the column that contains the embedding in the dataset
76
+ embeddings_colname = "embeddings"
src/utils/general_utilities.py ADDED
@@ -0,0 +1,181 @@
1
+ # MIT License
2
+ #
3
+ # Copyright (c) 2023 Victor Calderon
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ """
24
+ Module that includes general utility functions.
25
+ """
26
+
27
+ import argparse
28
+ import logging
29
+ import re
30
+ from argparse import ArgumentParser as _ArgumentParser
31
+ from argparse import HelpFormatter as _HelpFormatter
32
+ from operator import attrgetter as _attrgetter
33
+ from pathlib import Path
34
+ from typing import Dict, List, Optional, Union
35
+
36
+ import numpy as np
37
+
38
+ logger = logging.getLogger(__name__)
39
+ logging.basicConfig(level=logging.INFO)
40
+ logger.setLevel(level=logging.INFO)
41
+
42
+
43
+ __all__ = ["get_project_paths"]
44
+
45
+
46
+ def _get_root_dir():
47
+ """
48
+ Function for determining the path to the root directory of the project.
49
+
50
+ Returns
51
+ ----------
52
+ root_dir : str
53
+ Path to the root directory of the project.
54
+ """
55
+
56
+ return str(list(Path(__file__).resolve().parents)[2].resolve())
57
+
58
+
59
+ def get_project_paths() -> Dict[str, Path]:
60
+ """
61
+ Function to extract the set of directories of the project.
62
+
63
+ Returns
64
+ ----------
65
+ proj_dict : dict
66
+ Dictionary containing the path to the project's directories.
67
+ """
68
+ # --- Defining set of directories
69
+ # Base directory
70
+ base_dir = Path(_get_root_dir())
71
+ # Data directory
72
+ data_dir = base_dir.joinpath("data").resolve()
73
+ # Source directory / Codebase
74
+ src_dir = base_dir.joinpath("src").resolve()
75
+
76
+ # --- Creating project dictionary with the project directories
77
+ proj_dict = {
78
+ "base": base_dir,
79
+ "data": data_dir,
80
+ "src": src_dir,
81
+ }
82
+
83
+ # --- Making sure the directories exist
84
+ for directory in proj_dict.values():
85
+ directory.mkdir(
86
+ exist_ok=True,
87
+ parents=True,
88
+ )
89
+
90
+ return proj_dict
91
+
92
+
93
+ def is_float(s: str):
94
+ """
95
+ Function that checks whether or not ``s`` represents a float.
96
+ """
97
+ return s.count(".") == 1 and s.replace(".", "").isdigit()
98
+
99
+
100
+ def _str2bool(v):
101
+ if v.lower() in ("yes", "true", "t", "y", "1"):
102
+ return True
103
+ elif v.lower() in ("no", "false", "f", "n", "0"):
104
+ return False
105
+ else:
106
+ raise argparse.ArgumentTypeError("Boolean value expected.")
107
+
108
+
109
+ class SortingHelpFormatter(_HelpFormatter):
110
+ def add_arguments(self, actions):
111
+ """
112
+ Modifier for `argparse` help parameters, that sorts them alphabetically
113
+ """
114
+ actions = sorted(actions, key=_attrgetter("option_strings"))
115
+ super(SortingHelpFormatter, self).add_arguments(actions)
116
+
117
+
118
+ def _get_parser_obj(description: str):
119
+ """
120
+ Function to create an 'argparse' ``parser`` object.
121
+ """
122
+
123
+ return _ArgumentParser(
124
+ description=description,
125
+ formatter_class=SortingHelpFormatter,
126
+ )
127
+
128
+
129
+ def show_params(
130
+ params_dict: Dict,
131
+ logger: logging.Logger,
132
+ columns_to_omit: Optional[Union[List, None]] = None,
133
+ ):
134
+ """
135
+ Function to display the defined set of input parameters.
136
+ """
137
+ # Checking input parameters
138
+ columns_to_omit = columns_to_omit or []
139
+ #
140
+ msg = "\n" + "-" * 50 + "\n"
141
+ msg += "\t---- INPUT PARAMETERS ----" + "\n"
142
+ msg += "" + "\n"
143
+ # Sorting keys of dictionary
144
+ keys_sorted = np.sort(list(params_dict.keys()))
145
+ for key_ii in keys_sorted:
146
+ if key_ii not in columns_to_omit:
147
+ msg += f"\t>>> {key_ii} : {params_dict[key_ii]}\n"
148
+ #
149
+ msg += "\n" + "-" * 50 + "\n"
150
+ logger.info(msg)
151
+
152
+ return
153
+
154
+
155
+ def check_url_or_file_type(object_path: str) -> str:
156
+ """
157
+ Function to determine whether the input variable is a file or a URL.
158
+
159
+ Parameters
160
+ ------------
161
+ object_path : str
162
+ Path to the object.
163
+
164
+ Returns
165
+ ------------
166
+ object_type : str
167
+ Type of the object.
168
+
169
+ Options :
170
+ - `url` : The object is a valid URL
171
+ - `file` : The object corresponds to a local file.
172
+ - `unspecified` : This object is neither a file nor a URL.
173
+ """
174
+ # Set of regular expressions for each type
175
+ url_pattern = r"^https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/?\S*$"
176
+
177
+ if re.match(url_pattern, object_path):
178
+ return "url"
179
+
180
+ # Checking if 'object_path' is a file or directory
181
+ return "file" if Path(object_path).is_file() else "unspecified"
src/utils/gpt35_summaries/__init__.py ADDED
File without changes
src/utils/gpt35_summaries/cleanup_and_summarize.py ADDED
@@ -0,0 +1,107 @@
1
+ # sourcery skip: use-named-expression
2
+ import csv
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ from .summarizer import Summarizer
7
+
8
+ P_SCRIPT_DIR = Path(__file__).parent
9
+
10
+ csv.field_size_limit(sys.maxsize)
11
+
12
+
13
+ HTML_STRINGS = []
14
+ with open(P_SCRIPT_DIR / "html_tags.txt") as f:
15
+ for line in f:
16
+ tag = line.strip()
17
+ if tag:
18
+ HTML_STRINGS.append(tag)
19
+ HTML_STRINGS.extend(("target_blank", "relnoopen", "relnofollow"))
20
+ HTML_STRINGS = tuple(HTML_STRINGS)
21
+
22
+ WORDS = set()
23
+ with open(P_SCRIPT_DIR / "words_alpha.txt") as f:
24
+ for line in f:
25
+ word = line.strip()
26
+ if word:
27
+ WORDS.add(word)
28
+
29
+
30
+ def filter_content(content):
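+ # For tokens that are not plain English words, repeatedly strip the
+ # longest matching HTML-tag fragment from both ends; tokens reduced to
+ # nothing are dropped, and recognized words pass through unchanged.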
31
+ c_words = content.split()
32
+ c_filt_words = []
33
+ for w in c_words:
34
+ if w not in WORDS:
35
+ while w.startswith(HTML_STRINGS):
36
+ smax = ""
37
+ for s in HTML_STRINGS:
38
+ if w.startswith(s) and len(s) > len(smax):
39
+ smax = s
40
+ w = w[len(smax) :] # noqa: E203
41
+ while w.endswith(HTML_STRINGS):
42
+ smax = ""
43
+ for s in HTML_STRINGS:
44
+ if w.endswith(s) and len(s) > len(smax):
45
+ smax = s
46
+ w = w[len(smax) :] # noqa: E203
47
+ if w:
48
+ c_filt_words.append(w)
49
+ return " ".join(c_filt_words)
50
+
51
+
52
+ def main():
53
+ DF_EMBED_OUT_DICT = {}
54
+ if (P_SCRIPT_DIR / "df_embed_out.csv").exists():
55
+ with open(
56
+ P_SCRIPT_DIR / "df_embed_out.csv",
57
+ encoding="ascii",
58
+ errors="ignore",
59
+ ) as fin:
60
+ for csv_row in csv.DictReader(fin):
61
+ DF_EMBED_OUT_DICT[csv_row["_id"]] = csv_row
62
+
63
+ SUMMARIZER = Summarizer()
64
+
65
+ with open(
66
+ P_SCRIPT_DIR / "df_embed.csv", encoding="ascii", errors="ignore"
67
+ ) as fin, open(
68
+ P_SCRIPT_DIR / "df_embed_out.csv",
69
+ "w",
70
+ encoding="ascii",
71
+ errors="ignore",
72
+ ) as fout:
73
+ csv_reader = csv.DictReader(fin)
74
+ fieldnames = csv_reader.fieldnames[:]
75
+ fieldnames.append("summary")
76
+ fieldnames.append("content_filtered")
77
+ csv_writer = csv.DictWriter(fout, fieldnames)
78
+ csv_writer.writeheader()
79
+ for csv_row in csv_reader:
80
+ if csv_row["_id"] in DF_EMBED_OUT_DICT:
81
+ print("Re-using existing data for", csv_row["_id"])
82
+ csv_row = DF_EMBED_OUT_DICT[csv_row["_id"]]
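+ # Malformed rows have an empty title and content and (apparently)
+ # carry the article text in the `_id` column; treat `_id` as content.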
83
+ if not csv_row["title"] and not csv_row["content"]:
84
+ csv_row["content"] = csv_row["_id"]
85
+ csv_row["_id"] = ""
86
+ content_filtered = filter_content(csv_row["content"])
87
+ print(content_filtered)
88
+ csv_row["content_filtered"] = content_filtered
89
+ # input()
90
+ if not csv_row.get("summary") and (
91
+ csv_row["title"] or csv_row["content_filtered"]
92
+ ):
93
+ print("Running GPT...\n")
94
+ while True:
95
+ summary = SUMMARIZER.summarize(
96
+ csv_row["title"], content_filtered
97
+ )
98
+ if summary:
99
+ break
100
+ # input()
101
+ csv_row["summary"] = summary
102
+ csv_writer.writerow(csv_row)
103
+ fout.flush()
104
+
105
+
106
+ if __name__ == "__main__":
107
+ main()
src/utils/gpt35_summaries/html_tags.txt ADDED
@@ -0,0 +1,109 @@
1
+ a
2
+ abbr
3
+ address
4
+ area
5
+ article
6
+ aside
7
+ audio
8
+ b
9
+ base
10
+ bdi
11
+ bdo
12
+ blockquote
13
+ body
14
+ br
15
+ button
16
+ canvas
17
+ caption
18
+ cite
19
+ code
20
+ col
21
+ colgroup
22
+ data
23
+ datalist
24
+ dd
25
+ del
26
+ details
27
+ dfn
28
+ dialog
29
+ div
30
+ dl
31
+ dt
32
+ em
33
+ embed
34
+ fieldset
35
+ figcaption
36
+ figure
37
+ footer
38
+ form
39
+ h1
40
+ h2
41
+ h3
42
+ h4
43
+ h5
44
+ h6
45
+ head
46
+ header
47
+ hr
48
+ html
49
+ i
50
+ iframe
51
+ img
52
+ input
53
+ ins
54
+ kbd
55
+ label
56
+ legend
57
+ li
58
+ link
59
+ main
60
+ map
61
+ mark
62
+ meta
63
+ meter
64
+ nav
65
+ noscript
66
+ object
67
+ ol
68
+ optgroup
69
+ option
70
+ output
71
+ p
72
+ param
73
+ picture
74
+ pre
75
+ progress
76
+ q
77
+ rp
78
+ rt
79
+ ruby
80
+ s
81
+ samp
82
+ script
83
+ section
84
+ select
85
+ small
86
+ source
87
+ span
88
+ strong
89
+ style
90
+ sub
91
+ summary
92
+ sup
93
+ table
94
+ tbody
95
+ td
96
+ template
97
+ textarea
98
+ tfoot
99
+ th
100
+ thead
101
+ time
102
+ title
103
+ tr
104
+ track
105
+ u
106
+ ul
107
+ var
108
+ video
109
+ wbr
src/utils/gpt35_summaries/summarizer.py ADDED
@@ -0,0 +1,157 @@
1
+ import json
2
+ import os
3
+ import sys
4
+ from typing import Optional
5
+
6
+ import requests
7
+ import tiktoken
8
+
9
+
10
+ class Summarizer:
11
+ def __init__(self, **kwargs):
12
+ self.openai_endpoint = "https://api.openai.com/v1/chat/completions"
13
+
14
+ # Prompt template
15
+ self.prompt_template = self._get_prompt_template()
16
+
17
+ # Type of model to use
18
+ self.model = kwargs.get("model", "gpt-3.5-turbo")
19
+
20
+ # Model hyperparameters
21
+ self.max_tokens = kwargs.get("max_tokens", 4096)
22
+ self.result_tokens = kwargs.get("result_tokens", 300)
23
+
24
+ # Model encoding
25
+ self.model_encoding = self._get_model_encoding()
26
+
27
+ # Token length of the prompt template
28
+ self.prompt_token_length = self._get_number_of_tokens(
29
+ self.prompt_template
30
+ )
31
+
32
+ def _get_prompt_template(self, search_string=None) -> str:
33
+ # Defining the template to use
34
+ template_text = """
35
+ Create a concise, clear, and in-depth summary of the following online
36
+ article. Adhere to the following guidelines:
37
+
38
+ 1. Sound professional and detached, and avoid emotionally charged language.
39
+ 2. Make sure to describe who is discussed in the article, what the
40
+ events or concepts are, when things happened, and, if this information is
41
+ available, why.
42
+ 3. The summary should be between one and three paragraphs.
43
+ """
44
+ if search_string:
45
+ template_text += f"""
46
+ 4. Make sure to include and emphasize any information in the article that
47
+ relates to the following search string:
48
+ "{search_string}"
49
+ """
50
+
51
+ return template_text
52
+
53
+ def _get_model_encoding(self):
54
+ return tiktoken.encoding_for_model(self.model)
55
+
56
+ def _get_number_of_tokens(self, input_text: str) -> int:
57
+ """
58
+ Method for determining the number of tokens of the input text.
59
+
60
+ Parameters
61
+ -----------
62
+ input_text : str
63
+ Text to use for calculating its token length.
64
+
65
+ Returns
66
+ ---------
67
+ text_token_length : int
68
+ Length of the tokens of the input text.
69
+ """
70
+
71
+ return len(self.model_encoding.encode(input_text))
72
+
73
+ def _run_model(
74
+ self,
75
+ user_content: str,
76
+ search_string: Optional[str] = None,
77
+ temperature: Optional[float] = 1,
78
+ ):
79
+ """
80
+ Method for running the model that will create the summary for a given
81
+ observation.
82
+
83
+ Parameters
84
+ ------------
85
+ user_content : str
86
+ Content by the user that will be sent to the model via its API.
87
+
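+ search_string : str, optional
+ Search string used to focus the summary on a specific topic. This
+ variable is set to ``None`` by default.
+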
88
+ temperature : float, optional
89
+ Amount of ``temperature`` to give to the model. This parameter
90
+ handles the amount of creativity that the model can have when
91
+ creating the output response. This variable is set to ``1`` by
92
+ default.
93
+
94
+ Returns
95
+ ----------
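+ summary : str
+ Content of the message returned by the model's API.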
96
+ """
97
+ # Creating the headers
98
+ headers = {
99
+ "Content-Type": "application/json",
100
+ "Authorization": f'Bearer {os.environ["OPENAI_API_KEY"]}',
101
+ }
102
+ # Composing the input messages
103
+ messages = [
104
+ {
105
+ "role": "system",
106
+ "content": self._get_prompt_template(search_string),
107
+ },
108
+ {"role": "user", "content": user_content},
109
+ ]
110
+ # Parsing the request data
111
+ request_data = {
112
+ "model": self.model,
113
+ "messages": messages,
114
+ "temperature": temperature,
115
+ }
116
+ # Extracting the response from the model's API
117
+ response = requests.post(
118
+ self.openai_endpoint,
119
+ headers=headers,
120
+ data=json.dumps(request_data),
121
+ timeout=60,
122
+ )
123
+
124
+ # Checking if the response was OK
125
+ if response.status_code == 200:
126
+ return response.json()["choices"][0]["message"]["content"]
127
+ else:
128
+ raise RuntimeError(
129
+ f"HTTP request failed {response.status_code}, {response.text}"
130
+ )
131
+
132
+ def summarize(self, title, content, search_string=None):
133
+ content_for_summary = f"{title}\n\n{content}"
134
+ prompt_token_length = (
135
+ self.prompt_token_length
136
+ if not search_string
137
+ else self._get_number_of_tokens(
138
+ self._get_prompt_template(search_string)
139
+ )
140
+ )
141
+ data_token_length = self._get_number_of_tokens(content_for_summary)
142
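+ # While the prompt plus the article would overflow the model's context
+ # window (keeping a ~10-token safety margin), thin the article out.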
+ while data_token_length + prompt_token_length > self.max_tokens - 10:
143
+ print("Decimating the content.")
144
+ content = content.split()
145
+ del content[::10]
146
+ content = " ".join(content)
147
+ content_for_summary = f"{title}\n\n{content}"
148
+ data_token_length = self._get_number_of_tokens(content_for_summary)
149
+
150
+ while True:
151
+ try:
152
+ return self._run_model(
153
+ user_content=content_for_summary,
154
+ search_string=search_string,
155
+ )
156
+ except Exception as e:
157
+ print(e, file=sys.stderr)
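A hedged usage sketch for the class above; it requires OPENAI_API_KEY in the environment and network access, and the title/content strings below are placeholders.

    s = Summarizer()  # defaults: gpt-3.5-turbo with a 4096-token budget
    summary = s.summarize(
        title="Happy 96th Birthday, Margaret Thatcher",
        content="<cleaned article text>",
        search_string="most significant contribution of thatcher",
    )
    print(summary)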
template.envrc ADDED
@@ -0,0 +1,15 @@
1
+ # -------------------- Defining default environment ---------------------------
2
+
3
+ # --- Docker BuildKit
4
+ export DOCKER_BUILDKIT_VALUE=1
5
+
6
+ # --- Project variables
7
+ export INPUT_APP_PORT=8501
8
+ export OUTPUT_APP_PORT=8501
9
+ export APP_SERVER_PORT=7860
10
+
11
+ export HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
12
+ export HUGGING_FACE_USERNAME=${HUGGING_FACE_USERNAME}
13
+
14
+ export PATH="${PWD}:${PATH}"
15
+ export PYTHONPATH="${PWD}:${PYTHONPATH}"