Adding files from Github repository.
- .envrc +15 -0
- .gitattributes +3 -0
- .github/workflows/code-linting.yml +22 -0
- .gitignore +168 -0
- .isort.cfg +5 -0
- .pre-commit-config.yaml +57 -0
- .python-version +1 -0
- .tmuxgo +32 -0
- Dockerfile +104 -0
- LICENSE +21 -0
- LICENSE.rst +21 -0
- Makefile +485 -0
- README.md +282 -11
- data/cicero_faiss_index.faiss +3 -0
- data/clean_dataset.csv +3 -0
- data/raw_dataset.csv +3 -0
- docker/aliases.sh +20 -0
- docker/docker-compose.yaml +71 -0
- pyproject.toml +34 -0
- requirements-deploy.txt +1 -0
- requirements-dev.txt +12 -0
- requirements.txt +14 -0
- src/.DS_Store +0 -0
- src/api/__init__.py +21 -0
- src/api/index.py +182 -0
- src/app_service/__init__.py +21 -0
- src/app_service/app.py +167 -0
- src/classes/__init__.py +21 -0
- src/classes/__pycache__/__init__.cpython-39.pyc +0 -0
- src/classes/__pycache__/hugging_face_utils.cpython-39.pyc +0 -0
- src/classes/__pycache__/semantic_search_engine.cpython-39.pyc +0 -0
- src/classes/data_preparation.py +403 -0
- src/classes/hugging_face_utils.py +223 -0
- src/classes/semantic_search_engine.py +249 -0
- src/data_processing/__init__.py +21 -0
- src/data_processing/prepare_dataset.py +196 -0
- src/focused_summary_example.py +20 -0
- src/training/__init__.py +21 -0
- src/training/create_faiss_corpus_index.py +209 -0
- src/utils/.DS_Store +0 -0
- src/utils/__init__.py +21 -0
- src/utils/__pycache__/__init__.cpython-39.pyc +0 -0
- src/utils/__pycache__/default_variables.cpython-39.pyc +0 -0
- src/utils/default_variables.py +76 -0
- src/utils/general_utilities.py +181 -0
- src/utils/gpt35_summaries/__init__.py +0 -0
- src/utils/gpt35_summaries/cleanup_and_summarize.py +107 -0
- src/utils/gpt35_summaries/html_tags.txt +109 -0
- src/utils/gpt35_summaries/summarizer.py +157 -0
- template.envrc +15 -0
.envrc
ADDED
@@ -0,0 +1,15 @@
+# -------------------- Defining default environment ---------------------------
+
+# --- Docker BuildKit
+export DOCKER_BUILDKIT_VALUE=1
+
+# --- Project variables
+export INPUT_APP_PORT=8501
+export OUTPUT_APP_PORT=8501
+export APP_SERVER_PORT=7860
+
+export HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
+export HUGGING_FACE_USERNAME=${HUGGING_FACE_USERNAME}
+
+export PATH="${PWD}:${PATH}"
+export PYTHONPATH="${PWD}:${PYTHONPATH}"
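These exports only take effect once `direnv` approves the file, which is also why the repository's setup instructions call `direnv allow` explicitly. A minimal check, assuming `direnv` is installed and hooked into the shell:

```bash
# Approve the .envrc so direnv loads it on cd (required again after every edit)
direnv allow

# Confirm the project variables are now exported in the current shell
echo "${APP_SERVER_PORT}"   # expected: 7860
echo "${INPUT_APP_PORT}"    # expected: 8501
```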
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/cicero_faiss_index.faiss filter=lfs diff=lfs merge=lfs -text
+data/clean_dataset.csv filter=lfs diff=lfs merge=lfs -text
+data/raw_dataset.csv filter=lfs diff=lfs merge=lfs -text
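The three new patterns route the large data files through Git LFS instead of storing their contents in the Git history. A quick way to verify the tracking from a clone, assuming the `git-lfs` client is installed:

```bash
# List the patterns currently routed through the LFS filters
git lfs track

# Show which committed files are stored as LFS pointers
git lfs ls-files
```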
.github/workflows/code-linting.yml
ADDED
@@ -0,0 +1,22 @@
+name: Project CICD
+run-name: ${{ github.actor }} - CICD
+on: [push]
+
+jobs:
+  #
+  # --- Code-linting
+  lint-code:
+    runs-on: ubuntu-latest
+    steps:
+      # Checkout repository
+      - uses: actions/checkout@v3
+      # Install Python
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.9"
+      # Install python dependencies
+      - name: Install dependencies
+        run: |
+          make requirements
+          make pre-commit-install
+          make lint
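The workflow installs Python 3.9 and then delegates everything to the repository's own Make targets, so the CI check can be reproduced locally before pushing:

```bash
# Same three steps the GitHub Actions job executes
make requirements
make pre-commit-install
make lint
```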
.gitignore
ADDED
@@ -0,0 +1,168 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+.envrc
+.python-version
+data/
+
+src/utils/gpt35_summaries/df_embed.csv
+src/utils/gpt35_summaries/df_embed_out2.csv
+src/utils/gpt35_summaries/words_alpha.txt
.isort.cfg
ADDED
@@ -0,0 +1,5 @@
+[settings]
+line_length = 79
+multi_line_output = 3
+include_trailing_comma = True
+known_third_party =datasets,fastapi,gradio,huggingface_hub,numpy,pandas,pydantic,requests,sentence_transformers,spacy,tiktoken,torch,utils
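With this configuration, isort wraps long import lists in the vertical hanging-indent style (`multi_line_output = 3`) at 79 characters. A sketch of applying it by hand, assuming isort is installed in the active environment (the pre-commit hook below normally does this automatically):

```bash
# isort discovers .isort.cfg from the repository root on its own
isort src/
```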
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,57 @@
+# Created       : 2023-03-01
+# Last Modified : 2023-03-04
+#
+# Description
+# This file summarizes the set of checks that pre-commit will perform
+# prior to any commit.
+
+default_stages: [commit, manual]
+
+# Repositories to use
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.4.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml # Checks yaml files for parseable syntax.
+      - id: check-json # Checks json files for parseable syntax.
+      - id: check-added-large-files
+      - id: check-toml
+      - id: check-docstring-first
+      - id: check-case-conflict # Check for files that would conflict in case-insensitive filesystems
+      - id: check-merge-conflict # Check for files that contain merge conflict strings.
+      - id: debug-statements # Check for debugger imports and py37+ `breakpoint()` calls in python source.
+  - repo: https://github.com/pycqa/flake8
+    rev: 5.0.4
+    hooks:
+      - id: flake8
+        language_version: python3.9
+        exclude: >
+          (?x)^(
+              src/focused_summary_example.py
+          )
+
+  - repo: https://github.com/ambv/black
+    rev: 22.3.0
+    hooks:
+      - id: black
+        language_version: python3.9
+
+  - repo: https://github.com/asottile/seed-isort-config
+    rev: v2.2.0
+    hooks:
+      - id: seed-isort-config
+
+  - repo: https://github.com/pycqa/isort
+    rev: 5.11.5
+    hooks:
+      - id: isort
+        name: isort (python)
+        exclude: hooks.py
+      - id: isort
+        name: isort (cython)
+        types: [cython]
+      - id: isort
+        name: isort (pyi)
+        types: [pyi]
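Once the hook is installed, these checks run on every commit; they can also be run on demand. The two commands below mirror the `pre-commit-install` and `lint` Make targets:

```bash
# Install the Git hook (what `make pre-commit-install` runs)
pre-commit install

# Run every hook against all files, exactly as `make lint` does
pre-commit run -a --hook-stage manual
```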
.python-version
ADDED
@@ -0,0 +1 @@
+Cicero_LLM_Synthesizer
.tmuxgo
ADDED
@@ -0,0 +1,32 @@
+#!/bin/bash
+SESSION=`basename $PWD`
+ENVNAME='ml'
+
+# Creating new session
+tmux -2 new-session -d -s $SESSION `cd $PWD`
+
+# ------ Main window ------
+# Renaming window
+tmux rename-window -t $SESSION:0 main
+# Splitting panes and windows
+tmux split-window -v
+tmux select-pane -t 0
+tmux resize-pane -D 15
+tmux select-pane -t 1
+# Sending commands
+tmux send -t $SESSION:0.1 "cd $PWD; conda activate $ENVNAME; source $SHELL ;clear; htop" ENTER
+#
+# ------ Miscellaneous window ------
+tmux new-window -t $SESSION:1 -n 'misc'
+tmux send -t $SESSION:1.0 "cd $PWD; conda activate $ENVNAME; source $SHELL ; clear;" ENTER
+# ------ Extras window ------
+tmux new-window -t $SESSION:2 -n 'extras'
+tmux send -t $SESSION:2.0 "cd $PWD; conda activate $ENVNAME; source $SHELL ; clear;" ENTER
+# ------ Jupyter window ------
+tmux new-window -t $SESSION:3 -n 'jupyter'
+tmux send -t $SESSION:3.0 "cd $PWD; conda activate $ENVNAME; source $SHELL ; clear;" ENTER
+#
+# Selecting which window to start at
+tmux select-window -t $SESSION:0
+tmux select-pane -t 0
+tmux -2 attach -t $SESSION
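The script names the tmux session after the current directory, so it is meant to be launched from the project root. A usage sketch, assuming tmux and a conda environment named `ml` are available:

```bash
# Start the four-window session (main/htop, misc, extras, jupyter)
bash .tmuxgo

# Detach and re-attach later; the session name is the directory's basename
tmux attach -t "$(basename "$PWD")"
```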
Dockerfile
ADDED
@@ -0,0 +1,104 @@
+ARG PYTHON_VERSION="3.9.13"
+ARG PLATFORM_NAME="linux/amd64"
+
+FROM --platform=${PLATFORM_NAME} python:${PYTHON_VERSION}
+
+# --- SYSTEM ARCHITECTURE
+ARG TARGETPLATFORM
+ARG TARGETARCH
+ARG TARGETVARIANT
+
+RUN printf "I'm building for TARGETPLATFORM=${TARGETPLATFORM}" \
+    && printf ", TARGETARCH=${TARGETARCH}" \
+    && printf ", TARGETVARIANT=${TARGETVARIANT} \n" \
+    && printf "With uname -s : " && uname -s \
+    && printf "and uname -m : " && uname -mm
+
+# --- Environment variables
+ENV REQUIREMENTS_FILE="requirements.txt"
+ENV OUTDIR="/root"
+ENV PROJECT_DIR="/opt/ml"
+ENV PROGRAM_DIR="/opt/program"
+ENV HOME_DIR="/root/ml"
+ENV LOCAL_DEV_DIR="docker"
+ENV ALIASES_FILE="/root/aliases.sh"
+ENV DEBIAN_FRONTEND=noninteractive
+
+# --- Dockerfile Metadata
+LABEL Maintainer="Victor Calderon"
+
+# ------------------------- COPYING AND DIRECTORIES ---------------------------
+
+RUN mkdir -p ${HOME_DIR}
+
+COPY ./src ${PROJECT_DIR}/src
+COPY ${LOCAL_DEV_DIR}/aliases.sh ${ALIASES_FILE}
+
+COPY ${REQUIREMENTS_FILE} "${HOME_DIR}/${REQUIREMENTS_FILE}"
+
+# ---------------------- EXPOSING PORTS FOR APP -------------------------------
+
+EXPOSE 7860
+EXPOSE 8501
+
+# --------------------- INSTALLING EXTRA PACKAGES -----------------------------
+# --- Updating packages and installing packages at the system-level
+
+RUN apt-get -y update && \
+    apt-get upgrade -y && \
+    apt-get clean && \
+    # Installing system-level packages
+    apt-get install -y \
+    git \
+    ssh \
+    tree \
+    git-flow \
+    tmux \
+    direnv \
+    bash-completion \
+    zsh \
+    htop \
+    vim \
+    && \
+    # Cleaning out
+    rm -rf /var/lib/apt/lists/* && \
+    # Cleaning installs
+    apt-get clean && \
+    # Installing ZSH and OhZSH
+    sh -c "$(curl -fsSL https://raw.github.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" && \
+    echo "source /etc/profile.d/bash_completion.sh" >> /root/.bashrc && \
+    echo "source /etc/profile.d/bash_completion.sh" >> /root/.zshrc && \
+    echo "source /root/aliases.sh" >> "${OUTDIR}/.zshrc" && \
+    echo "source /root/aliases.sh" >> "${OUTDIR}/.bashrc" && \
+    # Install direnv
+    echo 'eval "$(direnv hook zsh)"' >> "${OUTDIR}/.zshrc" && \
+    echo 'eval "$(direnv hook bash)"' >> "${OUTDIR}/.bash"
+
+# -------------------------- DOCKER-SPECIFIC ----------------------------------
+
+RUN apt-get update -y && \
+    cd ${OUTDIR_DOCKER} && \
+    curl -fsSL https://get.docker.com -o get-docker.sh && sh get-docker.sh
+
+# --------------------------- PYTHON-RELATED-LOCAL ----------------------------
+
+RUN pip install --upgrade pip && \
+    python -m pip install -r "${HOME_DIR}/${REQUIREMENTS_FILE}"
+
+# ----------------------------- PYTHON-SPECIFIC -------------------------------
+
+# Set some environment variables. PYTHONUNBUFFERED keeps Python from
+# buffering our standard output stream, which means that logs can be
+# delivered to the user quickly. PYTHONDONTWRITEBYTECODE keeps Python
+# from writing the .pyc files which are unnecessary in this case. We also
+# update PATH so that the train and serve programs are found when the
+# container is invoked.
+
+ENV PYTHONUNBUFFERED=TRUE
+ENV PYTHONDONTWRITEBYTECODE=TRUE
+ENV PATH="${PROGRAM_DIR}:${PATH}"
+ENV PYTHONPATH="${PROGRAM_DIR}:${PYTHONPATH}"
+
+WORKDIR ${PROJECT_DIR}
+
+CMD ["uvicorn", "src.api.index:app", "--host", "0.0.0.0","--port", "7860"]
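Because the Python version and platform are declared as build arguments, the image can be rebuilt for a different target without editing the file. A hedged sketch of a manual build and run (the compose file and Make targets below wrap this instead; the tag name here is illustrative):

```bash
# Build the image; both --build-arg flags just override the declared defaults
docker build \
    --build-arg PYTHON_VERSION="3.9.13" \
    --build-arg PLATFORM_NAME="linux/amd64" \
    -t cicero-synthesizer-api .

# Run the FastAPI endpoint declared in CMD on the exposed port
docker run -p 7860:7860 cicero-synthesizer-api
```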
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Maven-Building-LLMS-into-Production
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
LICENSE.rst
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Victor Calderon
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
Makefile
ADDED
@@ -0,0 +1,485 @@
+.PHONY: show-params
+
+###############################################################################
+#                                   GLOBALS                                   #
+###############################################################################
+
+PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+PROJECT_NAME := $(shell basename $(subst -,_,$(PROJECT_DIR)))
+PROJECT_NAME_LOWER := $(shell echo $(PROJECT_NAME) | tr '[:upper:]' '[:lower:]')
+ENVIRONMENT_NAME = $(PROJECT_NAME)
+PYTHON_INTERPRETER = python3
+PIP_INTERPRETER = pip
+PYTHON_VERSION = 3.9
+PIP_VERSION = 22.3
+
+# --- REQUIREMENTS-RELATED
+REQUIREMENTS_FILE = $(PROJECT_DIR)/requirements.txt
+REQUIREMENTS_FILE_TEMP = $(PROJECT_DIR)/requirements.tmp
+REQUIREMENTS_DEV_FILE = $(PROJECT_DIR)/requirements-dev.txt
+REQUIREMENTS_DEV_FILE_TEMP = $(PROJECT_DIR)/requirements-dev.tmp
+REQUIREMENTS_DEPLOYMENT_FILE = $(PROJECT_DIR)/requirements-deploy.txt
+REQUIREMENTS_DEPLOYMENT_FILE_TEMP = $(PROJECT_DIR)/requirements-deploy.tmp
+
+# --- PATHS TO PROJECT DIRECTORIES
+DATA_DIRECTORY = $(PROJECT_DIR)/data
+SRC_DIRECTORY = $(PROJECT_DIR)/src
+API_DIRECTORY = $(SRC_DIRECTORY)/api
+DATA_PROCESSING_DIRECTORY = $(SRC_DIRECTORY)/data_processing
+TRAINING_DIRECTORY = $(SRC_DIRECTORY)/training
+
+# -- Docker-related
+# Variable used for turning on/off Docker Buildkit
+DOCKER_BUILDKIT_VALUE=1
+LOCAL_DEVELOPMENT_DIR_PATH="$(PROJECT_DIR)/docker"
+
+# -- API-related
+INPUT_APP_PORT=8501
+OUTPUT_APP_PORT=8501
+API_WEBSERVER_URL="http://localhost:$(INPUT_APP_PORT)"
+
+# -- App-related
+APP_SERVER_PORT=7860
+APP_WEBSERVER_URL="http://localhost:$(APP_SERVER_PORT)"
+
+# ----------------------------- Python-specific -------------------------------
+# - Checking what type of python one is using
+# Anaconda
+ifeq (,$(shell which conda))
+HAS_CONDA=False
+else
+HAS_CONDA=True
+# We need to specify the following commands in order to properly activate the
+# Anaconda environment.
+SHELL=/bin/bash
+# Note that the extra activate is needed to ensure that the activate floats env to the front of PATH
+CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate ; conda activate
+CONDA_DEACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda deactivate ; conda deactivate
+endif
+
+# - Pyenv
+ifeq (,$(shell which pyenv))
+HAS_PYENV=False
+else
+HAS_PYENV=True
+endif
+
+###############################################################################
+#                           VARIABLES FOR COMMANDS                            #
+###############################################################################
+
+## Show the set of input parameters
+show-params:
+	@ printf "\n-------- GENERAL ---------------\n"
+	@ echo "PROJECT_DIR: $(PROJECT_DIR)"
+	@ echo "PROJECT_NAME: $(PROJECT_NAME)"
+	@ echo "LOCAL_DEVELOPMENT_DIR_PATH: $(LOCAL_DEVELOPMENT_DIR_PATH)"
+	@ echo "ENVIRONMENT_NAME: $(ENVIRONMENT_NAME)"
+	@ echo "PYTHON_INTERPRETER: $(PYTHON_INTERPRETER)"
+	@ echo "PYTHON_VERSION: $(PYTHON_VERSION)"
+	@ echo "PIP_VERSION: $(PIP_VERSION)"
+	@ echo "REQUIREMENTS_FILE: $(REQUIREMENTS_FILE)"
+	@ echo "REQUIREMENTS_FILE_TEMP: $(REQUIREMENTS_FILE_TEMP)"
+	@ echo "REQUIREMENTS_DEV_FILE: $(REQUIREMENTS_DEV_FILE)"
+	@ echo "REQUIREMENTS_DEV_FILE_TEMP: $(REQUIREMENTS_DEV_FILE_TEMP)"
+	@ echo "REQUIREMENTS_DEPLOYMENT_FILE: $(REQUIREMENTS_DEPLOYMENT_FILE)"
+	@ echo "REQUIREMENTS_DEPLOYMENT_FILE_TEMP: $(REQUIREMENTS_DEPLOYMENT_FILE_TEMP)"
+	@ printf "\n-------- DOCKER ---------------\n"
+	@ echo "DOCKER_BUILDKIT_VALUE: $(DOCKER_BUILDKIT_VALUE)"
+	@ printf "\n-------- PYTHON ---------------\n"
+	@ echo "HAS_CONDA: $(HAS_CONDA)"
+	@ echo "HAS_PYENV: $(HAS_PYENV)"
+	@ printf "\n-------- LOCAL DEVELOPMENT ---------------\n"
+	@ echo "LOCAL_DEV_DOCKER_PROJECT_NAME: $(LOCAL_DEV_DOCKER_PROJECT_NAME)"
+	@ echo "LOCAL_DEV_SERVICE_NAME: $(LOCAL_DEV_SERVICE_NAME)"
+	@ printf "\n-------- API ---------------\n"
+	@ echo "APP_PORT: $(APP_PORT)"
+	@ echo "APP_WEBSERVER_URL: $(APP_WEBSERVER_URL)"
+	@ echo "API_SERVICE_NAME: $(API_SERVICE_NAME)"
+	@ echo "API_DOCKER_PROJECT_NAME: $(API_DOCKER_PROJECT_NAME)"
+	@ printf "\n-----------------------\n"
+
+## Initialize the repository for code development
+init: clean create-envrc delete-environment create-environment
+ifeq (True,$(HAS_CONDA))
+	@ ($(CONDA_ACTIVATE) $(ENVIRONMENT_NAME) ; $(MAKE) requirements)
+	@ printf "\n\n>>> New Conda environment created. Activate with: \n\t: conda activate $(ENVIRONMENT_NAME)"
+	@ $(MAKE) show-params
+	@ printf "\n\n>>> Project initialized!"
+	@ ($(CONDA_ACTIVATE) $(ENVIRONMENT_NAME) ; $(MAKE) pre-commit-install )
+	@ ($(CONDA_ACTIVATE) $(ENVIRONMENT_NAME) ; $(MAKE) lint )
+else
+	@ direnv allow || echo ""
+	@ echo ">>> Continuing installation ..."
+	@ $(MAKE) requirements
+	@ $(MAKE) show-params
+	@ printf "\n\n>>> Project initialized!\n"
+	@ $(MAKE) pre-commit-install
+	@ $(MAKE) lint
+endif
+
+## Remove ALL of the artifacts + Python environments
+destroy: clean pre-commit-uninstall delete-environment
+	@ echo ">>> Deleted all artifacts and environments!"
+
+###############################################################################
+#                           MISCELLANEOUS COMMANDS                            #
+###############################################################################
+
+# -------------------- Functions for cleaning repository ----------------------
+
+## Removes artifacts from the build stage, and other common Python artifacts.
+clean: clean-build clean-pyc clean-test clean-secrets clean-model-files clean-images
+
+## Removes Python file artifacts
+clean-pyc:
+	find . -name '*.pyc' -exec rm -f {} +
+	find . -name '*.pyo' -exec rm -f {} +
+	find . -name '*~' -exec rm -f {} +
+	find . -name '__pycache__' -exec rm -fr {} +
+
+## Remove build artifacts
+clean-build:
+	rm -fr build/
+	rm -fr dist/
+	rm -fr .eggs/
+	find . -name '*.egg-info' -exec rm -fr {} +
+	find . -name '*.egg' -exec rm -f {} +
+
+## Remove test and coverage artifacts
+clean-test:
+	rm -fr .tox/
+	rm -f .coverage
+	rm -fr htmlcov/
+	rm -fr .pytest_cache
+
+## Remove files related to pre-trained models
+clean-model-files:
+	find . -name '*.pt' -exec rm -fr {} +
+	find . -name "runs" -type d -exec rm -rf {} + || echo ""
+
+## Clean left-over images
+clean-images:
+	find . -name '*.png' -exec rm -fr {} +
+	find . -name '*.jpg' -exec rm -fr {} +
+
+## Removes secret artifacts - Serverless
+clean-secrets:
+	find . -name "node_modules" -type d -exec rm -rf {} + || echo ""
+	find . -name ".serverless" -type d -exec rm -rf {} + || echo ""
+
+# ---------------------- Functions for local environment ----------------------
+
+## Set up the envrc file for the project.
+create-envrc:
+	@ echo "cat $(PROJECT_DIR)/template.envrc > $(PROJECT_DIR)/.envrc"
+	@ cat $(PROJECT_DIR)/template.envrc > $(PROJECT_DIR)/.envrc
+
+## Delete the local envrc file of the project
+delete-envrc:
+	@ rm -rf $(PROJECT_DIR)/.envrc || echo ""
+
+## Install git-flow
+git-flow-install:
+	@ (( if [[ ! -f "`which git-flow`" ]]; then \
+		echo "No Git-flow installed"! ; \
+		if [[ -f "`which brew`" ]]; then \
+			echo "Homebrew installed"; \
+			HOMEBREW_NO_AUTO_UPDATE=1 brew install git-flow; \
+		elif [[ -f "`which apt-get`" ]]; then \
+			echo "Apt-get installed"; \
+			apt-get install git-flow; \
+		else \
+			echo "Could not locate package manager! (brew or apt-get)"; \
+		fi; \
+	fi ) && git flow init -f -d) || echo "Git-Flow setup could not be completed"
+
+
+# ---------------------- Functions for Python environment ---------------------
+
+## Creates the Python environment
+create-environment:
+ifeq (True,$(HAS_CONDA))
+	@ echo ">>> Detected CONDA ... Creating new conda environment!"
+	@ echo ">>> \tCreating environment: \t $(ENVIRONMENT_NAME)"
+	@ conda create --name $(ENVIRONMENT_NAME) python=$(PYTHON_VERSION) -y || echo ""
+	@ echo ">>> New conda environment created. Activate with: \n conda activate $(ENVIRONMENT_NAME)"
+else ifeq (True,$(HAS_PYENV))
+	@ echo ">>> Detected PYENV ... Creating new Pyenv environment!"
+	@ echo ">>> \tCreating environment: \t $(ENVIRONMENT_NAME)"
+	@ pyenv virtualenv $(PYTHON_VERSION) $(ENVIRONMENT_NAME) || echo ""
+	@ pyenv local $(ENVIRONMENT_NAME)
+	@ echo ">>> New Pyenv environment created: '$(ENVIRONMENT_NAME)'"
+	@ pyenv virtualenvs
+	@ echo
+endif
+
+## Deletes the Python environment
+delete-environment:
+ifeq (True,$(HAS_CONDA))
+	@ echo ">>> Detected CONDA ... Deleting Conda environment, if applicable!"
+	@ echo ">>> Deleting environment: '$(ENVIRONMENT_NAME)'"
+	@ ($(CONDA_DEACTIVATE) ; conda env remove --name $(ENVIRONMENT_NAME) -y) || echo ""
+	@ echo ">>> Conda environment deleted: '$(ENVIRONMENT_NAME)'"
+else ifeq (True,$(HAS_PYENV))
+	@ echo ">>> Detected PYENV ... Deleting Pyenv environment!"
+	@ echo ">>> Deleting environment: '$(ENVIRONMENT_NAME)'"
+	@ pyenv uninstall -f $(ENVIRONMENT_NAME) || echo ""
+	@ rm $(PROJECT_DIR)/.python-version || echo ""
+	@ echo ">>> Pyenv environment deleted: '$(ENVIRONMENT_NAME)'"
+	@ pyenv virtualenvs
+	@ echo
+endif
+
+## Upgrade the version of the 'pip' package
+pip-upgrade:
+	@ $(PYTHON_INTERPRETER) -m pip install --no-cache-dir -q --upgrade pip==$(PIP_VERSION)
+
+## Sort the project packages requirements file
+sort-requirements:
+	@ sort $(REQUIREMENTS_FILE) | grep "\S" > $(REQUIREMENTS_FILE_TEMP) && \
+	mv $(REQUIREMENTS_FILE_TEMP) $(REQUIREMENTS_FILE)
+	@ sort $(REQUIREMENTS_DEV_FILE) | grep "\S" > $(REQUIREMENTS_DEV_FILE_TEMP) && \
+	mv $(REQUIREMENTS_DEV_FILE_TEMP) $(REQUIREMENTS_DEV_FILE)
+	@ sort $(REQUIREMENTS_DEPLOYMENT_FILE) | grep "\S" > $(REQUIREMENTS_DEPLOYMENT_FILE_TEMP) && \
+	mv $(REQUIREMENTS_DEPLOYMENT_FILE_TEMP) $(REQUIREMENTS_DEPLOYMENT_FILE)
+
+
+## Install Python dependencies into the Python environment
+requirements: pip-upgrade sort-requirements
+	@ $(PYTHON_INTERPRETER) -m pip install --no-cache-dir -q -r $(REQUIREMENTS_DEV_FILE)
+
+# -------------------------- Functions for Code Linting -----------------------
+
+## Installing the pre-commit Git hook
+pre-commit-install:
+	@ pre-commit install
+
+## Uninstall the pre-commit Git hook
+pre-commit-uninstall:
+	@ pre-commit uninstall
+
+## Run the 'pre-commit' linting step manually
+lint:
+	@ pre-commit run -a --hook-stage manual
+
+
+###############################################################################
+#                   Docker Commands - Local development                       #
+###############################################################################
+
+LOCAL_DEV_DOCKER_PROJECT_NAME="$(PROJECT_NAME_LOWER)_localdev_dind"
+LOCAL_DEV_SERVICE_NAME="local-dev"
+
+## Clean Docker images
+docker-prune:
+	@ docker system prune -f
+
+## Stops both the API service and the local development service
+all-stop: api-stop docker-local-dev-stop app-stop
+	@ echo "All services are down"
+
+## Starts both the API service and the local development service
+all-start: api-start docker-local-dev-start app-start
+	@ echo "All services are up!"
+
+## Build local development Docker image
+docker-local-dev-build: docker-prune
+	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+	docker compose \
+	--project-name $(LOCAL_DEV_DOCKER_PROJECT_NAME) \
+	build $(LOCAL_DEV_SERVICE_NAME)
+
+## Start service for local development
+docker-local-dev-start: docker-local-dev-build docker-local-dev-stop
+	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+	docker compose \
+	--project-name $(LOCAL_DEV_DOCKER_PROJECT_NAME) \
+	up -d $(LOCAL_DEV_SERVICE_NAME)
+
+## Stop service for local development
+docker-local-dev-stop:
+	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+	docker compose \
+	--project-name $(LOCAL_DEV_DOCKER_PROJECT_NAME) \
+	stop $(LOCAL_DEV_SERVICE_NAME)
+	@ $(MAKE) docker-prune
+
+## Start a shell session into the docker container
+docker-local-dev-login:
+	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+	docker compose \
+	--project-name $(LOCAL_DEV_DOCKER_PROJECT_NAME) \
+	exec \
+	$(LOCAL_DEV_SERVICE_NAME) /bin/zsh
+
+###############################################################################
+#                      Docker Commands - API-related                          #
+###############################################################################
+
+API_DOCKER_PROJECT_NAME="$(PROJECT_NAME_LOWER)_api"
+API_SERVICE_NAME="api"
+
+## Build API Docker image
+api-build: docker-prune
+	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+	docker compose \
+	--project-name $(API_DOCKER_PROJECT_NAME) \
+	build $(API_SERVICE_NAME)
+
+## Start API Docker image container
+api-start: api-stop api-build
+	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+	docker compose \
+	--project-name $(API_DOCKER_PROJECT_NAME) \
+	up -d $(API_SERVICE_NAME)
+
+## Stop API Docker image container
+api-stop:
+	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+	docker compose \
+	--project-name $(API_DOCKER_PROJECT_NAME) \
+	stop $(API_SERVICE_NAME)
+	@ $(MAKE) docker-prune
+
+## Open API in web browser
+api-web:
+	@ python -m webbrowser "$(API_WEBSERVER_URL)/docs"
+
+###############################################################################
+#                      Docker Commands - App-related                          #
+###############################################################################
+
+APP_DOCKER_PROJECT_NAME="$(PROJECT_NAME_LOWER)_app"
+APP_SERVICE_NAME="app"
+
+## Build App Docker image
+app-app-build: docker-prune
+	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+	docker compose \
+	--project-name $(APP_DOCKER_PROJECT_NAME) \
+	build $(APP_SERVICE_NAME)
+
+## Start App Docker image container
+app-app-start: app-app-stop app-app-build
+	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+	docker compose \
+	--project-name $(APP_DOCKER_PROJECT_NAME) \
+	up -d $(APP_SERVICE_NAME)
+
+## Stop App Docker image container
+app-app-stop:
+	@ cd $(LOCAL_DEVELOPMENT_DIR_PATH) && \
+	docker compose \
+	--project-name $(APP_DOCKER_PROJECT_NAME) \
+	stop $(APP_SERVICE_NAME)
+	@ $(MAKE) docker-prune
+
+## Open App in web browser
+app-app-web:
+	@ python -m webbrowser "$(APP_WEBSERVER_URL)"
+
+###############################################################################
+#                       Unit Tests and Code checking                          #
+###############################################################################
+
+# See: https://github.com/google/addlicense for more information
+## Add licenses to Python files
+add-licenses:
+	@ docker run -it \
+	-v ${PWD}:/src \
+	ghcr.io/google/addlicense \
+	-f ./LICENSE.rst \
+	./src/**/*.py
+
+## Open up all web endpoints
+all-web: api-web app-app-web
+	@ echo "All web endpoints opened!"
+
+###############################################################################
+#                       PROJECT AND DATA FUNCTIONS                            #
+###############################################################################
+
+DATASET_PATH="https://raw.githubusercontent.com/hamzafarooq/maven-mlsystem-design-cohort-1/main/data/df_embed.csv"
+DATASET_WITH_SUMMARIES_NAME="cicero_dataset_with_summaries"
+DATASET_WITH_FAISS_AND_EMBEDDINGS_NAME="cicero_dataset_with_embeddings_and_faiss_index"
+HUGGING_FACE_REPOSITORY_NAME="cicero_synthesizer"
+FAISS_OUTPUT_FILENAME="cicero_faiss_index"
+
+## Run the data preparation on the input dataset
+prepare_data:
+	@ $(PYTHON_INTERPRETER) \
+	$(DATA_PROCESSING_DIRECTORY)/prepare_dataset.py \
+	--dataset-path $(DATASET_PATH)
+
+
+## Run the script for creating a FAISS index and text embeddings of the dataset
+run_faiss_and_embeddings:
+	@ $(PYTHON_INTERPRETER) \
+	$(TRAINING_DIRECTORY)/create_faiss_corpus_index.py \
+	--dataset-name $(DATASET_WITH_SUMMARIES_NAME) \
+	--output-dataset-name $(DATASET_WITH_FAISS_AND_EMBEDDINGS_NAME) \
+	--repository-name $(HUGGING_FACE_REPOSITORY_NAME) \
+	--faiss-index-name $(FAISS_OUTPUT_FILENAME)
+
+
+
+###############################################################################
+#                        Self Documenting Commands                            #
+###############################################################################
+
+.DEFAULT_GOAL := help
+
+# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
+# sed script explained:
+# /^##/:
+#   * save line in hold space
+#   * purge line
+#   * Loop:
+#     * append newline + line to hold space
+#     * go to next line
+#     * if line starts with doc comment, strip comment character off and loop
+#   * remove target prerequisites
+#   * append hold space (+ newline) to line
+#   * replace newline plus comments by `---`
+#   * print line
+# Separate expressions are necessary because labels cannot be delimited by
+# semicolon; see <http://stackoverflow.com/a/11799865/1968>
+help:
+	@echo "$$(tput bold)Available rules:$$(tput sgr0)"
+	@echo
+	@sed -n -e "/^## / { \
+		h; \
+		s/.*//; \
+		:doc" \
+		-e "H; \
+		n; \
+		s/^## //; \
+		t doc" \
+		-e "s/:.*//; \
+		G; \
+		s/\\n## /---/; \
+		s/\\n/ /g; \
+		p; \
+	}" ${MAKEFILE_LIST} \
+	| LC_ALL='C' sort --ignore-case \
+	| awk -F '---' \
+		-v ncol=$$(tput cols) \
+		-v indent=25 \
+		-v col_on="$$(tput setaf 6)" \
+		-v col_off="$$(tput sgr0)" \
+	'{ \
+		printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
+		n = split($$2, words, " "); \
+		line_length = ncol - indent; \
+		for (i = 1; i <= n; i++) { \
+			line_length -= length(words[i]) + 1; \
+			if (line_length <= 0) { \
+				line_length = ncol - indent - length(words[i]) - 1; \
+				printf "\n%*s ", -indent, " "; \
+			} \
+			printf "%s ", words[i]; \
+		} \
+		printf "\n"; \
+	}' \
+	| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
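The `help` target above parses the `##` doc comments, which is why every target in this Makefile is preceded by one. A quick sanity check of the derived settings, run from the project root:

```bash
# Print the resolved project parameters (paths, ports, conda/pyenv detection)
make show-params

# List all documented targets (`help` is the default goal)
make
```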
README.md
CHANGED
@@ -1,11 +1,282 @@
-
-
-
-
-
-
-
-
-
-
-
+![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/Maven-Building-LLMS-into-Production/Cicero-LLM-Synthesizer/code-linting.yml)
+
+# Cicero LLM Synthesizer
+
+## Contents
+
+- [Setup](#setup)
+- [Setup for local code development](#setup-for-local-code-development)
+  - [Makefile](#makefile)
+  - [Starting up the Docker container and initializing the repository](#starting-up-the-docker-container-and-initializing-the-repository)
+- [Starting the API service](#starting-the-api-service)
+  - [Starting up all the services](#starting-up-all-the-services)
+- [Tests](#tests)
+- [Helpful Commands](#helpful-commands)
+- [VS Code Extensions](#vs-code-extensions)
+- [GPT3.5 summaries](#gpt35-summaries)
+- [Resources](#resources)
+
+## Setup
+
+Ensure you have Python and pip installed.
+
+```shell
+python --version
+pip --version
+```
+
+From the root directory, run the following command to install the
+dependencies: `pip install -r requirements.txt`
+
+You can run the app using this command: `python -m uvicorn src.api.index:app --reload`
+
+Once running, you can navigate to `http://127.0.0.1:8000/docs` to view the
+interactive API documentation.
+
+## Setup for local code development
+
+There are some steps that need to be done prior to being able to
+properly run and develop the code in this repository.
+
+The following is a list of steps that have to happen prior to starting to
+work on / test the pipelines of this repository:
+
+### Makefile
+
+The project comes with a `Makefile` (**not supported in Windows!**)
+that can be used for executing commands that will make the interaction
+with this project much smoother. Keep in mind that folders with spaces
+in their names may cause issues.
+
+One can see all of the available options by running:
+
+```bash
+$: make
+
+Available rules:
+
+add-licenses             Add licenses to Python files
+all-start                Starts both the API service and the local development service
+all-stop                 Stops both the API service and the local development service
+all-web                  Open up all web endpoints
+api-build                Build API Docker image
+api-start                Start API Docker image container
+api-stop                 Stop API Docker image container
+api-web                  Open API in web browser
+app-app-build            Build App Docker image
+app-app-start            Start App Docker image container
+app-app-stop             Stop App Docker image container
+app-app-web              Open App in web browser
+clean                    Removes artifacts from the build stage, and other common Python artifacts.
+clean-build              Remove build artifacts
+clean-images             Clean left-over images
+clean-model-files        Remove files related to pre-trained models
+clean-pyc                Removes Python file artifacts
+clean-secrets            Removes secret artifacts - Serverless
+clean-test               Remove test and coverage artifacts
+create-environment       Creates the Python environment
+create-envrc             Set up the envrc file for the project.
+delete-environment       Deletes the Python environment
+delete-envrc             Delete the local envrc file of the project
+destroy                  Remove ALL of the artifacts + Python environments
+docker-local-dev-build   Build local development Docker image
+docker-local-dev-login   Start a shell session into the docker container
+docker-local-dev-start   Start service for local development
+docker-local-dev-stop    Stop service for local development
+docker-prune             Clean Docker images
+git-flow-install         Install git-flow
+init                     Initialize the repository for code development
+lint                     Run the 'pre-commit' linting step manually
+pip-upgrade              Upgrade the version of the 'pip' package
+pre-commit-install       Installing the pre-commit Git hook
+pre-commit-uninstall     Uninstall the pre-commit Git hook
+prepare_data             Run the data preparation on the input dataset
+requirements             Install Python dependencies into the Python environment
+run_faiss_and_embeddings Run the script for creating a FAISS index and text embeddings of the dataset
+show-params              Show the set of input parameters
+sort-requirements        Sort the project packages requirements file
+```
+
+> **NOTE**: If you're using `Windows`, you may have to copy and modify to some
+> extent the commands that are part of the `Makefile` for some tasks.
+
+### Starting up the Docker container and initializing the repository
+
+In order to work on current / new features, one can use *Docker* to
+start a new container and begin the local development process.
+
+To build the Docker image, one must follow these steps:
+
+1. Start the Docker daemon. If you're using a Mac, one can use the
+   Docker Desktop App.
+2. Go to the project's directory and run the following command using the `Makefile`:
+
+   ```bash
+   # Go to the project's directory
+   cd /path/to/directory
+
+   # Build the Docker image and start a container
+   make docker-local-dev-start
+   ```
+
+3. Log into the container
+
+   ```bash
+   # Log into the container
+   make docker-local-dev-login
+   ```
+
+4. Once you're inside the container, you'll see the following prompt:
+
+   ```bash
+   # Log into the container
+   ???$: make docker-local-dev-login
+   direnv: error /opt/program/.envrc is blocked. Run `direnv allow` to approve its content
+   ```
+
+   > One will see the `direnv` error because `direnv` is installed and one must
+   > *allow* the changes to take effect.
+
+5. Allow for the `direnv` changes
+
+   ```bash
+   # Accept the changes
+   $: direnv allow
+   direnv: loading /opt/program/.envrc
+   ```
+
+6. The last thing is to initialize the repository. This can easily be done
+   with the `init` command:
+
+   ```bash
+   $: make init
+   ```
+
+   This will do the following tasks:
+
+   - Clean Python files
+   - Initialize the `.envrc` file used by `direnv`.
+   - Delete an existing Python environment for the project, if it exists.
+   - Create a new environment, if applicable.
+   - Apply `direnv allow` to allow for `direnv` modifications.
+   - Install package requirements via `pip`.
+   - Install `pre-commit` for code-linting and code-checking.
+   - Install `git-flow`, whenever possible.
+
+These steps allow the user to develop new features within Docker, which
+makes it easier for developers to have the exact same set of tools available.
+
+## Starting the API service
+
+The project comes with an out-of-the-box solution for starting and stopping
+the API endpoint via Docker.
+
+To start the container with the API endpoint, one must run the following
+command:
+
+```bash
+# Start API service
+make api-start
+```
+
+This service will start a Docker container that exposes the internal port
+`7860` to the local host's port `7860`. Once the image has been built and
+a container has started, one can go to the service's main page with
+the following command:
+
+```bash
+# Go to the URL of the API endpoint
+make api-web
+```
+
+> This will direct the user to the following URL:
+> [http://localhost:7860/docs](http://localhost:7860/docs)
+
+In order to *stop* the API service, one can run the following command:
+
+```bash
+# Stop the API service
+make api-stop
+```
+
+As one customizes the FastAPI app with new features and more, these changes
+will automatically be reflected at the URL from above.
+
+### Starting up all the services
+
+Similar to the sections above, one can spin up or spin down all the
+services at once with the help of 2 commands, i.e. `all-start` and `all-stop`.
+
+In order to spin up both the *api* service and the one for *local development*,
+one can run:
+
+```bash
+make all-start
+```
+
+This command will start both services, and one will be able to log into the
+container for local development, as well as to connect to the API via the
+browser.
+
+Similarly, in order to spin down all of the services, one can simply run:
+
+```bash
+make all-stop
+```
+
+This will stop both services and delete any unused Docker containers.
+
+## Tests
+
+Unit tests can be found under the `src` folder alongside source code.
+Test files end with `_test`. The following command will run all of the tests.
+
+```shell
+python -m pytest -v -s
+```
+
+The `-v` argument is for verbose output. The `-s` argument is for turning
+off the capture mode so that print statements are printed to the console.
+
+A Makefile command also exists to run these. See `make test`.
+
+## Helpful Commands
+
+Here is a list of commands that may be helpful when interacting with this project.
+
+### Docker
+
+List all Docker containers:
+
+```shell
+docker ps -a
+```
+
+## VS Code Extensions
+
+To help facilitate local development, you can install the
+[Visual Studio Code Dev Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
+extension for VS Code. This will allow you to connect to the local development
+Docker container and more easily develop features.
+
+## GPT3.5 summaries
+
+To generate the GPT3.5 summaries for all articles, use the following commands:
+
+```
+cd src
+python3 -m utils.gpt35_summaries.cleanup_and_summarize
+```
+
+The output CSV file is placed in `src/utils/gpt35_summaries/df_embed_out.csv`.
+The pre-generated summaries for all articles are in `df_embed_out2.csv` in the
+same directory.
+
+For an example of a focused summary, please see `src/focused_summary_example.py`.
+
+## Resources
+
+- [direnv](https://github.com/direnv/direnv)
+- [Docker](https://docs.docker.com/reference/)
+- [Docker Compose](https://docs.docker.com/compose/)
+- [FastAPI](https://fastapi.tiangolo.com/)
+- [flake8](https://flake8.pycqa.org/en/latest/)
+- [git](https://git-scm.com/)
+- [GitHub Actions](https://docs.github.com/en/actions)
+- [isort](https://pycqa.github.io/isort/index.html)
+- [Makefile](https://www.gnu.org/software/make/manual/make.html)
+- [Markdown](https://www.markdownguide.org/)
+- [pre-commit](https://pre-commit.com)
+- [Python](https://www.python.org/)
+- [tmux](https://github.com/tmux/tmux/wiki/Getting-Started)
data/cicero_faiss_index.faiss
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2bf0b06752ccad29f09484b07bf65d429ec60a203e3697cc14661cee28447d37
size 3511341
data/clean_dataset.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ea0aafebd94acbbe0dbd7370d01a34b49284d5012ffadf3fe95130e83f2f13fd
size 13751626
data/raw_dataset.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2bb32f8b3a313d28aa704e84335ae9d8aefa039f405677a5054aeee67a75cbeb
size 15756643
docker/aliases.sh
ADDED
@@ -0,0 +1,20 @@
# This is a compilation of useful 'USER-DEFINED' aliases to use

alias tmux_create='tmux new -s'          # Creates new tmux session.
alias tmux_attach='tmux a -t'            # Attaches to an existing tmux session.
alias tmux_ls='tmux ls'                  # Lists all of the existing tmux sessions.
alias tmux_kill="tmux kill-session -t "  # Kill a specific tmux session
alias gadd='git add'                     # Adds a file / directory to repository
alias gcom='git commit -m'               # Commits any changes. Use as: gcom "Test"
alias gp='git push origin master'        # Pushes changes to 'master'
alias gst='git status'                   # Shows the status of the GIT repository.
alias sagent='eval $(ssh-agent -s)'      # Start SSH key agent (single quotes defer evaluation until the alias runs)
alias sa='conda activate'                # Activates an Anaconda environment
alias sd='conda deactivate'              # Deactivates an Anaconda environment
alias jl='jupyter lab --ip 0.0.0.0 --port 8890 --no-browser --allow-root'      # Opens 'Jupyter Lab'
alias jn='jupyter notebook --ip 0.0.0.0 --port 8890 --no-browser --allow-root' # Opens 'Jupyter Notebook'
alias lll="ls -lah"
# Docker-related
alias dps="docker ps -a"
alias dprune='docker system prune -f'
alias dallow="direnv allow"
docker/docker-compose.yaml
ADDED
@@ -0,0 +1,71 @@
version: '3'

services:
  # --- Service used for local development
  local-dev:
    # Building the local image
    build:
      context: ../
      dockerfile: ./Dockerfile
    # Running the local image
    image: "cicero-synthesizer-local-dev"
    container_name: "cicero-synthesizer-local-dev"
    environment:
      DOCKER_BUILDKIT_VALUE: ${DOCKER_BUILDKIT_VALUE}
      HUGGING_FACE_HUB_TOKEN: ${HUGGING_FACE_HUB_TOKEN}
    volumes:
      - ${HOME}/.ssh:/root/.ssh
      - /var/run/docker.sock:/var/run/docker.sock
      - ..:/opt/program
    working_dir: /opt/program
    command: [ "/bin/sleep", "365d" ]
  #
  # --- Service for running the API locally
  api:
    # Building the local image
    build:
      context: ../
      dockerfile: ./Dockerfile
    # Running the local image
    image: "cicero-synthesizer-api"
    container_name: "cicero-synthesizer-api"
    environment:
      HUGGING_FACE_HUB_TOKEN: ${HUGGING_FACE_HUB_TOKEN}
      HUGGING_FACE_USERNAME: ${HUGGING_FACE_USERNAME}
    volumes:
      - ..:/opt/ml
    ports:
      - ${INPUT_APP_PORT:-8501}:${OUTPUT_APP_PORT:-8501}
    working_dir: /opt/ml
    command:
      [
        "uvicorn",
        "src.api.index:app",
        "--host",
        "0.0.0.0",
        "--port",
        "8501",
        "--reload",
        "--reload-dir",
        "/opt/ml"
      ]
  #
  # --- Service for running the Gradio application locally
  app:
    # Building the local image
    build:
      context: ../
      dockerfile: ./Dockerfile
    # Running the local image
    image: "cicero-synthesizer-app"
    container_name: "cicero-synthesizer-app"
    environment:
      APP_SERVER_PORT: ${APP_SERVER_PORT}
      HUGGING_FACE_HUB_TOKEN: ${HUGGING_FACE_HUB_TOKEN}
      HUGGING_FACE_USERNAME: ${HUGGING_FACE_USERNAME}
    volumes:
      - ..:/opt/ml
    ports:
      - ${APP_SERVER_PORT:-7860}:${APP_SERVER_PORT:-7860}
    working_dir: /opt/ml
    command: [ "python", "src/app_service/app.py" ]
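The Makefile targets wrap the services above; as a sketch, they can also be
driven with Docker Compose directly (assuming the commands are run from the
repository root with the `.envrc` variables loaded):

```shell
docker compose -f docker/docker-compose.yaml up -d api   # start the API service
docker compose -f docker/docker-compose.yaml logs -f api # follow its logs
docker compose -f docker/docker-compose.yaml down        # stop everything
```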
pyproject.toml
ADDED
@@ -0,0 +1,34 @@
[tool.isort]
force_grid_wrap = 0
include_trailing_comma = true
line_length = 79
multi_line_output = 3
use_parentheses = true

[tool.pytest.ini_options]
addopts = """
--cov-report term-missing \
--cov src/ -ra"""

[tool.black]
exclude = '''
/(
    \.eggs
  | \.git
  | \.hg
  | \.mypy_cache
  | \.tox
  | \.venv
  | _build
  | buck-out
  | build
  | dist
  # The following are specific to Black, you probably don't want those.
  | blib2to3
  | tests/data
  | profiling
)/
'''
include = '\.pyi?$'
line-length = 79
target-version = ['py36', 'py37', 'py38', 'py39']
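Since `black`, `isort`, and `pytest` all read their settings from this file,
running them from the repository root picks up the configuration
automatically; a minimal sketch:

```shell
black src/
isort src/
python -m pytest
```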
requirements-deploy.txt
ADDED
@@ -0,0 +1 @@
docker==5.0.3
requirements-dev.txt
ADDED
@@ -0,0 +1,12 @@
-r ./requirements.txt
black==22.3.0
click>=8.0.2
docker==5.0.3
flake8==5.0.4
ipython>=7.0.1
isort>=4.3.21, <5.0
jupyter_client>=5.1, <7.0
jupyterlab>=0.31.1
jupyter~=1.0
pre-commit==2.10.1
protobuf==3.20.1
requirements.txt
ADDED
@@ -0,0 +1,14 @@
datasets>=2.13.1
faiss-cpu>=1.7.4
fastapi==0.92.0
gradio>=3.35.2
huggingface_hub>=0.15.1
openai>=0.27.8
pandas>=2.0.2
sentence-transformers
setuptools==67.6.1
spacy>=3.5.3
tiktoken>=0.4.0
torch==2.0.0
tqdm==4.65.0
uvicorn==0.20.0
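A sketch of a local (non-Docker) setup: the development file pulls in the
runtime requirements above via its `-r ./requirements.txt` line, so one
install suffices:

```shell
pip install -r requirements-dev.txt
```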
src/.DS_Store
ADDED
Binary file (6.15 kB)
src/api/__init__.py
ADDED
@@ -0,0 +1,21 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
src/api/index.py
ADDED
@@ -0,0 +1,182 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import logging
from typing import Dict, Optional

from datasets import Dataset
from fastapi import Depends, FastAPI
from fastapi.responses import RedirectResponse
from huggingface_hub import hf_hub_download
from pydantic import BaseModel

from src.classes import hugging_face_utils as hf
from src.classes import semantic_search_engine as ss
from src.utils import default_variables as dv

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s]: %(message)s",
)
logger.setLevel(logging.INFO)


# ------------------------------- VARIABLES -----------------------------------

APP_TITLE = "Cicero LLM Synthesizer"
APP_DESCRIPTION = f"""
The '{APP_TITLE}' is an app that will identify the top-N articles from the
Cicero database that are most similar to the user's input query.
"""
APP_VERSION = "0.1"


# ----------------------------- APP-SPECIFIC ----------------------------------

# Defining the application object
app = FastAPI(
    title=APP_TITLE,
    description=APP_DESCRIPTION,
    version=APP_VERSION,
)

# -------------------------------- CLASSES ------------------------------------


class QueryParams(BaseModel):
    input_query: str
    number_articles: Optional[int] = 5


# ------------------------------- FUNCTIONS -----------------------------------


def download_dataset_and_faiss_index() -> Dataset:
    """
    Function to download the corresponding dataset and the FAISS index
    from HuggingFace.

    Returns
    -------------
    dataset_with_faiss_index : datasets.Dataset
        Dataset from HuggingFace with the FAISS index loaded.
    """
    # --- Initializing HuggingFace API
    # Object for interacting with HuggingFace
    hf_obj = hf.HuggingFaceHelper()

    # Defining variable names for each of the objects
    faiss_index_name = f"{dv.faiss_index_name}.faiss"
    dataset_name = dv.dataset_faiss_embeddings_name
    username = hf_obj.username
    repository_name = dv.hugging_face_repository_name
    repository_id = f"{username}/{repository_name}"
    repository_type = "dataset"
    split_type = "train"

    # --- Downloading FAISS Index
    faiss_index_local_path = hf_hub_download(
        repo_id=repository_id,
        filename=faiss_index_name,
        repo_type=repository_type,
        token=hf_obj.api.token,
    )

    # --- Downloading Dataset
    dataset_obj = hf_obj.get_dataset_from_hub(
        dataset_name=dataset_name,
        username=username,
        split=split_type,
    )

    # --- Adding FAISS index to the dataset
    dataset_obj.load_faiss_index(
        index_name=dv.embeddings_colname,
        file=faiss_index_local_path,
    )

    return dataset_obj


def run_semantic_search_task(query: str, number_articles: int) -> Dict:
    """
    Function to run semantic search on an input query. It will return a
    set of 'Top-N' articles that are most similar to the input query.

    Parameters
    ------------
    query : str
        Input query to use when running the Semantic Search Engine.

    number_articles : int
        Number of articles to return from the Semantic Search.

    Returns
    ----------
    ranked_results : dict
        Dictionary containing the ranked results from the Semantic
        Search Engine.
    """
    # --- Extracting dataset with FAISS index
    corpus_dataset_with_faiss_index = download_dataset_and_faiss_index()

    # --- Initializing Semantic Search Engine
    semantic_search_obj = ss.SemanticSearchEngine(
        corpus_dataset_with_faiss_index=corpus_dataset_with_faiss_index
    )

    # --- Running search on Top-N results

    return semantic_search_obj.run_semantic_search(
        query=query,
        top_n=number_articles,
    )


# -------------------------------- ROUTES -------------------------------------


@app.get("/", include_in_schema=False)
async def docs_redirect():
    return RedirectResponse(url="/docs")


# ---- Semantic Search
@app.post("/predict")
async def run_semantic_search(query_params: QueryParams = Depends()):
    """
    Function to run semantic search on an input query.

    Parameters
    --------------
    query : str
        Input query to use when running the Semantic Search Engine.

    number_articles : int
        Number of articles to return from the Semantic Search.
    """

    return run_semantic_search_task(
        query=query_params.input_query,
        number_articles=query_params.number_articles,
    )
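Because the `/predict` route reads `QueryParams` through `Depends()`, FastAPI
exposes its fields as query parameters; a quick smoke-test sketch against the
Compose port `8501` (the query text below is arbitrary):

```shell
curl -X POST "http://localhost:8501/predict?input_query=tax%20policy&number_articles=3"
```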
src/app_service/__init__.py
ADDED
@@ -0,0 +1,21 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
src/app_service/app.py
ADDED
@@ -0,0 +1,167 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import logging
from typing import List

import gradio as gr
from datasets import Dataset
from huggingface_hub import hf_hub_download

from src.classes import hugging_face_utils as hf
from src.classes import semantic_search_engine as ss
from src.utils import default_variables as dv

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s]: %(message)s",
)
logger.setLevel(logging.INFO)


# ------------------------------ VARIABLES ------------------------------------

APP_TITLE = "Cicero LLM Synthesizer"
APP_DESCRIPTION = f"""
The '{APP_TITLE}' is an app that will identify the top-N articles from the
Cicero database that are most similar to the user's input query.
"""
APP_VERSION = "0.1"


# ------------------------------ FUNCTIONS ------------------------------------


def download_dataset_and_faiss_index() -> Dataset:
    """
    Function to download the corresponding dataset and the FAISS index
    from HuggingFace.

    Returns
    -------------
    dataset_with_faiss_index : datasets.Dataset
        Dataset from HuggingFace with the FAISS index loaded.
    """
    # --- Initializing HuggingFace API
    # Object for interacting with HuggingFace
    hf_obj = hf.HuggingFaceHelper()

    # Defining variable names for each of the objects
    faiss_index_name = f"{dv.faiss_index_name}.faiss"
    dataset_name = dv.dataset_faiss_embeddings_name
    username = hf_obj.username
    repository_name = dv.hugging_face_repository_name
    repository_id = f"{username}/{repository_name}"
    repository_type = "dataset"
    split_type = "train"

    # --- Downloading FAISS Index
    faiss_index_local_path = hf_hub_download(
        repo_id=repository_id,
        filename=faiss_index_name,
        repo_type=repository_type,
        token=hf_obj.api.token,
    )

    # --- Downloading Dataset
    dataset_obj = hf_obj.get_dataset_from_hub(
        dataset_name=dataset_name,
        username=username,
        split=split_type,
    )

    # --- Adding FAISS index to the dataset
    dataset_obj.load_faiss_index(
        index_name=dv.embeddings_colname,
        file=faiss_index_local_path,
    )

    return dataset_obj


def run_semantic_search_task(query: str, number_articles: int) -> List:
    # sourcery skip: remove-unnecessary-cast
    """
    Function to run semantic search on an input query. It will return a
    set of 'Top-N' articles that are most similar to the input query.

    Parameters
    ------------
    query : str
        Input query to use when running the Semantic Search Engine.

    number_articles : int
        Number of articles to return from the Semantic Search.

    Returns
    ----------
    ranked_results : list
        List containing the ranked results from the Semantic
        Search Engine.
    """
    # --- Extracting dataset with FAISS index
    corpus_dataset_with_faiss_index = download_dataset_and_faiss_index()

    # --- Initializing Semantic Search Engine
    semantic_search_obj = ss.SemanticSearchEngine(
        corpus_dataset_with_faiss_index=corpus_dataset_with_faiss_index
    )

    # --- Running search on Top-N results
    number_articles_mod = int(number_articles)

    results = semantic_search_obj.run_semantic_search(
        query=query,
        top_n=number_articles_mod,
    )

    return list(results.values())


# --------------------------------- APP ---------------------------------------

# -- Semantic Search Engine
semantic_search_engine = gr.Interface(
    fn=run_semantic_search_task,
    inputs=[
        gr.components.Textbox(label="Input Query"),
        gr.Slider(
            minimum=1,
            label="Choose number of documents to retrieve",
            step=1,
        ),
    ],
    outputs="json",
    title=APP_TITLE,
    description=APP_DESCRIPTION,
)


# ----------------------------- RUNNING APP -----------------------------------

if __name__ == "__main__":
    semantic_search_engine.launch(
        debug=False,
        share=False,
        server_port=7860,
    )
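Outside of Docker, the same Gradio interface can be launched directly (a
sketch, assuming the dependencies and HuggingFace credentials are available
locally); the UI then serves on the hard-coded port `7860`:

```shell
python src/app_service/app.py
# then open http://localhost:7860 in a browser
```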
src/classes/__init__.py
ADDED
@@ -0,0 +1,21 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
src/classes/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (184 Bytes)
src/classes/__pycache__/hugging_face_utils.cpython-39.pyc
ADDED
Binary file (5.01 kB)
src/classes/__pycache__/semantic_search_engine.cpython-39.pyc
ADDED
Binary file (5.86 kB)
src/classes/data_preparation.py
ADDED
@@ -0,0 +1,403 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Module that contains the class definitions for the data preparation tasks.
"""

import logging
import re
from datetime import datetime
from typing import List, Optional, Tuple, Union

import pandas as pd
from spacy.lang.en.stop_words import STOP_WORDS

from src.classes import hugging_face_utils as hf
from src.utils import default_variables as dv
from src.utils import general_utilities as gu

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# ---------------------------- CLASS DEFINITIONS ------------------------------


# -- Defining functions that can be used for cleaning up and preparing text
class NLPPrep(object):
    """
    Class object for handling the data processing of text.
    """

    def __init__(self):
        # Defining the corresponding stop words
        self.stop_words = list(STOP_WORDS)

    def _lowercase_text(self, input_string: str) -> str:
        """
        Method for making the input text lowercase.

        Parameters
        ------------
        input_string : str
            Text variable to lowercase.

        Returns
        ----------
        output_string : str
            Lower-cased version of ``input_string``.
        """

        return input_string.lower()

    def _only_keep_alphanumeric(self, input_string: str) -> str:
        """
        Method for only keeping alphanumerical characters in the text.

        Parameters
        ------------
        input_string : str
            Text variable to filter.

        Returns
        ----------
        output_string : str
            Filtered version of ``input_string`` that only contains
            alphanumerical characters.
        """
        # NOTE: 'a-zA-Z' (not 'A-z') so that only letters, digits, and
        # whitespace are kept.
        regex_pattern = r"[^a-zA-Z0-9\s]"

        return re.sub(regex_pattern, "", input_string)

    def _remove_stopwords(self, input_string: str) -> str:
        """
        Method for removing stop words from the input text.

        Parameters
        ------------
        input_string : str
            Text variable to filter.

        Returns
        ----------
        output_string : str
            Filtered version of ``input_string`` without stop words in
            the text.
        """
        # Splitting the text into 'tokens'
        tokens = input_string.strip().split()

        return " ".join(
            [word for word in tokens if word not in self.stop_words]
        )

    def _remove_unicode(self, input_str: str) -> str:
        """
        Method for removing Unicode from the input text.

        Parameters
        ------------
        input_str : str
            Text variable, from which to remove Unicode characters.

        Returns
        ----------
        string_decode : str
            Filtered version of ``input_str`` without the Unicode characters.
        """
        string_encode = input_str.encode("ascii", "ignore")

        return string_encode.decode()

    def process_text(self, input_string: str) -> str:
        """
        Method for passing the input variable through NLP-based techniques
        to process the text.

        Parameters
        ------------
        input_string : str
            Variable corresponding to the text that will be processed.

        Returns
        ------------
        processed_string : str
            Variable corresponding to the *processed* version of the input
            string, after having gone through some NLP-based processing
            techniques.

        Notes
        -----------
        This function will perform the following NLP-based techniques:

        1. Make the text lowercase.
        2. Remove any non-alphanumeric character from the string.
        3. Remove any stop words from the text.
        """
        # Remove Unicode characters
        processed_string = self._remove_unicode(input_string)
        # Lower case the text
        processed_string = self._lowercase_text(processed_string)
        # Removing non-alphanumeric characters
        processed_string = self._only_keep_alphanumeric(processed_string)
        # Removing stop words
        processed_string = self._remove_stopwords(processed_string)

        return processed_string


class DatasetPrep(object):
    """
    Class object for the Data Processing of the input dataset.
    """

    def __init__(
        self,
        dataset_path: str,
        **kwargs,
    ):
        """
        Class object for the Data Processing of the input dataset.

        Parameters
        ------------
        dataset_path : str
            Path / URL to the input dataset.
        """
        # Path to the output dataset
        self.datasets_dir = gu.get_project_paths()["data"]

        # Other parameters
        for colname in [
            "save_to_disk",
            "document_id_colname",
            "title_colname",
            "content_colname",
            "clean_content_colname",
        ]:
            setattr(self, colname, kwargs.get(colname, getattr(dv, colname)))

        # Initializing dataset
        self.dataset_path = dataset_path
        self.raw_dataset = self._get_dataset()

        # Extracting the number of rows and columns, and column names
        (
            self.n_rows,
            self.n_cols,
            self.columns_names,
        ) = self._get_columns_and_shape()

        # Initializing NLP-Prep Object
        self.nlp_obj = NLPPrep()

    def show_params(self):
        """
        Method for displaying the set of input parameters of the class.
        """

        gu.show_params(
            params_dict=self.__dict__,
            logger=logger,
            columns_to_omit=["raw_dataset"],
        )

    def _get_dataset(self) -> pd.DataFrame:
        # sourcery skip: class-extract-method
        """
        Method for extracting the dataset from the input source.

        Returns
        ----------
        raw_dataset : pandas.DataFrame
            DataFrame containing the data from the input source.
        """
        logger.info(f">> Extracting dataset from `{self.dataset_path}`")

        # Reading in dataset
        raw_dataset = pd.read_csv(self.dataset_path)

        # Saving to disk, if applicable
        if self.save_to_disk:
            dataset_filepath = self.datasets_dir.joinpath("raw_dataset.csv")
            dataset_filepath.parent.mkdir(exist_ok=True, parents=True)
            raw_dataset.to_csv(dataset_filepath, header=True, index=True)

            logger.info(f">> Raw dataset saved to '{str(dataset_filepath)}'")

        return raw_dataset

    def _get_columns_and_shape(self) -> Tuple[int, int, List]:
        # sourcery skip: use-fstring-for-formatting
        """
        Method for extracting the columns and information about the
        raw dataset.

        Returns
        ----------
        n_rows : int
            Number of rows in the original dataset.

        n_cols : int
            Number of columns in the original dataset.

        column_names_arr : list
            List of columns from the original dataset.
        """
        # Number of rows and columns
        n_rows, n_columns = self.raw_dataset.shape

        logger.info(
            ">> There are '{}' rows and '{}' columns in the dataset".format(
                n_rows,
                n_columns,
            )
        )

        # Column names
        column_names_arr = sorted(self.raw_dataset.columns)

        logger.info(
            ">> Columns in the dataset: \n\t{}".format(
                "\n\t".join(column_names_arr)
            )
        )

        return n_rows, n_columns, column_names_arr

    def _process_text(self, input_text: str) -> str:
        """
        Method for applying NLP-based techniques on an input text in order
        to prepare it to be used by the embedding algorithm.

        Parameters
        -----------
        input_text : str
            Variable corresponding to the input text.

        Returns
        -----------
        processed_text : str
            Processed version of the ``input_text``.

        Notes
        ----------
        This function will perform the following NLP-based techniques:

        1. Make the text lowercase.
        2. Remove any non-alphanumeric character from the string.
        3. Remove any stop words from the text.
        """

        return self.nlp_obj.process_text(input_string=input_text)

    def clean_dataset(self) -> pd.DataFrame:
        """
        Method for cleaning the raw dataset and creating a clean version
        of the dataset.

        Returns
        ---------
        dataset_clean : pandas.DataFrame
            Clean version of the input dataset, after having gone through
            data-cleaning techniques.
        """
        # --- Start time
        logger.info(">> Data cleaning process ...")
        start_time = datetime.now()
        #

        # --- Making a copy of the raw dataset
        dataset_df = self.raw_dataset.copy()

        # --- Data-cleaning techniques
        # Removing duplicates
        dataset_df.drop_duplicates(keep="first", inplace=True)

        # Removing entries that have 'NaN' in the dataset
        dataset_df.dropna(how="any", inplace=True)

        # Casting proper data types
        dataset_df = dataset_df.astype(str)

        # Resetting the index of the dataset
        dataset_df.reset_index(drop=True, inplace=True)

        # Removing trailing whitespaces
        for colname in [self.document_id_colname, self.title_colname]:
            dataset_df.loc[:, colname] = dataset_df[colname].apply(
                lambda x: x.strip()
            )

        # Processing content
        dataset_df.loc[:, getattr(self, "clean_content_colname")] = dataset_df[
            getattr(self, "content_colname")
        ].apply(lambda text: self.nlp_obj.process_text(text))

        # --- Saving to disk, if applicable
        if self.save_to_disk:
            dataset_filepath = self.datasets_dir.joinpath("clean_dataset.csv")
            dataset_filepath.parent.mkdir(exist_ok=True, parents=True)
            dataset_df.to_csv(dataset_filepath, header=True, index=True)

            logger.info(f">> Clean dataset saved to '{str(dataset_filepath)}'")

        # --- End time
        end_time = datetime.now()
        logger.info(f">> Finished at: {end_time}")
        logger.info(f">> Took: {end_time - start_time}")
        logger.info(">> Data cleaning process ... DONE")

        return dataset_df

    def push_dataset_to_hub(
        self,
        dataset: pd.DataFrame,
        dataset_name: str,
        username: Optional[Union[None, str]] = None,
    ):
        """
        Method for pushing the ``dataset`` to the HuggingFace's Hub.

        Parameters
        -------------
        dataset : pandas.DataFrame
            Dataset that will be pushed to HuggingFace.

        dataset_name : str
            Name of the dataset to use.

        username : str, NoneType, optional
            Username to use when pushing the dataset to the HuggingFace
            Hub. This variable is set to ``None`` by default.
        """
        # Initializing class object
        hf_obj = hf.HuggingFaceHelper()

        # Transforming dataset type
        hf_dataset = hf_obj.convert_dataframe_to_dataset(input_df=dataset)

        # Push dataset to hub
        hf_obj.push_dataset(
            dataset=hf_dataset,
            dataset_name=dataset_name,
            username=username,
        )
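A minimal sketch of these classes in use (the CSV path matches the
`data/raw_dataset.csv` file added above; column names fall back to the
defaults in `src/utils/default_variables.py`):

```python
from src.classes.data_preparation import DatasetPrep, NLPPrep

# Stand-alone text cleanup: drop Unicode, lowercase, strip
# non-alphanumerics and stop words
nlp = NLPPrep()
print(nlp.process_text("The Senate's 3rd Speech!"))  # -> "senates 3rd speech"

# Full pipeline: read the raw CSV, clean it, and get a DataFrame back
prep = DatasetPrep(dataset_path="data/raw_dataset.csv")
clean_df = prep.clean_dataset()
```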
src/classes/hugging_face_utils.py
ADDED
@@ -0,0 +1,223 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Module that includes utilities for interacting with HuggingFace
"""

import logging
import os
from typing import Dict, Optional, Union

import pandas as pd
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi

from src.utils import default_variables as dv

__all__ = ["HuggingFaceHelper"]


logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
logger.setLevel(level=logging.INFO)


class HuggingFaceHelper(object):
    """
    Class definition for creating, interacting, and sharing Datasets.
    """

    def __init__(self, **kwargs: Dict) -> None:
        """
        Class definition for creating, interacting, and sharing Datasets.
        """
        # Name of the HuggingFace token as stored in the user's environment
        self.token_name = kwargs.get("token_name", dv.hugging_face_token_name)
        self.username = kwargs.get(
            "username",
            os.environ.get(dv.hugging_face_username_name),
        )

        # HuggingFace endpoint
        self.api_endpoint = "https://huggingface.co"
        self.api = self._authenticate_api()

    def _authenticate_api(self) -> HfApi:
        """
        Method for authenticating with HuggingFace using an authentication
        token.

        Returns
        ---------
        huggingface_api : huggingface_hub.hf_api.HfApi
            Object corresponding to the HuggingFace API after authentication.
        """
        # Check that token is part of the user's environment
        if not os.environ.get(self.token_name):
            msg = f">>> HuggingFace API Token '{self.token_name}' not defined!"
            logger.error(msg)
            raise ValueError(msg)

        # Initializing API object
        return HfApi(
            endpoint=self.api_endpoint,
            token=os.environ.get(self.token_name),
        )

    def convert_dataframe_to_dataset(
        self,
        input_df: pd.DataFrame,
    ) -> Dataset:
        """
        Function to convert an existing DataFrame into a ``Dataset`` object

        Parameters
        -------------
        input_df : pandas.DataFrame
            Variable corresponding to the DataFrame to convert.

        Returns
        -----------
        dataset_obj : datasets.Dataset
            Dataset object with the same data as ``input_df``.
        """

        return Dataset.from_pandas(df=input_df)

    def get_dataset_from_hub(
        self,
        dataset_name: str,
        username: Optional[Union[None, str]] = None,
        split: Optional[Union[None, str]] = None,
    ) -> Dataset:
        # sourcery skip: extract-duplicate-method, use-fstring-for-formatting
        """
        Method for extracting the Dataset from HuggingFace.

        Parameters
        ------------
        dataset_name : str
            Name of the dataset to extract from HuggingFace's Hub.

        username : str, NoneType, optional
            Username to use when extracting the dataset from HuggingFace Hub.
            This variable is set to ``None`` by default.

        split : str, NoneType, optional
            Type of ``split`` to load for the Dataset. If ``None``, the
            method will extract all splits. This variable is set to
            ``None`` by default.

        Returns
        --------
        dataset_obj : datasets.Dataset
            Variable corresponding to the dataset that was extracted
            from the HuggingFace Hub.
        """
        # 'dataset_name' - Type
        dataset_name_type_arr = (str,)
        if not isinstance(dataset_name, dataset_name_type_arr):
            msg = (
                ">> 'dataset_name' ({}) is not a valid input type ({})".format(
                    type(dataset_name),
                    dataset_name_type_arr,
                )
            )
            logger.error(msg)
            raise TypeError(msg)
        # 'username' - Type
        username_type_arr = (str, type(None))
        if not isinstance(username, username_type_arr):
            msg = ">> 'username' ({}) is not a valid input type ({})".format(
                type(username),
                username_type_arr,
            )
            logger.error(msg)
            raise TypeError(msg)
        # 'split' - Type
        split_type_arr = (str, type(None))
        if not isinstance(split, split_type_arr):
            msg = ">> 'split' ({}) is not a valid input type ({})".format(
                type(split),
                split_type_arr,
            )
            logger.error(msg)
            raise TypeError(msg)

        # Defining the path to the dataset in HF.
        dataset_path = (
            f"{username}/{dataset_name}" if username else dataset_name
        )

        return load_dataset(dataset_path, split=split)

    def push_dataset(
        self,
        dataset: Dataset,
        dataset_name: str,
        username: Optional[Union[None, str]] = None,
    ):  # sourcery skip: extract-duplicate-method, use-fstring-for-formatting
        """
        Method for pushing an existing local Dataset to HuggingFace.
        """
        # --- Check input type
        # 'dataset' - Type
        dataset_type_arr = (Dataset,)
        if not isinstance(dataset, dataset_type_arr):
            msg = ">> 'dataset' ({}) is not a valid input type ({})".format(
                type(dataset),
                dataset_type_arr,
            )
            logger.error(msg)
            raise TypeError(msg)
        # 'dataset_name' - Type
        dataset_name_type_arr = (str,)
        if not isinstance(dataset_name, dataset_name_type_arr):
            msg = (
                ">> 'dataset_name' ({}) is not a valid input type ({})".format(
                    type(dataset_name),
                    dataset_name_type_arr,
                )
            )
            logger.error(msg)
            raise TypeError(msg)
        # 'username' - Type
        username_type_arr = (str, type(None))
        if not isinstance(username, username_type_arr):
            msg = ">> 'username' ({}) is not a valid input type ({})".format(
                type(username),
                username_type_arr,
            )
            logger.error(msg)
            raise TypeError(msg)

        # Defining the path to the dataset in HF.
        dataset_path = (
            f"{username}/{dataset_name}" if username else dataset_name
        )

        # Pushing dataset to HuggingFace
        dataset.push_to_hub(
            repo_id=dataset_path,
            token=os.environ.get(self.token_name),
        )
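A sketch of the helper in use, assuming the HuggingFace token and username
environment variables named in `src/utils/default_variables.py` are set (the
dataset name below is a placeholder):

```python
import pandas as pd

from src.classes.hugging_face_utils import HuggingFaceHelper

helper = HuggingFaceHelper()

# DataFrame -> datasets.Dataset -> HuggingFace Hub
df = pd.DataFrame({"title": ["On Duties"], "content": ["..."]})
dataset = helper.convert_dataframe_to_dataset(input_df=df)
helper.push_dataset(
    dataset=dataset,
    dataset_name="my-demo-dataset",  # placeholder name
    username=helper.username,
)

# Load it back from the Hub, selecting the 'train' split
reloaded = helper.get_dataset_from_hub(
    dataset_name="my-demo-dataset",  # placeholder name
    username=helper.username,
    split="train",
)
```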
src/classes/semantic_search_engine.py
ADDED
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# MIT License
|
2 |
+
#
|
3 |
+
# Copyright (c) 2023 Victor Calderon
|
4 |
+
#
|
5 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
# of this software and associated documentation files (the "Software"), to deal
|
7 |
+
# in the Software without restriction, including without limitation the rights
|
8 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
# copies of the Software, and to permit persons to whom the Software is
|
10 |
+
# furnished to do so, subject to the following conditions:
|
11 |
+
#
|
12 |
+
# The above copyright notice and this permission notice shall be included in
|
13 |
+
# all copies or substantial portions of the Software.
|
14 |
+
#
|
15 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
# SOFTWARE.
|
22 |
+
|
23 |
+
import logging
|
24 |
+
from typing import Dict, Optional
|
25 |
+
|
26 |
+
import numpy as np
|
27 |
+
import pandas as pd
|
28 |
+
import torch
|
29 |
+
from datasets import Dataset
|
30 |
+
from sentence_transformers import SentenceTransformer
|
31 |
+
|
32 |
+
from src.utils import default_variables as dv
|
33 |
+
|
34 |
+
__author__ = ["Victor Calderon"]
|
35 |
+
__copyright__ = ["Copyright 2023 Victor Calderon"]
|
36 |
+
__all__ = ["SemanticSearchEngine"]
|
37 |
+
|
38 |
+
logger = logging.getLogger(__name__)
|
39 |
+
logging.basicConfig(
|
40 |
+
level=logging.INFO,
|
41 |
+
format="%(asctime)s [%(levelname)s]: %(message)s",
|
42 |
+
)
|
43 |
+
logger.setLevel(logging.INFO)


# --------------------------- CLASS DEFINITIONS -------------------------------


class SemanticSearchEngine(object):
    """
    Class object for running Semantic Search on the input dataset.
    """

    def __init__(self, **kwargs):
        """
        Initialize the class object for running Semantic Search on the
        input dataset.
        """
        # --- Defining variables
        # Device to use, i.e. CPU or GPU
        self.device = self._get_device()
        # Embedder model to use
        self.model = "paraphrase-mpnet-base-v2"
        # Defining the embedder
        self.embedder = self._get_embedder()

        # Corpus embeddings
        self.source_colname = kwargs.get(
            "source_colname",
            "summary",
        )
        self.embeddings_colname = kwargs.get(
            "embeddings_colname",
            dv.embeddings_colname,
        )

        # Variables used for running semantic search
        self.corpus_dataset_with_faiss_index = kwargs.get(
            "corpus_dataset_with_faiss_index"
        )

    def _get_device(self) -> str:
        """
        Method for determining the device to use.

        Returns
        ----------
        device_type : str
            Type of device to use (e.g. 'cpu' or 'cuda').

            Options:
                - ``cpu`` : Uses a CPU.
                - ``cuda`` : Uses a GPU.
        """
        # Determining the type of device to use
        device_type = "cuda" if torch.cuda.is_available() else "cpu"

        logger.info(f">> Running on a '{device_type.upper()}' device")

        return device_type

    def _get_embedder(self):
        """
        Method for extracting the Embedder model.

        Returns
        ---------
        embedder : model
            Variable corresponding to the Embeddings model.
        """
        embedder = SentenceTransformer(self.model)
        embedder.to(self.device)

        return embedder

    def generate_corpus_index_and_embeddings(
        self,
        corpus_dataset: Dataset,
    ) -> Dataset:
        """
        Method for generating the Text Embeddings and FAISS indices from
        the input dataset.

        Parameters
        ------------
        corpus_dataset : datasets.Dataset
            Dataset containing the text to use to create the text
            embeddings and FAISS indices.

        Returns
        ----------
        corpus_dataset_with_embeddings : datasets.Dataset
            Dataset containing the original data from ``corpus_dataset``
            plus the corresponding text embeddings of the ``source_colname``
            column.
        """
        torch.set_grad_enabled(False)

        # --- Generate text embeddings for the source column
        corpus_dataset_with_embeddings = corpus_dataset.map(
            lambda corpus: {
                self.embeddings_colname: self.embedder.encode(
                    corpus[self.source_colname]
                )
            },
            batched=True,
            desc="Computing Semantic Search Embeddings",
        )

        # --- Adding FAISS index
        corpus_dataset_with_embeddings.add_faiss_index(
            column=self.embeddings_colname,
            faiss_verbose=True,
            device=None if self.device == "cpu" else 1,
        )

        return corpus_dataset_with_embeddings

    def run_semantic_search(
        self,
        query: str,
        top_n: Optional[int] = 5,
    ) -> Dict:  # sourcery skip: extract-duplicate-method
        """
        Method for running a semantic search on a query after having
        created the corpus of the text embeddings.

        Parameters
        --------------
        query : str
            Text query to use for searching the database.

        top_n : int, optional
            Variable corresponding to the 'Top N' values to return based on
            the similarity score between the input query and the corpus.
            This variable is set to ``5`` by default.

        Returns
        ---------
        match_results : dict
            Dictionary containing the metadata of each of the articles
            that were in the Top-N in terms of being most similar to the
            input query ``query``.
        """
        # --- Checking input parameters
        # 'query' - Type
        query_type_arr = (str,)
        if not isinstance(query, query_type_arr):
            msg = ">> 'query' ({}) is not a valid input type ({})".format(
                type(query), query_type_arr
            )
            logger.error(msg)
            raise TypeError(msg)
        # 'top_n' - Type
        top_n_type_arr = (int,)
        if not isinstance(top_n, top_n_type_arr):
            msg = ">> 'top_n' ({}) is not a valid input type ({})".format(
                type(top_n), top_n_type_arr
            )
            logger.error(msg)
            raise TypeError(msg)

        # 'top_n' - Value
        if top_n <= 0:
            msg = f">> 'top_n' ({top_n}) must be larger than '0'!"
            logger.error(msg)
            raise ValueError(msg)

        # --- Checking that the encoder has been indexed correctly
        if self.corpus_dataset_with_faiss_index is None:
            msg = ">>> The FAISS index was not properly set!"
            logger.error(msg)
            raise ValueError(msg)

        # --- Encode the input query and extract the embedding
        query_embedding = self.embedder.encode(query)

        # --- Extracting the top-N results
        (
            scores,
            results,
        ) = self.corpus_dataset_with_faiss_index.get_nearest_examples(
            self.embeddings_colname,
            query_embedding,
            k=top_n,
        )

        # --- Sorting from highest to lowest
        # NOTE: We need to deconstruct the 'results' to be able to organize
        # the results
        parsed_results = pd.DataFrame.from_dict(
            data=results,
            orient="columns",
        )
        parsed_results.loc[:, "relevance"] = scores

        # Sorting in descending order
        parsed_results = parsed_results.sort_values(
            by=["relevance"],
            ascending=False,
        ).reset_index(drop=True)

        # Casting data type for the 'relevance'
        parsed_results.loc[:, "relevance"] = parsed_results["relevance"].apply(
            lambda x: str(np.round(x, 5))
        )

        # Only keeping certain columns
        columns_to_keep = ["_id", "title", "relevance", "content"]

        return parsed_results[columns_to_keep].to_dict(orient="index")
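A minimal usage sketch for the class above (not part of the commit; the username placeholder is hypothetical, and the Hub dataset and `.faiss` file are assumed to have been produced by `src/training/create_faiss_corpus_index.py` further down):

# Minimal usage sketch (illustrative only; '<username>' is a placeholder).
from datasets import load_dataset

from src.classes import semantic_search_engine as ss
from src.utils import default_variables as dv

# Load the corpus that already contains the text embeddings
corpus = load_dataset(
    f"<username>/{dv.dataset_faiss_embeddings_name}", split="train"
)
# Re-attach the serialized FAISS index to the embeddings column
corpus.load_faiss_index(
    index_name=dv.embeddings_colname,
    file="data/cicero_faiss_index.faiss",
)

# Run a query against the indexed corpus
engine = ss.SemanticSearchEngine(corpus_dataset_with_faiss_index=corpus)
print(engine.run_semantic_search(query="economic policy", top_n=5))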
src/data_processing/__init__.py
ADDED
@@ -0,0 +1,21 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
src/data_processing/prepare_dataset.py
ADDED
@@ -0,0 +1,196 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Module for preparing the input dataset.
"""

import logging
from pathlib import Path
from typing import Dict

from src.classes import data_preparation as dp
from src.utils import default_variables as dv
from src.utils import general_utilities as gu

__author__ = ["Victor Calderon"]
__copyright__ = ["Copyright 2023 Victor Calderon"]
__all__ = []

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s]: %(message)s",
)
logger.setLevel(logging.INFO)


# ---------------------------- PROJECT VARIABLES ------------------------------

MODULE_DESCRIPTION = "Module for data preparation"
MODULE_VERSION = "1.0"


# ----------------------------- INPUT PARAMETERS ------------------------------


def get_parser():
    """
    Function to get the input parameters to the script.
    """
    # Defining the 'parser' object to use
    parser = gu._get_parser_obj(description=MODULE_DESCRIPTION)

    # Path to the input dataset
    parser.add_argument(
        "--dataset-path",
        dest="dataset_path",
        default=dv.cicero_dataset_url,
        type=str,
        help="""
        Path / URL to the input dataset.
        [Default: '%(default)s']
        """,
    )

    return parser.parse_args()


# ------------------------------- FUNCTIONS ----------------------------------


def _resolve_input_object_path(object_path: str) -> str:
    """
    Check whether the path corresponds to a local file or a URL.

    Parameters
    -------------
    object_path : str
        Path of the input object.

    Returns
    ----------
    parsed_object_path : str
        Modified / parsed version of the input object ``object_path``.

    Raises
    ------------
    TypeError : Error
        This error gets raised whenever the input object is neither
        a 'file' nor a valid 'url'.
    """
    object_type = gu.check_url_or_file_type(object_path=object_path)

    if object_type == "unspecified":
        msg = (
            f">>> Unspecified data type for '{object_path}' or does not exist"
        )
        logger.error(msg)
        raise TypeError(msg)

    return (
        object_path
        if object_type == "url"
        else str(Path(object_path).resolve())
    )


def _temp_create_dataset_with_summaries():
    """
    Function to **temporarily** create the Dataset object in HuggingFace
    using the dataset with summaries for each of the articles.

    Notes
    --------
    This is a temporary solution UNTIL the ``Summarizer`` is put in place.
    """
    # Path to the dataset
    dataset_filepath = str(
        (
            gu.get_project_paths()
            .get("src")
            .joinpath(
                "utils",
                "gpt35_summaries",
                "df_embed_out2.csv",
            )
        ).resolve()
    )

    # Reading in dataset
    data_prep_obj = dp.DatasetPrep(dataset_path=dataset_filepath)

    # Uploading it to HuggingFace Hub
    data_prep_obj.push_dataset_to_hub(
        dataset=data_prep_obj.raw_dataset,
        dataset_name=dv.summaries_dataset_name,
    )

    return


# ------------------------------ MAIN FUNCTIONS -------------------------------


def main(params_dict: Dict):
    """
    Main function to process the data.
    """
    # Determine if the path corresponds to a file or a URL
    params_dict["object_path"] = _resolve_input_object_path(
        params_dict["dataset_path"]
    )

    # Showing set of input parameters
    gu.show_params(params_dict=params_dict, logger=logger)

    # Initializing input parameters
    data_prep_obj = dp.DatasetPrep(dataset_path=params_dict["object_path"])
    data_prep_obj.show_params()
    clean_dataset = data_prep_obj.clean_dataset()

    logger.info(f"\n>>> Raw dataset: \n{data_prep_obj.raw_dataset}\n")
    logger.info(f"\n>>> Clean dataset: \n{clean_dataset}\n")

    # --- Pushing datasets to HuggingFace Hub
    # 'Raw' dataset
    data_prep_obj.push_dataset_to_hub(
        dataset=data_prep_obj.raw_dataset,
        dataset_name=dv.raw_dataset_name,
    )
    # 'Clean' dataset
    data_prep_obj.push_dataset_to_hub(
        dataset=clean_dataset,
        dataset_name=dv.clean_dataset_name,
    )

    # Dataset with summaries
    _temp_create_dataset_with_summaries()

    return


if __name__ == "__main__":
    # Getting input parameters
    params_dict = vars(get_parser())
    # Running main function
    main(params_dict=params_dict)
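Usage note (an inference from the argument parser above, not stated in the commit): the script can be run from the project root as `python src/data_processing/prepare_dataset.py --dataset-path <path-or-URL>`. Because it pushes the resulting datasets to the HuggingFace Hub, `HUGGING_FACE_HUB_TOKEN` and `HUGGING_FACE_USERNAME` are presumably expected in the environment (see `.envrc`).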
src/focused_summary_example.py
ADDED
@@ -0,0 +1,20 @@
from utils.gpt35_summaries.summarizer import Summarizer

s = Summarizer()
article_text = """
div g alt rchttpswwwaeiorgwpcontentuploads202010thatcherpngx91208 iv pa relnoreferrer noopener efhttpsenwikipediaorgwikimargaret_thatcher margaret hilda pictured above was born on this day october 13 in 1925 and today would have been her birthday unfortunately she died on april 8 2013 at the age of 87 to honor baroness thatcher on her birthday here is my annual tribute to the iron lady in recognition of her significant contributions during her political career including serving as the prime minister of the k from 1979 to 1990 below are some videos quotations and related articles to celebrate prime minister margaret thatchers birthday and her rich legacy defending liberty and freedom and fighting socialism p llowfullscreen width640 height350iframecenter 1 the video above features margaret thatchers address to the conservative party conference in 1983 when she classwpblockquoteplet us never forget this fundamental truth the state has no source of money other than the money people earn themselves if the state wishes to spend more it can do so only by borrowing your savings or by taxing you more ere is no such thing as public money there is only taxpayers te m a very fundamental truth that is frequently forgotten any time you see or hear the terms public funding public funds government funding or government funds be sure to substitute taxpayer funding and taxpayer p 2 here are the five reasons margaret thatcher is still an inspiration to women trong via the a relnoreferrer noopener efhttpiwforgblog2804951topfivereasonsmargaretthatcherisstillaninspirationtowomentoday ndependent on margaret thatchers birthday in 2017 summarized he didnt use her sex to influence her he was he challenged the status he had to work for her he was a modern pp llowfullscreen rchttpswwwyoutubecomembedrv5t6rc6yvg width640 height350iframecenter p 3 the video above is margaret thatchers last house of commons speech on november 22 1990 which is known as thatchers last stand against socialism heres the a relnoreferrer noopener efhttpswwwmargaretthatcherorgdocument108256 full and heres a think that the hon gentleman knows that i have the same contempt for his socialist policies as the people of east europe who have experienced them have for theirs i think that i must have hit the right nail on the head when i pointed out that the logic of those policies is that they would rather the poor were poorer once they start to talk about the gap they would rather that the gap were tem[indicating[emdown here not [indicating[embutem[indicatingem] so long as the gap is smaller they would rather have the poor poorer one does not create wealth and opportunity that way one does not create a ropertyowning democracy that way 4 here are 10 great margaret thatcher quotes which are just as relevant and timely for america today if not more so than they were for the k more than a ercentury ago listen up marx oc thanks to larry reed for a relnoreferrer noopener efhttpswwwfeeorgarticlesmargaretthatcheronsocialism20ofherbestquotesfbclidiwar0ypr1qt8cco_rft4xqrr33_ebujx0aymff6mfhh149d_1uwtiyagjoblk ome of these classwpblockquotep1 the problem with socialism is that you eventually run out of other peoples moneypp2 do you know that one of the great problems of our age is that we are governed by people who care more about feelings than they do about thoughts and deaspp3 i think weve been through a period where too many people have been given to understand that if they have a problem its the governments job to cope with it i have a problem ill get a grant 
im homeless the government must house me theyre casting their problem on society and you know there is no such thing as society there are individual men and women and there are families and no government can do anything except through people and people must look to themselves first its our duty to look after ourselves and then also to look after our neighbor people have got the too much in mind without the obligations theres no such thing as entitlement unless someone has first met an obligationpp4 no one would remember the good samaritan if hed only had good intentions he had money as wellpp5 the philosophical reason for which we are against nationalization and for private enterprise is because we believe that economic progress comes from the inventiveness ability determination and the pioneering spirit of extraordinary men and women if they cannot exercise that spirit here they will go away to another free enterprise country which will then make more economic progress than we do we ought in fact to be encouraging small firms and small companies because the extent to which innovation comes through these companies is endouspp6 our challenge is to create the kind of economic background which enables private initiative and private enterprise to flourish for the benefit of the consumer employee the pensioner and society as a believe we should judge people on merit and not on background i believe the person who is prepared to work hardest should get the greatest rewards and keep them after tax that we should back the workers and not the shirkers that it is not only permissible but praiseworthy to want to benefit your own family by your own effortspp7 i place a profound eliefindeed a fervent faithin the virtues of elfreliance and personal independence on these is founded the whole case for the free society for the assertion that human progress is best achieved by offering the freest possible scope for the development of individual talents qualified only by a respect for the qualities and the freedom of othersfor many years there has been a subtle erosion of the essential virtues of the free society elfreliance has been sneered at as if it were an absurd suburban pretention thrift has been denigrated as if it were greed the desire of parents to choose and to struggle for what they themselves regarded as the best possible education for their children has been cornedpp8 what are the lessons then that weve learned from the last thirty years first that the pursuit of equality itself is a mirage whats more desirable and more practicable than the pursuit of equality is the pursuit of equality of opportunity and opportunity means nothing unless it includes the right to be unequal and the freedom to be different one of the reasons that we value individuals is not because theyre all the same but because theyre all different i believe you have a saying in the middle west dont cut down the tall poppies let them rather grow tall i would say let our children grow tall and some taller than others if they have the ability in them to do so because we must build a society in which each citizen can develop his full potential both for his own benefit and for the community as a whole a society in which originality skill energy and thrift are rewarded in which we encourage rather than restrict the variety and richness of human naturepp9 some socialists seem to believe that people should be numbers in a state computer we believe they should be individuals we are all unequal no one thank heavens is like anyone else 
however much the socialists may pretend otherwise we believe that everyone has the right to be unequal but to us every human being is equally mportantpp10 there is no such thing as safe socialism if its safe its not socialism and if its socialism its not safe the signposts of socialism point downhill to less freedom less prosperity downhill to more muddle more failure if we follow them to their destination they will lead this nation into lockquote happy birthday margaret thatcher p e post a efhttpswwwaeiorgcarpediemhappy96thbirthdaymargaretthatcherhappy birthday margaret appeared first on a efhttpswwwaeiorgamerican enterprise institute
"""
search_string = "most significant contribution of thatcher"

print("Focussed summary for search string", f'"{search_string}":')
print(s._run_model(article_text, search_string))

print("Vanilla summary:")
print(s._run_model(article_text))

# Example output:
# Focussed summary for search string "most significant contribution of thatcher":
# In commemoration of former British Prime Minister Margaret Thatcher's birthday, which falls on October 13, American Enterprise Institute pays tribute to her political career and significant contributions, including serving as the Prime Minister of the UK from 1979 to 1990. The article features various videos, quotations, and articles that celebrate Thatcher's rich legacy of defending freedom, liberty, and fighting socialism. The article lists several of Thatcher's speeches, quotes, and her last stand against socialism.
# Overall, the article does not explicitly state Thatcher's most significant contribution, but it heavily implies that Thatcher's significant contributions were her defense of liberty and freedom and her fight against socialism.
# Vanilla summary:
# To commemorate Margaret Thatcher's birthday, born on October 13 in 1925, AEI presents an annual tribute to the Iron Lady. Thatcher's contribution during her political career and her rich legacy defending liberty and freedom and fighting socialism are celebrated. The article provides several videos, quotations, and related articles to commemorate the Prime Minister's birthday and her lasting influence as a female leader in the UK. Among Thatcher's inspirational quotes, she advocated for private enterprise as a means of economic progress, and emphasized the importance of individual responsibility, merit, and personal independence in building a free society.
src/training/__init__.py
ADDED
@@ -0,0 +1,21 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
src/training/create_faiss_corpus_index.py
ADDED
@@ -0,0 +1,209 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import logging
from pathlib import Path
from typing import Dict

from src.classes import hugging_face_utils as hf
from src.classes import semantic_search_engine as ss
from src.utils import default_variables as dv
from src.utils import general_utilities as gu

__author__ = ["Victor Calderon"]
__copyright__ = ["Copyright 2023 Victor Calderon"]
__all__ = []

logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s]: %(message)s",
)
logger.setLevel(logging.INFO)

# ---------------------------- PROJECT VARIABLES ------------------------------

MODULE_DESCRIPTION = "Module for creating the corpus FAISS index and embeddings"
MODULE_VERSION = "1.0"


# ----------------------------- INPUT PARAMETERS ------------------------------


def get_parser():
    """
    Function to get the input parameters to the script.
    """
    # Defining the 'parser' object to use
    parser = gu._get_parser_obj(description=MODULE_DESCRIPTION)

    # Name of the input dataset
    parser.add_argument(
        "--dataset-name",
        dest="dataset_name",
        default=dv.summaries_dataset_name,
        type=str,
        help="""
        Name of the HuggingFace dataset.
        [Default: '%(default)s']
        """,
    )
    # Name of the output Dataset with FAISS index and embeddings
    parser.add_argument(
        "--output-dataset-name",
        dest="output_dataset_name",
        default=dv.dataset_faiss_embeddings_name,
        type=str,
        help="""
        Name of the output dataset that will contain a FAISS index and the
        text embeddings of the summaries.
        [Default: '%(default)s']
        """,
    )
    # Name of the HuggingFace repository
    parser.add_argument(
        "--repository-name",
        dest="repository_name",
        default=dv.hugging_face_repository_name,
        type=str,
        help="""
        Name of the HuggingFace repository to use for storing artifacts.
        [Default: '%(default)s']
        """,
    )
    # Name of the FAISS Index
    parser.add_argument(
        "--faiss-index-name",
        dest="faiss_index_name",
        default=dv.faiss_index_name,
        type=str,
        help="""
        Name of the FAISS Index of the output dataset.
        [Default: '%(default)s']
        """,
    )

    return parser.parse_args()


# ------------------------------- FUNCTIONS ----------------------------------


def create_faiss_index_and_embeddings_from_dataset(params_dict: Dict):
    """
    Function to create a Dataset object with a FAISS index and the
    corresponding text embeddings.

    Parameters
    -----------
    params_dict : dict
        Dictionary with the set of parameters that are used throughout
        the project.
    """
    # --- Initializing object for interacting with Datasets
    hf_obj = hf.HuggingFaceHelper()

    # --- Download dataset from HuggingFace Hub
    dataset_obj = hf_obj.get_dataset_from_hub(
        dataset_name=params_dict["dataset_name"],
        username=hf_obj.username,
        split="train",
    )

    # --- Generate the FAISS index and Text embeddings
    # Initialize Semantic Search engine
    semantic_search_obj = ss.SemanticSearchEngine()

    # Create FAISS index and the dataset with text embeddings
    dataset_with_embeddings_obj = (
        semantic_search_obj.generate_corpus_index_and_embeddings(
            corpus_dataset=dataset_obj
        )
    )

    # --- Extract FAISS index and upload it to HuggingFace Hub
    # Path to the output file that will contain the FAISS index
    faiss_index_local_path = str(
        gu.get_project_paths()["data"].joinpath(
            f'{params_dict["faiss_index_name"]}.faiss'
        )
    )

    dataset_with_embeddings_obj.save_faiss_index(
        index_name=semantic_search_obj.embeddings_colname,
        file=faiss_index_local_path,
    )

    # Creating repository in HuggingFace
    repo_name = f'{hf_obj.username}/{params_dict["repository_name"]}'
    repo_type = "dataset"

    _ = hf_obj.api.create_repo(
        repo_id=repo_name,
        repo_type=repo_type,
        exist_ok=True,
    )

    # Uploading FAISS index
    hf_obj.api.upload_file(
        path_or_fileobj=faiss_index_local_path,
        path_in_repo=Path(faiss_index_local_path).name,
        repo_id=repo_name,
        repo_type=repo_type,
    )

    # --- Upload new Dataset to HuggingFace
    # Dropping FAISS index
    dataset_with_embeddings_obj.drop_index(
        index_name=semantic_search_obj.embeddings_colname
    )

    # Pushing dataset to HuggingFace
    hf_obj.push_dataset(
        dataset=dataset_with_embeddings_obj,
        dataset_name=params_dict["output_dataset_name"],
        username=hf_obj.username,
    )

    return


# ------------------------------ MAIN FUNCTIONS -------------------------------


def main(params_dict: Dict):
    """
    Main function for creating a dataset with FAISS index.
    """
    # Showing set of input parameters
    gu.show_params(params_dict=params_dict, logger=logger)

    # Create FAISS index and Text embeddings for the dataset.
    create_faiss_index_and_embeddings_from_dataset(params_dict=params_dict)

    return


if __name__ == "__main__":
    # Getting input parameters
    params_dict = vars(get_parser())
    # Running main function
    main(params_dict=params_dict)
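Usage note (inferred from the parser defaults above): with the summaries dataset already on the Hub, the full pipeline can be triggered from the project root as `python src/training/create_faiss_corpus_index.py`, optionally overriding `--dataset-name`, `--output-dataset-name`, `--repository-name`, or `--faiss-index-name`. The resulting `.faiss` file is written under `data/` (matching the committed `data/cicero_faiss_index.faiss`) before being uploaded.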
src/utils/.DS_Store
ADDED
Binary file (6.15 kB).
src/utils/__init__.py
ADDED
@@ -0,0 +1,21 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
src/utils/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (182 Bytes).
src/utils/__pycache__/default_variables.cpython-39.pyc
ADDED
Binary file (1.05 kB).
src/utils/default_variables.py
ADDED
@@ -0,0 +1,76 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Module containing the set of default variables of the project.
"""

# Option for saving the output data to disk
save_to_disk = True

# URL to the CICERO dataset
cicero_dataset_url = "https://raw.githubusercontent.com/hamzafarooq/maven-mlsystem-design-cohort-1/main/data/df_embed.csv"  # noqa: E501

# Name of the column that corresponds to the Document ID
document_id_colname = "_id"

# Name of the column that corresponds to the title of the document.
title_colname = "title"

# Name of the column that contains the content of the document.
content_colname = "content"

# Name of the target column that will contain the parsed / clean version
# of the document's content.
clean_content_colname = "clean_content"

# Name of the 'raw' dataset
raw_dataset_name = "cicero_raw_dataset"

# Name of the 'clean' dataset
clean_dataset_name = "cicero_clean_dataset"

# Name of the dataset with summaries
summaries_dataset_name = "cicero_dataset_with_summaries"

# Name of the dataset with embeddings and FAISS index
dataset_faiss_embeddings_name = (
    "cicero_dataset_with_embeddings_and_faiss_index"
)

# Name of the environment variable with the HuggingFace Token
hugging_face_token_name = "HUGGING_FACE_HUB_TOKEN"

# Name of the environment variable with the HuggingFace Username
hugging_face_username_name = "HUGGING_FACE_USERNAME"

# Name of the HuggingFace repository
hugging_face_repository_name = "cicero_synthesizer"

# Name of the FAISS Index
faiss_index_name = "cicero_faiss_index"

# Name of the column that contains the embedding in the dataset
embeddings_colname = "embeddings"
src/utils/general_utilities.py
ADDED
@@ -0,0 +1,181 @@
# MIT License
#
# Copyright (c) 2023 Victor Calderon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Module that includes general utility functions.
"""

import argparse
import logging
import re
from argparse import ArgumentParser as _ArgumentParser
from argparse import HelpFormatter as _HelpFormatter
from operator import attrgetter as _attrgetter
from pathlib import Path
from typing import Dict, List, Optional, Union

import numpy as np

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
logger.setLevel(level=logging.INFO)


__all__ = ["get_project_paths"]


def _get_root_dir():
    """
    Function for determining the path to the root directory of the project.

    Returns
    ----------
    root_dir : str
        Path to the root directory of the project.
    """

    return str(list(Path(__file__).resolve().parents)[2].resolve())


def get_project_paths() -> Dict[str, Path]:
    """
    Function to extract the set of directories of the project.

    Returns
    ----------
    proj_dict : dict
        Dictionary containing the path to the project's directories.
    """
    # --- Defining set of directories
    # Base directory
    base_dir = Path(_get_root_dir())
    # Data directory
    data_dir = base_dir.joinpath("data").resolve()
    # Source directory / Codebase
    src_dir = base_dir.joinpath("src").resolve()

    # --- Creating project dictionary with the project directories
    proj_dict = {
        "base": base_dir,
        "data": data_dir,
        "src": src_dir,
    }

    # --- Making sure the directories exist
    for directory in proj_dict.values():
        directory.mkdir(
            exist_ok=True,
            parents=True,
        )

    return proj_dict


def is_float(s: str):
    """
    Function that checks whether or not ``s`` represents a float.
    """
    return s.count(".") == 1 and s.replace(".", "").isdigit()


def _str2bool(v):
    """
    Function that casts a 'yes'/'no'-style string to its boolean counterpart.
    """
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    elif v.lower() in ("no", "false", "f", "n", "0"):
        return False
    else:
        raise argparse.ArgumentTypeError("Boolean value expected.")


class SortingHelpFormatter(_HelpFormatter):
    def add_arguments(self, actions):
        """
        Modifier for `argparse` help parameters, that sorts them
        alphabetically.
        """
        actions = sorted(actions, key=_attrgetter("option_strings"))
        super(SortingHelpFormatter, self).add_arguments(actions)


def _get_parser_obj(description: str):
    """
    Function to create an 'argparse' ``parser`` object.
    """

    return _ArgumentParser(
        description=description,
        formatter_class=SortingHelpFormatter,
    )


def show_params(
    params_dict: Dict,
    logger: logging.Logger,
    columns_to_omit: Optional[Union[List, None]] = None,
):
    """
    Function to show the defined input parameters of the run.
    """
    # Checking input parameters
    columns_to_omit = columns_to_omit or []
    #
    msg = "\n" + "-" * 50 + "\n"
    msg += "\t---- INPUT PARAMETERS ----" + "\n"
    msg += "" + "\n"
    # Sorting keys of dictionary
    keys_sorted = np.sort(list(params_dict.keys()))
    for key_ii in keys_sorted:
        if key_ii not in columns_to_omit:
            msg += f"\t>>> {key_ii} : {params_dict[key_ii]}\n"
    #
    msg += "\n" + "-" * 50 + "\n"
    logger.info(msg)

    return


def check_url_or_file_type(object_path: str) -> str:
    """
    Function to determine whether the input variable is a file or a URL.

    Parameters
    ------------
    object_path : str
        Path to the object.

    Returns
    ------------
    object_type : str
        Type of the object.

        Options :
            - `url` : The object is a valid URL.
            - `file` : The object corresponds to a local file.
            - `unspecified` : This object is neither a file nor a URL.
    """
    # Set of regular expressions for each type
    url_pattern = r"^https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/?\S*$"

    if re.match(url_pattern, object_path):
        return "url"

    # Checking if 'object_path' is a file or directory
    return "file" if Path(object_path).is_file() else "unspecified"
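A short illustration (not part of the commit) of how `check_url_or_file_type` dispatches its three return values; the inputs below are hypothetical:

# Illustration of the URL/file dispatch above (hypothetical inputs).
from src.utils import general_utilities as gu

print(gu.check_url_or_file_type("https://example.com/data.csv"))  # "url"
print(gu.check_url_or_file_type("data/raw_dataset.csv"))  # "file", if it exists
print(gu.check_url_or_file_type("no-such-object"))  # "unspecified"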
src/utils/gpt35_summaries/__init__.py
ADDED
File without changes
src/utils/gpt35_summaries/cleanup_and_summarize.py
ADDED
@@ -0,0 +1,107 @@
# sourcery skip: use-named-expression
import csv
import sys
from pathlib import Path

from .summarizer import Summarizer

P_SCRIPT_DIR = Path(__file__).parent

csv.field_size_limit(sys.maxsize)


# HTML tag fragments to strip from the scraped article content
HTML_STRINGS = []
with open(P_SCRIPT_DIR / "html_tags.txt") as f:
    for line in f:
        tag = line.strip()
        if tag:
            HTML_STRINGS.append(tag)
HTML_STRINGS.extend(("target_blank", "relnoopen", "relnofollow"))
HTML_STRINGS = tuple(HTML_STRINGS)

# Dictionary of known English words (one word per line)
WORDS = set()
with open(P_SCRIPT_DIR / "words_alpha.txt") as f:
    for line in f:
        word = line.strip()
        if word:
            WORDS.add(word)


def filter_content(content):
    # Strip leading and trailing HTML-tag fragments from every token that
    # is not a known English word, always removing the longest match first
    c_words = content.split()
    c_filt_words = []
    for w in c_words:
        if w not in WORDS:
            while w.startswith(HTML_STRINGS):
                smax = ""
                for s in HTML_STRINGS:
                    if w.startswith(s) and len(s) > len(smax):
                        smax = s
                w = w[len(smax) :]  # noqa: E203
            while w.endswith(HTML_STRINGS):
                smax = ""
                for s in HTML_STRINGS:
                    if w.endswith(s) and len(s) > len(smax):
                        smax = s
                w = w[len(smax) :]  # noqa: E203
        if w:
            c_filt_words.append(w)
    return " ".join(c_filt_words)


def main():
    # Re-use any rows that were already summarized in a previous run
    DF_EMBED_OUT_DICT = {}
    if (P_SCRIPT_DIR / "df_embed_out.csv").exists():
        with open(
            P_SCRIPT_DIR / "df_embed_out.csv",
            encoding="ascii",
            errors="ignore",
        ) as fin:
            for csv_row in csv.DictReader(fin):
                DF_EMBED_OUT_DICT[csv_row["_id"]] = csv_row

    SUMMARIZER = Summarizer()

    with open(
        P_SCRIPT_DIR / "df_embed.csv", encoding="ascii", errors="ignore"
    ) as fin, open(
        P_SCRIPT_DIR / "df_embed_out.csv",
        "w",
        encoding="ascii",
        errors="ignore",
    ) as fout:
        csv_reader = csv.DictReader(fin)
        fieldnames = csv_reader.fieldnames[:]
        fieldnames.append("summary")
        fieldnames.append("content_filtered")
        csv_writer = csv.DictWriter(fout, fieldnames)
        csv_writer.writeheader()
        for csv_row in csv_reader:
            if csv_row["_id"] in DF_EMBED_OUT_DICT:
                print("Re-using existing data for", csv_row["_id"])
                csv_row = DF_EMBED_OUT_DICT[csv_row["_id"]]
            # Some rows only carry text in the '_id' field; treat it as content
            if not csv_row["title"] and not csv_row["content"]:
                csv_row["content"] = csv_row["_id"]
                csv_row["_id"] = ""
            content_filtered = filter_content(csv_row["content"])
            print(content_filtered)
            csv_row["content_filtered"] = content_filtered
            # input()
            if not csv_row.get("summary") and (
                csv_row["title"] or csv_row["content_filtered"]
            ):
                print("Running GPT...\n")
                while True:
                    summary = SUMMARIZER.summarize(
                        csv_row["title"], content_filtered
                    )
                    if summary:
                        break
                # input()
                csv_row["summary"] = summary
            csv_writer.writerow(csv_row)
            fout.flush()


if __name__ == "__main__":
    main()
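Two usage notes, inferred from the code above: the script uses a relative import (`from .summarizer import Summarizer`), so it should be run as a module from the project root, e.g. `python -m src.utils.gpt35_summaries.cleanup_and_summarize`; and because rows already present in `df_embed_out.csv` are re-used, an interrupted run can be restarted without re-summarizing finished articles. Note also that it expects a `words_alpha.txt` word list next to the script, which is not part of this commit.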
src/utils/gpt35_summaries/html_tags.txt
ADDED
@@ -0,0 +1,109 @@
a
abbr
address
area
article
aside
audio
b
base
bdi
bdo
blockquote
body
br
button
canvas
caption
cite
code
col
colgroup
data
datalist
dd
del
details
dfn
dialog
div
dl
dt
em
embed
fieldset
figcaption
figure
footer
form
h1
h2
h3
h4
h5
h6
head
header
hr
html
i
iframe
img
input
ins
kbd
label
legend
li
link
main
map
mark
meta
meter
nav
noscript
object
ol
optgroup
option
output
p
param
picture
pre
progress
q
rp
rt
ruby
s
samp
script
section
select
small
source
span
strong
style
sub
summary
sup
table
tbody
td
template
textarea
tfoot
th
thead
time
title
tr
track
u
ul
var
video
wbr
src/utils/gpt35_summaries/summarizer.py
ADDED
@@ -0,0 +1,157 @@
import json
import os
import sys
from typing import Optional

import requests
import tiktoken


class Summarizer:
    def __init__(self, **kwargs):
        self.openai_endpoint = "https://api.openai.com/v1/chat/completions"

        # Prompt template
        self.prompt_template = self._get_prompt_template()

        # Type of model to use
        self.model = kwargs.get("model", "gpt-3.5-turbo")

        # Model hyperparameters
        self.max_tokens = kwargs.get("max_tokens", 4096)
        self.result_tokens = kwargs.get("result_tokens", 300)

        # Model encoding
        self.model_encoding = self._get_model_encoding()

        # Token length of the prompt template
        self.prompt_token_length = self._get_number_of_tokens(
            self.prompt_template
        )

    def _get_prompt_template(self, search_string=None) -> str:
        # Defining the template to use
        template_text = """
        Create a concise, clear, and in-depth summary of the following online
        article. Adhere to the following guidelines:

        1. Sound professional, detached and avoid emotionally charged language.
        2. Make sure to describe who is discussed in the article, what are
        the events or concepts, when things happened, and, if this information is
        available, why.
        3. The summary should be between one and three paragraphs.
        """
        if search_string:
            template_text += f"""
        4. Make sure to include and emphasize any information in the article that
        relates to the following search string:
        "{search_string}"
        """

        return template_text

    def _get_model_encoding(self):
        return tiktoken.encoding_for_model(self.model)

    def _get_number_of_tokens(self, input_text: str) -> int:
        """
        Method for determining the number of tokens of the input text.

        Parameters
        -----------
        input_text : str
            Text to use for calculating its token length.

        Returns
        ---------
        text_token_length : int
            Length of the tokens of the input text.
        """

        return len(self.model_encoding.encode(input_text))

    def _run_model(
        self,
        user_content: str,
        search_string: Optional[str] = None,
        temperature: Optional[float] = 1,
    ):
        """
        Method for running the model that will create the summary for a given
        observation.

        Parameters
        ------------
        user_content : str
            Content by the user that will be sent to the model via its API.

        search_string : str, optional
            If provided, the prompt instructs the model to emphasize content
            related to this search string. This variable is set to ``None``
            by default.

        temperature : float, optional
            Amount of ``temperature`` to give to the model. This parameter
            handles the amount of creativity that the model can have when
            creating the output response. This variable is set to ``1`` by
            default.

        Returns
        ----------
        summary : str
            The content of the model's response, i.e. the generated summary.
        """
        # Creating the headers
        headers = {
            "Content-Type": "application/json",
            "Authorization": f'Bearer {os.environ["OPENAI_API_KEY"]}',
        }
        # Composing the input messages
        messages = [
            {
                "role": "system",
                "content": self._get_prompt_template(search_string),
            },
            {"role": "user", "content": user_content},
        ]
        # Parsing the request data
        request_data = {
            "model": self.model,
            "messages": messages,
            "temperature": temperature,
        }
        # Extracting the response from the model's API
        response = requests.post(
            self.openai_endpoint,
            headers=headers,
            data=json.dumps(request_data),
            timeout=60,
        )

        # Checking if the response was OK
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]
        else:
            raise RuntimeError(
                f"HTTP request failed {response.status_code}, {response.text}"
            )

    def summarize(self, title, content, search_string=None):
        content_for_summary = f"{title}\n\n{content}"
        # Token length of the prompt actually used: recompute it when a
        # search string modifies the template; otherwise use the cached value
        prompt_token_length = (
            self._get_number_of_tokens(
                self._get_prompt_template(search_string)
            )
            if search_string
            else self.prompt_token_length
        )
        data_token_length = self._get_number_of_tokens(content_for_summary)
        # Drop every 10th word until the request fits within the token budget
        while data_token_length + prompt_token_length > self.max_tokens - 10:
            print("Decimating the content.")
            content = content.split()
            del content[::10]
            content = " ".join(content)
            content_for_summary = f"{title}\n\n{content}"
            data_token_length = self._get_number_of_tokens(content_for_summary)

        # Retry until the API call succeeds
        while True:
            try:
                return self._run_model(
                    user_content=content_for_summary,
                    search_string=search_string,
                )
            except Exception as e:
                print(e, file=sys.stderr)
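A minimal usage sketch for the class above (not part of the commit), assuming `OPENAI_API_KEY` is set and the project root is on `PYTHONPATH`; the article strings are placeholders:

# Minimal usage sketch (illustrative only).
from src.utils.gpt35_summaries.summarizer import Summarizer

summarizer = Summarizer(model="gpt-3.5-turbo")
summary = summarizer.summarize(
    title="Example article title",
    content="Example article body text ...",
    search_string="optional focus phrase",  # or None for a vanilla summary
)
print(summary)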
template.envrc
ADDED
@@ -0,0 +1,15 @@
# -------------------- Defining default environment ---------------------------

# --- Docker BuildKit
export DOCKER_BUILDKIT_VALUE=1

# --- Project variables
export INPUT_APP_PORT=8501
export OUTPUT_APP_PORT=8501
export APP_SERVER_PORT=7860

export HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
export HUGGING_FACE_USERNAME=${HUGGING_FACE_USERNAME}

export PATH="${PWD}:${PATH}"
export PYTHONPATH="${PWD}:${PYTHONPATH}"
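Setup note (an assumption based on the `.envrc`/`template.envrc` pair, which follows the direnv convention): copy `template.envrc` to `.envrc`, fill in `HUGGING_FACE_HUB_TOKEN` and `HUGGING_FACE_USERNAME`, and run `direnv allow` so the variables are exported whenever the project directory is entered.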