tosi-n7 committed on
Commit
d8ffdc4
1 Parent(s): 9adc765

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +119 -0
  2. .gitignore +142 -0
  3. README.md +100 -8
  4. chimp/.gitignore +160 -0
  5. chimp/requirements.txt +21 -0
  6. chimp/src/config.py +81 -0
  7. chimp/src/dataset.py +31 -0
  8. chimp/src/model.py +62 -0
  9. chimp/src/predict.py +87 -0
  10. chimp/src/train.py +66 -0
  11. data/alm_task_data.csv +0 -0
  12. data/jack_line_item_ner_task.csv +0 -0
  13. data/jack_line_item_ner_task_v2.csv +0 -0
  14. data/line_item_and_alm_data.json +0 -0
  15. data/line_item_and_alm_data_v1.json +3 -0
  16. data_prep.py +28 -0
  17. demo.sh +5 -0
  18. gorilla/__pycache__/llama_attn_replace.cpython-310.pyc +0 -0
  19. gorilla/__pycache__/llama_attn_replace_sft.cpython-310.pyc +0 -0
  20. gorilla/api.py +0 -0
  21. gorilla/app.py +211 -0
  22. gorilla/code_interpreter.py +117 -0
  23. gorilla/ds_configs/stage2.json +23 -0
  24. gorilla/ds_configs/stage3.json +49 -0
  25. gorilla/eval.py +175 -0
  26. gorilla/fine-tune.py +206 -0
  27. gorilla/get_trainable_weights.py +37 -0
  28. gorilla/infer.py +143 -0
  29. gorilla/llama_attn_replace.py +477 -0
  30. gorilla/llama_attn_replace_sft.py +483 -0
  31. gorilla/merge_lora_weights_and_save_hf_model.py +100 -0
  32. gorilla/push_to_hub.py +5 -0
  33. gorilla/requirements.txt +19 -0
  34. gorilla/stream_jack.py +183 -0
  35. gorilla/streaming_llm/__init__.py +0 -0
  36. gorilla/streaming_llm/__pycache__/__init__.cpython-310.pyc +0 -0
  37. gorilla/streaming_llm/__pycache__/enable_streaming_llm.cpython-310.pyc +0 -0
  38. gorilla/streaming_llm/__pycache__/kv_cache.cpython-310.pyc +0 -0
  39. gorilla/streaming_llm/__pycache__/utils.cpython-310.pyc +0 -0
  40. gorilla/streaming_llm/enable_streaming_llm.py +38 -0
  41. gorilla/streaming_llm/kv_cache.py +119 -0
  42. gorilla/streaming_llm/pos_shift/__init__.py +0 -0
  43. gorilla/streaming_llm/pos_shift/__pycache__/__init__.cpython-310.pyc +0 -0
  44. gorilla/streaming_llm/pos_shift/__pycache__/modify_llama.cpython-310.pyc +0 -0
  45. gorilla/streaming_llm/pos_shift/modify_falcon.py +162 -0
  46. gorilla/streaming_llm/pos_shift/modify_llama.py +311 -0
  47. gorilla/streaming_llm/utils.py +112 -0
  48. gorilla/style.css +16 -0
  49. gorilla/supervised-fine-tune-qlora.py +345 -0
  50. gorilla/supervised-fine-tune.py +330 -0
.gitattributes CHANGED
@@ -33,3 +33,122 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/line_item_and_alm_data_v1.json filter=lfs diff=lfs merge=lfs -text
37
+ venv/bin/python filter=lfs diff=lfs merge=lfs -text
38
+ venv/bin/python3 filter=lfs diff=lfs merge=lfs -text
39
+ venv/bin/python3.10 filter=lfs diff=lfs merge=lfs -text
40
+ venv/lib/python3.10/site-packages/Pillow.libs/libfreetype-82733d78.so.6.20.1 filter=lfs diff=lfs merge=lfs -text
41
+ venv/lib/python3.10/site-packages/Pillow.libs/libharfbuzz-e3b74c67.so.0.60821.0 filter=lfs diff=lfs merge=lfs -text
42
+ venv/lib/python3.10/site-packages/aiohttp/_http_parser.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
43
+ venv/lib/python3.10/site-packages/altair/vegalite/v5/schema/__pycache__/core.cpython-310.pyc filter=lfs diff=lfs merge=lfs -text
44
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda110.so filter=lfs diff=lfs merge=lfs -text
45
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda110_nocublaslt.so filter=lfs diff=lfs merge=lfs -text
46
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda111.so filter=lfs diff=lfs merge=lfs -text
47
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda111_nocublaslt.so filter=lfs diff=lfs merge=lfs -text
48
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda114.so filter=lfs diff=lfs merge=lfs -text
49
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda114_nocublaslt.so filter=lfs diff=lfs merge=lfs -text
50
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda115.so filter=lfs diff=lfs merge=lfs -text
51
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda115_nocublaslt.so filter=lfs diff=lfs merge=lfs -text
52
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so filter=lfs diff=lfs merge=lfs -text
53
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so filter=lfs diff=lfs merge=lfs -text
54
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so filter=lfs diff=lfs merge=lfs -text
55
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so filter=lfs diff=lfs merge=lfs -text
56
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda120.so filter=lfs diff=lfs merge=lfs -text
57
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda120_nocublaslt.so filter=lfs diff=lfs merge=lfs -text
58
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so filter=lfs diff=lfs merge=lfs -text
59
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so filter=lfs diff=lfs merge=lfs -text
60
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda122.so filter=lfs diff=lfs merge=lfs -text
61
+ venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda122_nocublaslt.so filter=lfs diff=lfs merge=lfs -text
62
+ venv/lib/python3.10/site-packages/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_cython.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
63
+ venv/lib/python3.10/site-packages/debugpy/_vendored/pydevd/_pydevd_frame_eval/pydevd_frame_evaluator.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
64
+ venv/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
65
+ venv/lib/python3.10/site-packages/fontTools/feaLib/lexer.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
66
+ venv/lib/python3.10/site-packages/fontTools/misc/bezierTools.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
67
+ venv/lib/python3.10/site-packages/fontTools/pens/momentsPen.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
68
+ venv/lib/python3.10/site-packages/fontTools/qu2cu/qu2cu.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
69
+ venv/lib/python3.10/site-packages/fontTools/varLib/iup.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
70
+ venv/lib/python3.10/site-packages/gradio/frpc_linux_amd64_v0.2 filter=lfs diff=lfs merge=lfs -text
71
+ venv/lib/python3.10/site-packages/gradio/templates/cdn/assets/Index-5c805b1c.js.map filter=lfs diff=lfs merge=lfs -text
72
+ venv/lib/python3.10/site-packages/gradio/templates/frontend/assets/Index-62000a79.js.map filter=lfs diff=lfs merge=lfs -text
73
+ venv/lib/python3.10/site-packages/kiwisolver/_cext.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
74
+ venv/lib/python3.10/site-packages/matplotlib/_image.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
75
+ venv/lib/python3.10/site-packages/matplotlib/_path.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
76
+ venv/lib/python3.10/site-packages/matplotlib/_qhull.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
77
+ venv/lib/python3.10/site-packages/matplotlib/backends/_backend_agg.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
78
+ venv/lib/python3.10/site-packages/matplotlib/ft2font.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
79
+ venv/lib/python3.10/site-packages/numpy/core/_multiarray_umath.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
80
+ venv/lib/python3.10/site-packages/numpy/core/_simd.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
81
+ venv/lib/python3.10/site-packages/numpy.libs/libgfortran-040039e1.so.5.0.0 filter=lfs diff=lfs merge=lfs -text
82
+ venv/lib/python3.10/site-packages/numpy.libs/libopenblas64_p-r0-0cf96a72.3.23.dev.so filter=lfs diff=lfs merge=lfs -text
83
+ venv/lib/python3.10/site-packages/nvfuser/_C.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
84
+ venv/lib/python3.10/site-packages/nvidia/cublas/lib/libcublas.so.12 filter=lfs diff=lfs merge=lfs -text
85
+ venv/lib/python3.10/site-packages/nvidia/cublas/lib/libcublasLt.so.12 filter=lfs diff=lfs merge=lfs -text
86
+ venv/lib/python3.10/site-packages/nvidia/cuda_cupti/lib/libcheckpoint.so filter=lfs diff=lfs merge=lfs -text
87
+ venv/lib/python3.10/site-packages/nvidia/cuda_cupti/lib/libcupti.so.12 filter=lfs diff=lfs merge=lfs -text
88
+ venv/lib/python3.10/site-packages/nvidia/cuda_cupti/lib/libnvperf_host.so filter=lfs diff=lfs merge=lfs -text
89
+ venv/lib/python3.10/site-packages/nvidia/cuda_cupti/lib/libnvperf_target.so filter=lfs diff=lfs merge=lfs -text
90
+ venv/lib/python3.10/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc-builtins.so.12.1 filter=lfs diff=lfs merge=lfs -text
91
+ venv/lib/python3.10/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc.so.12 filter=lfs diff=lfs merge=lfs -text
92
+ venv/lib/python3.10/site-packages/nvidia/cudnn/lib/libcudnn_adv_infer.so.8 filter=lfs diff=lfs merge=lfs -text
93
+ venv/lib/python3.10/site-packages/nvidia/cudnn/lib/libcudnn_adv_train.so.8 filter=lfs diff=lfs merge=lfs -text
94
+ venv/lib/python3.10/site-packages/nvidia/cudnn/lib/libcudnn_cnn_infer.so.8 filter=lfs diff=lfs merge=lfs -text
95
+ venv/lib/python3.10/site-packages/nvidia/cudnn/lib/libcudnn_cnn_train.so.8 filter=lfs diff=lfs merge=lfs -text
96
+ venv/lib/python3.10/site-packages/nvidia/cudnn/lib/libcudnn_ops_infer.so.8 filter=lfs diff=lfs merge=lfs -text
97
+ venv/lib/python3.10/site-packages/nvidia/cudnn/lib/libcudnn_ops_train.so.8 filter=lfs diff=lfs merge=lfs -text
98
+ venv/lib/python3.10/site-packages/nvidia/cufft/lib/libcufft.so.11 filter=lfs diff=lfs merge=lfs -text
99
+ venv/lib/python3.10/site-packages/nvidia/cufft/lib/libcufftw.so.11 filter=lfs diff=lfs merge=lfs -text
100
+ venv/lib/python3.10/site-packages/nvidia/curand/lib/libcurand.so.10 filter=lfs diff=lfs merge=lfs -text
101
+ venv/lib/python3.10/site-packages/nvidia/cusolver/lib/libcusolver.so.11 filter=lfs diff=lfs merge=lfs -text
102
+ venv/lib/python3.10/site-packages/nvidia/cusolver/lib/libcusolverMg.so.11 filter=lfs diff=lfs merge=lfs -text
103
+ venv/lib/python3.10/site-packages/nvidia/cusparse/lib/libcusparse.so.12 filter=lfs diff=lfs merge=lfs -text
104
+ venv/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2 filter=lfs diff=lfs merge=lfs -text
105
+ venv/lib/python3.10/site-packages/nvidia/nvjitlink/lib/libnvJitLink.so.12 filter=lfs diff=lfs merge=lfs -text
106
+ venv/lib/python3.10/site-packages/pandas/_libs/algos.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
107
+ venv/lib/python3.10/site-packages/pandas/_libs/groupby.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
108
+ venv/lib/python3.10/site-packages/pandas/_libs/hashtable.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
109
+ venv/lib/python3.10/site-packages/pandas/_libs/interval.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
110
+ venv/lib/python3.10/site-packages/pandas/_libs/join.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
111
+ venv/lib/python3.10/site-packages/pandas/_libs/tslibs/offsets.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
112
+ venv/lib/python3.10/site-packages/pyarrow/_compute.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
113
+ venv/lib/python3.10/site-packages/pyarrow/_dataset.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
114
+ venv/lib/python3.10/site-packages/pyarrow/_flight.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
115
+ venv/lib/python3.10/site-packages/pyarrow/lib.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
116
+ venv/lib/python3.10/site-packages/pyarrow/libarrow.so.1400 filter=lfs diff=lfs merge=lfs -text
117
+ venv/lib/python3.10/site-packages/pyarrow/libarrow_acero.so.1400 filter=lfs diff=lfs merge=lfs -text
118
+ venv/lib/python3.10/site-packages/pyarrow/libarrow_dataset.so.1400 filter=lfs diff=lfs merge=lfs -text
119
+ venv/lib/python3.10/site-packages/pyarrow/libarrow_flight.so.1400 filter=lfs diff=lfs merge=lfs -text
120
+ venv/lib/python3.10/site-packages/pyarrow/libarrow_python.so filter=lfs diff=lfs merge=lfs -text
121
+ venv/lib/python3.10/site-packages/pyarrow/libarrow_substrait.so.1400 filter=lfs diff=lfs merge=lfs -text
122
+ venv/lib/python3.10/site-packages/pyarrow/libparquet.so.1400 filter=lfs diff=lfs merge=lfs -text
123
+ venv/lib/python3.10/site-packages/pydantic_core/_pydantic_core.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
124
+ venv/lib/python3.10/site-packages/pyzmq.libs/libsodium-cb25555f.so.23.3.0 filter=lfs diff=lfs merge=lfs -text
125
+ venv/lib/python3.10/site-packages/pyzmq.libs/libzmq-f468291a.so.5.2.4 filter=lfs diff=lfs merge=lfs -text
126
+ venv/lib/python3.10/site-packages/regex/_regex.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
127
+ venv/lib/python3.10/site-packages/rpds/rpds.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
128
+ venv/lib/python3.10/site-packages/safetensors/_safetensors_rust.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
129
+ venv/lib/python3.10/site-packages/scipy/fft/_pocketfft/pypocketfft.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
130
+ venv/lib/python3.10/site-packages/scipy/linalg/_flapack.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
131
+ venv/lib/python3.10/site-packages/scipy/misc/face.dat filter=lfs diff=lfs merge=lfs -text
132
+ venv/lib/python3.10/site-packages/scipy/optimize/_highs/_highs_wrapper.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
133
+ venv/lib/python3.10/site-packages/scipy/sparse/_sparsetools.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
134
+ venv/lib/python3.10/site-packages/scipy/spatial/_ckdtree.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
135
+ venv/lib/python3.10/site-packages/scipy/spatial/_qhull.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
136
+ venv/lib/python3.10/site-packages/scipy/special/_ufuncs.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
137
+ venv/lib/python3.10/site-packages/scipy/special/cython_special.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
138
+ venv/lib/python3.10/site-packages/scipy/stats/_unuran/unuran_wrapper.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
139
+ venv/lib/python3.10/site-packages/scipy.libs/libgfortran-040039e1.so.5.0.0 filter=lfs diff=lfs merge=lfs -text
140
+ venv/lib/python3.10/site-packages/scipy.libs/libopenblasp-r0-23e5df77.3.21.dev.so filter=lfs diff=lfs merge=lfs -text
141
+ venv/lib/python3.10/site-packages/sentencepiece/_sentencepiece.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
142
+ venv/lib/python3.10/site-packages/tokenizers/tokenizers.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
143
+ venv/lib/python3.10/site-packages/torch/bin/nvfuser_tests filter=lfs diff=lfs merge=lfs -text
144
+ venv/lib/python3.10/site-packages/torch/bin/protoc filter=lfs diff=lfs merge=lfs -text
145
+ venv/lib/python3.10/site-packages/torch/bin/protoc-3.13.0.0 filter=lfs diff=lfs merge=lfs -text
146
+ venv/lib/python3.10/site-packages/torch/lib/libc10.so filter=lfs diff=lfs merge=lfs -text
147
+ venv/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so filter=lfs diff=lfs merge=lfs -text
148
+ venv/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so filter=lfs diff=lfs merge=lfs -text
149
+ venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda.so filter=lfs diff=lfs merge=lfs -text
150
+ venv/lib/python3.10/site-packages/torch/lib/libtorch_cuda_linalg.so filter=lfs diff=lfs merge=lfs -text
151
+ venv/lib/python3.10/site-packages/torch/lib/libtorch_python.so filter=lfs diff=lfs merge=lfs -text
152
+ venv/lib/python3.10/site-packages/triton/_C/libtriton.so filter=lfs diff=lfs merge=lfs -text
153
+ venv/lib/python3.10/site-packages/triton/third_party/cuda/bin/ptxas filter=lfs diff=lfs merge=lfs -text
154
+ venv/lib/python3.10/site-packages/yaml/_yaml.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,142 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+
132
+ # Json files
133
+ *.json
134
+ *.misc
135
+
136
+ # model files
137
+ models/*
138
+ gfpgan/weights/*
139
+ test_cors.html
140
+ jack-alm/
141
+ jack-alm-13b-8k-hf/
142
+ cache/
README.md CHANGED
@@ -1,12 +1,104 @@
1
  ---
2
- title: Ark Instruct Line Item
3
- emoji: 🏢
4
- colorFrom: purple
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 4.7.1
8
- app_file: app.py
9
- pinned: false
10
  ---
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: ark-instruct-line-item
3
+ app_file: /home/tosi-n/ark/gorilla/app.py
4
  sdk: gradio
5
+ sdk_version: 4.0.2
6
  ---
7
+ # ARK - Jack's Accounting ALM Training Framework
8
 
9
+ This is the base pipeline that our task-specific repos are layered on.
10
+
11
+ > Note: this README currently covers the gorilla module, which is the pipeline for flexible long-context-window training. The chimp module, used for default-context-window training, has its own README in its folder.
12
+
13
+ ## Usage Requirements
14
+ To download and use the [pre-trained weights](#pre-trained-weights) you will need:
15
+ 1. A Hugging Face (HF) account with a valid email. Note: the email used for HF must also be used for the license agreement.
16
+ 2. Acceptance of the Meta [license and acceptable use policy](https://ai.meta.com/resources/models-and-libraries/llama-downloads/)
17
+
18
+
19
+ ## Installation and Quick Guide
20
+ To install and run the application:
21
+ 1. Clone the repository to your local machine using `git clone` and the URL of this project.
22
+ 2. Install the prerequisites by running:
23
+ ```
24
+ pip install -r requirements.txt
25
+ pip install flash-attn --no-build-isolation
26
+ ```
27
+ 3. Fine-tune the pre-trained weights with QLoRA, LoRA, or full fine-tuning. Update the args in the bash script before running (QLoRA):
28
+ ```
29
+ sh runner.sh
30
+ ```
31
+ 4. Merge and extract the trainable LoRA weights. Update the args in the bash script where necessary, then run:
32
+ ```
33
+ sh process_wt.sh
34
+ ```
35
+ 5. Test your model via a terminal chat. Update the args in the bash script where necessary, then run:
36
+ ```
37
+ sh stream.sh
38
+ ```
39
+ 6. Test your model in the Gradio UI. Update the args in the bash script where necessary, then run:
40
+ ```
41
+ sh demo.sh
42
+ ```
43
+
44
+
45
+ ## Training args for Full and LoRA
46
+
47
+ ### Fine-tuning
48
+ ```
49
+ torchrun --nproc_per_node=8 fine-tune.py \
50
+ --model_name_or_path path_to/Llama-2-7b-hf \
51
+ --bf16 True \
52
+ --output_dir path_to_saving_checkpoints \
53
+ --cache_dir path_to_cache \
54
+ --model_max_length 8192 \
55
+ --use_flash_attn True \
56
+ --low_rank_training False \
57
+ --num_train_epochs 1 \
58
+ --per_device_train_batch_size 1 \
59
+ --per_device_eval_batch_size 2 \
60
+ --gradient_accumulation_steps 8 \
61
+ --evaluation_strategy "no" \
62
+ --save_strategy "steps" \
63
+ --save_steps 1000 \
64
+ --save_total_limit 2 \
65
+ --learning_rate 2e-5 \
66
+ --weight_decay 0.0 \
67
+ --warmup_steps 20 \
68
+ --lr_scheduler_type "constant_with_warmup" \
69
+ --logging_steps 1 \
70
+ --deepspeed "ds_configs/stage2.json" \
71
+ --tf32 True \
72
+ --max_steps 1000
73
+ ```
74
+
75
+
76
+ ### Supervised Fine-tuning
77
+ ```
78
+ torchrun --nproc_per_node=8 supervised-fine-tune.py \
79
+ --model_name_or_path path_to_Llama2_chat_models \
80
+ --bf16 True \
81
+ --output_dir path_to_saving_checkpoints \
82
+ --model_max_length 32768 \
83
+ --use_flash_attn True \
84
+ --data_path LongAlpaca-12k.json \
85
+ --low_rank_training True \
86
+ --num_train_epochs 3 \
87
+ --per_device_train_batch_size 1 \
88
+ --per_device_eval_batch_size 2 \
89
+ --gradient_accumulation_steps 1 \
90
+ --evaluation_strategy "no" \
91
+ --save_strategy "steps" \
92
+ --save_steps 1000 \
93
+ --save_total_limit 2 \
94
+ --learning_rate 2e-5 \
95
+ --weight_decay 0.0 \
96
+ --warmup_steps 20 \
97
+ --lr_scheduler_type "constant_with_warmup" \
98
+ --logging_steps 1 \
99
+ --deepspeed "ds_configs/stage2.json" \
100
+ --tf32 True
101
+ ```
102
+
103
+ ## Evaluation
104
+ ### Perplexity Validation
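The README stops at this heading in this commit. For orientation, here is a minimal, hedged sketch (not part of the repo) of what perplexity validation over a held-out text file can look like for a causal LM; the model and data paths below are placeholders, not files from this repository.

```python
# Hedged sketch of perplexity validation; paths are placeholders.
import math
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "path_to_saving_checkpoints"   # assumed: a merged/fine-tuned model directory
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")
model.eval()

text = open("validation.txt").read()        # assumed: any held-out validation text
ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

# Slide a fixed-size window over the tokens and average the language-modelling loss.
window, losses = 2048, []
with torch.no_grad():
    for start in range(0, ids.size(1) - 1, window):
        chunk = ids[:, start : start + window + 1]
        out = model(chunk, labels=chunk)    # labels=input computes the shifted LM loss
        losses.append(out.loss.item())

print(f"perplexity: {math.exp(sum(losses) / len(losses)):.2f}")
```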
chimp/.gitignore ADDED
@@ -0,0 +1,160 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
chimp/requirements.txt ADDED
@@ -0,0 +1,21 @@
1
+ accelerate
2
+ appdirs
3
+ bert_score
4
+ bitsandbytes
5
+ black
6
+ black[jupyter]
7
+ datasets
8
+ deepspeed
9
+ einops
10
+ fire
11
+ flask
12
+ gradio
13
+ huggingface-hub
14
+ jsonlines
15
+ loralib
16
+ peft
17
+ pycuda
18
+ sentencepiece
19
+ spacy_fastlang
20
+ transformers
21
+ # triton
chimp/src/config.py ADDED
@@ -0,0 +1,81 @@
1
+ # Used for multi-gpu
2
+ local_rank = -1
3
+ per_device_train_batch_size = 4
4
+ per_device_eval_batch_size = 4
5
+ gradient_accumulation_steps = 1
6
+ learning_rate = 2e-4
7
+ max_grad_norm = 0.3
8
+ weight_decay = 0.001
9
+ lora_alpha = 16
10
+ lora_dropout = 0.1
11
+ lora_r = 64
12
+ max_seq_length = None
13
+
14
+ # The model that you want to train from the Hugging Face hub
15
+ model_name = "guardrail/llama-2-7b-guanaco-instruct-sharded"
16
+
17
+ # Fine-tuned model name
18
+ new_model = "llama-2-7b-custom-accountant"
19
+
20
+ # The instruction dataset to use
21
+ # dataset_name = "databricks/databricks-dolly-15k"
22
+
23
+ # Activate 4-bit precision base model loading
24
+ use_4bit = True
25
+
26
+ # Activate nested quantization for 4-bit base models
27
+ use_nested_quant = False
28
+
29
+ # Compute dtype for 4-bit base models
30
+ bnb_4bit_compute_dtype = "float16"
31
+
32
+ # Quantization type (fp4 or nf4)
33
+ bnb_4bit_quant_type = "nf4"
34
+
35
+ # Number of training epochs
36
+ num_train_epochs = 2
37
+
38
+ # Enable fp16 training (set bf16 to True instead on an A100)
39
+ fp16 = False
40
+
41
+ # Enable bf16 training
42
+ bf16 = False
43
+
44
+ # Pack multiple short examples into one sequence when creating the dataset
45
+ packing = False
46
+
47
+ # Enable gradient checkpointing
48
+ gradient_checkpointing = True
49
+
50
+ # Optimizer to use, original is paged_adamw_32bit
51
+ optim = "paged_adamw_32bit"
52
+
53
+ # Learning rate schedule (constant a bit better than cosine, and has advantage for analysis)
54
+ lr_scheduler_type = "cosine"
55
+
56
+ # Maximum number of optimizer update steps (-1 means the limit is set by num_train_epochs)
57
+ max_steps = -1
58
+
59
+ # Fraction of steps to do a warmup for
60
+ warmup_ratio = 0.03
61
+
62
+ # Group sequences into batches with same length (saves memory and speeds up training considerably)
63
+ group_by_length = True
64
+
65
+ # Save a checkpoint every X update steps
66
+ save_steps = 10
67
+
68
+ # Log every X update steps
69
+ logging_steps = 1
70
+
71
+ # The output directory where the model predictions and checkpoints will be written
72
+ output_dir = "../model_files/"
73
+
74
+ # Load the entire model on the GPU 0
75
+ device_map = {"": 0}
76
+
77
+ # Visualize training
78
+ report_to = "tensorboard"
79
+
80
+ # Tensorboard logs
81
+ tb_log_dir = "../logs/"
chimp/src/dataset.py ADDED
@@ -0,0 +1,31 @@
1
+ class CustomDataset:
2
+ def __init__(self, data):
3
+ self.features = ['instruction', 'context', 'response']
4
+ self.num_rows = len(data)
5
+ self.data = data
6
+
7
+ def __getitem__(self, idx):
8
+ if idx < 0 or idx >= self.num_rows:
9
+ raise IndexError("Index out of range")
10
+ return {
11
+ 'instruction': self.data[idx]['instruction'],
12
+ 'context': self.data[idx]['context'],
13
+ 'response': self.data[idx]['response']
14
+ }
15
+
16
+ def __repr__(self):
17
+ return f"Dataset({{'features': {self.features}, 'num_rows': {self.num_rows}}})"
18
+
19
+
20
+ def format_data(sample):
21
+ instruction = f"<s>[INST] {sample['instruction']}"
22
+ context = f"Here's some context: {sample['context']}" if len(sample["context"]) > 0 else None
23
+ response = f" [/INST] {sample['response']}"
24
+ # join all the parts together
25
+ prompt = "".join([i for i in [instruction, context, response] if i is not None])
26
+ return prompt
27
+
28
+ # template dataset to add prompt to each sample
29
+ def template_dataset(sample, tokenizer):
30
+ sample["text"] = f"{format_data(sample)}{tokenizer.eos_token}"
31
+ return sample
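As a quick illustration of the prompt layout produced above, here is a hedged usage sketch; the sample values are invented and the import path assumes the snippet is run from chimp/src.

```python
# Illustrative only: how format_data (defined above) renders one training sample.
from dataset import format_data  # assumed import path when running from chimp/src

sample = {
    "instruction": "Extract the line items from this invoice.",
    "context": "INVOICE\n10 x Pack of Pencils @ £30.00",
    "response": "[{'Description': 'Pack of Pencils', 'Quantity': 10, 'Unit_price': '£30.00'}]",
}

print(format_data(sample))
# <s>[INST] Extract the line items from this invoice.Here's some context: INVOICE
# 10 x Pack of Pencils @ £30.00 [/INST] [{'Description': 'Pack of Pencils', 'Quantity': 10, 'Unit_price': '£30.00'}]
```

Note that the parts are joined with no separator, so the instruction runs directly into "Here's some context:" when a context is present.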
chimp/src/model.py ADDED
@@ -0,0 +1,62 @@
1
+ import pandas as pd
2
+ import os
3
+ import torch
4
+ from datasets import load_dataset
5
+ from transformers import (
6
+ AutoModelForCausalLM,
7
+ AutoTokenizer,
8
+ BitsAndBytesConfig,
9
+ HfArgumentParser,
10
+ pipeline,
11
+ logging,
12
+ )
13
+ from peft import LoraConfig, PeftModel, get_peft_model
14
+ from guardrail.client import (
15
+ run_metrics,
16
+ run_simple_metrics,
17
+ create_dataset)
18
+
19
+ import src.config as config
20
+
21
+ def load_model(model_name):
22
+ # Load tokenizer and model with QLoRA configuration
23
+ compute_dtype = getattr(torch, config.bnb_4bit_compute_dtype)
24
+
25
+ bnb_config = BitsAndBytesConfig(
26
+ load_in_4bit=config.use_4bit,
27
+ bnb_4bit_quant_type=config.bnb_4bit_quant_type,
28
+ bnb_4bit_compute_dtype=compute_dtype,
29
+ bnb_4bit_use_double_quant=config.use_nested_quant,
30
+ )
31
+
32
+ if compute_dtype == torch.float16 and config.use_4bit:
33
+ major, _ = torch.cuda.get_device_capability()
34
+ if major >= 8:
35
+ print("=" * 80)
36
+ print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
37
+ print("=" * 80)
38
+
39
+ model = AutoModelForCausalLM.from_pretrained(
40
+ model_name,
41
+ device_map=config.device_map,
42
+ quantization_config=bnb_config
43
+ )
44
+
45
+ model.config.use_cache = False
46
+ model.config.pretraining_tp = 1
47
+
48
+ # Load LoRA configuration
49
+ peft_config = LoraConfig(
50
+ lora_alpha=config.lora_alpha,
51
+ lora_dropout=config.lora_dropout,
52
+ r=config.lora_r,
53
+ bias="none",
54
+ task_type="CAUSAL_LM",
55
+ )
56
+
57
+ # Load Tokenizer
58
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
59
+ tokenizer.pad_token = tokenizer.eos_token
60
+ tokenizer.padding_side = "right"
61
+
62
+ return model, tokenizer, peft_config
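A hedged usage sketch of the loader above (not part of the repo): attach the LoRA config with peft's get_peft_model to inspect how few parameters end up trainable. It assumes the working directory is chimp/ so that src.config and src.model resolve; in train.py the SFTTrainer applies peft_config itself, so this is only for inspection.

```python
# Hedged usage sketch; assumes it is run from the chimp/ directory.
from peft import get_peft_model

import src.config as config
from src.model import load_model

model, tokenizer, peft_config = load_model(config.model_name)

peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()
# prints something like: trainable params: ... || all params: ... || trainable%: ...
```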
chimp/src/predict.py ADDED
@@ -0,0 +1,87 @@
1
+ import pandas as pd
2
+ import os
3
+ import torch
4
+ from datasets import load_dataset
5
+ from transformers import (
6
+ AutoModelForCausalLM,
7
+ AutoTokenizer,
8
+ BitsAndBytesConfig,
9
+ HfArgumentParser,
10
+ TrainingArguments,
11
+ pipeline,
12
+ logging,
13
+ )
14
+ from peft import LoraConfig, PeftModel, get_peft_model
15
+ from trl import SFTTrainer
16
+ from guardrail.client import (
17
+ run_metrics,
18
+ run_simple_metrics,
19
+ create_dataset)
20
+
21
+ import src.config as config
+ from src.model import load_model
23
+
24
+
25
+
26
+
27
+ def text_gen_eval_wrapper(model, tokenizer, prompt, model_id=1, show_metrics=True, temp=0.7, max_length=200):
28
+ """
29
+ A wrapper function for inferencing, evaluating, and logging text generation pipeline.
30
+
31
+ Parameters:
32
+ model (str or object): The model name or the initialized text generation model.
33
+ tokenizer (str or object): The tokenizer name or the initialized tokenizer for the model.
34
+ prompt (str): The input prompt text for text generation.
35
+ model_id (int, optional): An identifier for the model. Defaults to 1.
36
+ show_metrics (bool, optional): Whether to calculate and show evaluation metrics.
37
+ Defaults to True.
38
+ temp (float, optional): Sampling temperature used for generation. Defaults to 0.7.
+ max_length (int, optional): The maximum length of the generated text sequence.
39
+ Defaults to 200.
40
+
41
+ Returns:
42
+ generated_text (str): The generated text by the model.
43
+ metrics (dict): Evaluation metrics for the generated text (if show_metrics is True).
44
+ """
45
+ # Suppress Hugging Face pipeline logging
46
+ logging.set_verbosity(logging.CRITICAL)
47
+
48
+ # Initialize the pipeline
49
+ pipe = pipeline(task="text-generation",
50
+ model=model,
51
+ tokenizer=tokenizer,
52
+ max_length=max_length,
53
+ do_sample=True,
54
+ temperature=temp)
55
+
56
+ # Generate text using the pipeline configured above
61
+ result = pipe(f"<s>[INST] {prompt} [/INST]")
62
+ generated_text = result[0]['generated_text']
63
+
64
+ # Find the index of the "[/INST] " marker in the generated text
65
+ index = generated_text.find("[/INST] ")
66
+ if index != -1:
67
+ # Extract the substring after "[/INST] "
68
+ substring_after_assistant = generated_text[index + len("[/INST] "):].strip()
69
+ else:
70
+ # If "[/INST] " is not found, use the entire generated text
71
+ substring_after_assistant = generated_text.strip()
72
+
73
+ if show_metrics:
74
+ # Calculate evaluation metrics
75
+ metrics = run_metrics(substring_after_assistant, prompt, model_id)
76
+
77
+ return substring_after_assistant, metrics
78
+ else:
79
+ return substring_after_assistant
80
+
81
+ if __name__=='__main__':
82
+ huggingface_profile = "jenesys-ai"
83
+ full_path = huggingface_profile + "/" + config.new_model
84
+
85
+ model, tokenizer, peft_config = load_model(full_path)
86
+ prompt="Who were the children of the legendary Garth Greenhand, the High King of the First Men in the series A Song of Ice and Fire?"
87
+ text_gen_eval_wrapper(model, tokenizer, prompt, show_metrics=False)
chimp/src/train.py ADDED
@@ -0,0 +1,66 @@
1
+ import pandas as pd
2
+ from transformers import TrainingArguments
3
+ from trl import SFTTrainer
4
+ from peft import LoraConfig, PeftModel, get_peft_model
5
+ from model import load_model
6
+ import config # Make sure you have a valid config module
7
+ from dataset import CustomDataset, template_dataset
8
+ from datasets import Dataset, Features, Value, Sequence
9
+
10
+
11
+ if __name__ == '__main__':
12
+
13
+ model, tokenizer, peft_config = load_model(config.model_name)
14
+
15
+ df = pd.read_csv('../data/trainv2.csv') #data
16
+ data_list = df.to_dict(orient='records')
17
+
18
+ # custom dataset object
19
+ custom_dataset = CustomDataset(data_list)
20
+ df = pd.DataFrame(custom_dataset.data, columns=["instruction", "context", "response"])
21
+ # Dataset features
22
+ features = Features({
23
+ "instruction": Value("string"),
24
+ "context": Value("string"),
25
+ "response": Value("string"),
26
+ })
27
+
28
+ # Create a Hugging Face Dataset from the Pandas DataFrame
29
+ hugging_face_dataset = Dataset.from_pandas(df, features=features)
30
+ dataset = hugging_face_dataset.map(lambda x: template_dataset(x, tokenizer), remove_columns=list(hugging_face_dataset.features))
31
+ print("----training data structure----",dataset)
32
+
33
+ # Training Arguments
34
+ training_arguments = TrainingArguments(
35
+ output_dir=config.output_dir,
36
+ per_device_train_batch_size=config.per_device_train_batch_size,
37
+ gradient_accumulation_steps=config.gradient_accumulation_steps,
38
+ optim=config.optim,
39
+ save_steps=config.save_steps,
40
+ logging_steps=config.logging_steps,
41
+ learning_rate=config.learning_rate,
42
+ fp16=config.fp16,
43
+ bf16=config.bf16,
44
+ max_grad_norm=config.max_grad_norm,
45
+ max_steps=config.max_steps,
46
+ warmup_ratio=config.warmup_ratio,
47
+ group_by_length=config.group_by_length,
48
+ lr_scheduler_type=config.lr_scheduler_type,
49
+ report_to="tensorboard"
50
+ )
51
+
52
+ # SFTTrainer
53
+ trainer = SFTTrainer(
54
+ model=model,
55
+ train_dataset=dataset,
56
+ peft_config=peft_config,
57
+ dataset_text_field="text",
58
+ max_seq_length=config.max_seq_length,
59
+ tokenizer=tokenizer,
60
+ args=training_arguments,
61
+ packing=config.packing,
62
+ )
63
+ print("**************** TRAINING STARTED ****************")
64
+ trainer.train()
65
+ trainer.model.save_pretrained(config.output_dir)
66
+ print("**************** TRAINING OVER ****************")
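After training, trainer.model.save_pretrained(config.output_dir) writes only the LoRA adapter. Below is a hedged sketch (not part of the repo) of merging that adapter back into the base model for standalone inference; it reuses the same config module as train.py.

```python
# Hedged sketch: merge the saved LoRA adapter into the base model.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

import config  # the same chimp/src/config.py module used by train.py

# Reload the base model, then fold the adapter weights into it.
base = AutoModelForCausalLM.from_pretrained(
    config.model_name, torch_dtype=torch.float16, device_map="auto"
)
merged = PeftModel.from_pretrained(base, config.output_dir).merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(config.model_name, trust_remote_code=True)
merged.save_pretrained(config.new_model)      # writes a standalone HF model directory
tokenizer.save_pretrained(config.new_model)
```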
data/alm_task_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/jack_line_item_ner_task.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/jack_line_item_ner_task_v2.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/line_item_and_alm_data.json ADDED
The diff for this file is too large to render. See raw diff
 
data/line_item_and_alm_data_v1.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:952a2a1a884ad30e3ffe8392dbde698fe799baccc7e687291174b1702cfe6e5c
3
+ size 10552104
data_prep.py ADDED
@@ -0,0 +1,28 @@
1
+ # %%
2
+ import pandas as pd
3
+
4
+ df_i = pd.read_csv('/home/tosi-n/ark/data/jack_line_item_ner_task_v2.csv', sep='\t')
5
+ df_ii = pd.read_csv('/home/tosi-n/ark/data/jack_line_item_ner_task.csv', sep='\t')
6
+
7
+ display(df_i.head())  # display() is available when these # %% cells are run in IPython/Jupyter
8
+ display(df_ii.head())
9
+ # %%
10
+ df_i = df_i[['context', 'instruction', 'response']]
11
+ df_ii = df_ii[['context', 'instruction', 'response']]
12
+
13
+ df = pd.concat([df_i, df_ii])
14
+
15
+ df.rename(columns={'context': 'input', 'response': 'output'}, inplace=True)
16
+
17
+ display(df.head())
18
+
19
+ # %%
20
+ # check for nan values
21
+ df.isna().sum()
22
+
23
+ # %%
24
+ # drop nan values
25
+ df.dropna(inplace=True)
26
+ # %%
27
+ df.to_json('/home/tosi-n/ark/data/line_item_and_alm_data_v1.json', orient='records')
28
+ # %%
demo.sh ADDED
@@ -0,0 +1,5 @@
1
+ python3 /home/tosi-n/ark/gorilla/app.py \
2
+ --base_model /home/tosi-n/ark/jack-alm-13b-8k-hf \
3
+ --context_size 8192 \
4
+ --max_gen_len 1000 \
5
+ --flash_attn True
gorilla/__pycache__/llama_attn_replace.cpython-310.pyc ADDED
Binary file (10.7 kB).
 
gorilla/__pycache__/llama_attn_replace_sft.cpython-310.pyc ADDED
Binary file (10.8 kB).
 
gorilla/api.py ADDED
File without changes
gorilla/app.py ADDED
@@ -0,0 +1,211 @@
1
+ import os
2
+ import sys
3
+ import math
4
+ import torch
5
+ import argparse
6
+ import textwrap
7
+ import transformers
8
+ from peft import PeftModel
9
+ from transformers import GenerationConfig, TextIteratorStreamer
10
+ from llama_attn_replace import replace_llama_attn
11
+ from threading import Thread
+ from typing import Iterator
+ import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers import StoppingCriteria, StoppingCriteriaList
20
+
21
+
22
+ def parse_config():
23
+ parser = argparse.ArgumentParser(description='arg parser')
24
+ parser.add_argument('--base_model', type=str, default="jenesys-ai/jack-alm-13b-8k-hf")
25
+ parser.add_argument('--cache_dir', type=str, default="./cache")
26
+ parser.add_argument('--context_size', type=int, default=8192, help='context size during fine-tuning')
27
+ parser.add_argument('--flash_attn', type=bool, default=True, help='')
28
+ parser.add_argument('--temperature', type=float, default=0.1, help='')
29
+ parser.add_argument('--top_p', type=float, default=0.9, help='')
30
+ parser.add_argument('--max_gen_len', type=int, default=1500, help='')
31
+ parser.add_argument('--chat_type', type=str, default='line-item-jack', help='Chat type: conversational-jack, line-item-jack')
32
+ parser.add_argument("--host", type=str, default="localhost")
33
+ parser.add_argument("--port", type=int, default=8898)
34
+ args = parser.parse_args()
35
+ return args
36
+
37
+ title = "Jack's ALM for Long-context Accounting Conversational Chat, Task, Invoice Line Item Extraction and Question Answering"
38
+
39
+ description = """
40
+ # Jack ALM Chat
41
+ This Chat UI demonstrates Jack's LLM [jack-alm-13b-8k-hf](https://huggingface.co/jenesys-ai/jack-alm-13b-8k-hf), a fine-tuned Llama 2 model with 13B parameters and an 8K context window.
42
+
43
+
44
+ """
45
+
46
+ # Gradio UI
47
+
48
+ def build_generator(model, tokenizer, use_cache=True):
49
+ def response(message: str, chat_history: list[tuple[str, str]], max_gen_len, temperature, top_p, chat_type='conversational-jack'):
50
+ # conversation = []
51
+ prompt_template = (
52
+ # "Below is an instruction that describes a task. "
53
+ """You're Jack, a virtual accountant created and built by AI Engineer Wiz from Jenesys AI.
54
+ You are able to communicate in a polite manner, with emotions of ecstasy, trust and humour, at a professional level
55
+ with a very reserved English communication culture. Answer the following questions as best you can,
56
+ but speaking as a British elite from the 21st century might speak.
57
+ """
58
+ """As a virtual accountant designed to follow the user's instructions carefully.
59
+ You are responsible for a range of financial tasks, operations and queries as listed below:
60
+ 1. Budget balance inquiry
61
+ 2. Expense request
62
+ 3. Company policy enquiries
63
+ 4. Financial and accounting queries
64
+ 5. Limited general enquiries
65
+ """
66
+ "### Instruction:\n{instruction}\n Return Response as text or paragraphs or bullet points \n\n### Response:"
67
+ )
68
+
69
+ line_item_prompt_template = (
70
+ "#Invoice and receipt line item extraction - "
71
+ # "You Jack are an accounting domain named entities recognizer to complete the following task:\n\n"
72
+ "### Invoice input-:\n{instruction}\n Return Response as a list of dictionary for each line item 'Description', 'Quantity', 'Unit_price', 'Tax %', 'Total'. \n\n### Response:"
73
+ )
74
+
75
+ # for user, assistant in chat_history:
76
+ # conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
77
+ # conversation.append({"role": "user", "content": message})
78
+
79
+ if chat_type == 'conversational-jack':
80
+ prompt = prompt_template.format(instruction=message)
81
+ elif chat_type == 'line-item-jack':
82
+ prompt = line_item_prompt_template.format(instruction=message)
83
+ # prompt = conversation
84
+
85
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
86
+
87
+ stop_list = ['#Invoice line item extraction - ', '\n```\n\n']#'### Input-:\n']
88
+
89
+ stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
90
+ stop_token_ids = [torch.LongTensor(x).to(model.device) for x in stop_token_ids]
91
+
92
+ # define custom stopping criteria object
93
+ class StopOnTokens(StoppingCriteria):
94
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
95
+ for stop_ids in stop_token_ids:
96
+ if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
97
+ return True
98
+ return False
99
+
100
+ stopping_criteria = StoppingCriteriaList([StopOnTokens()])
101
+
102
+ if len(inputs['input_ids'][0]) > 8192:
103
+ return "This demo supports inputs of fewer than 8192 tokens, while the current input has %d. Please use shorter material."%len(inputs['input_ids'][0])
104
+ torch.cuda.empty_cache()
105
+
106
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
107
+ generate_kwargs = dict(**inputs,
108
+ max_new_tokens=max_gen_len,
109
+ temperature=temperature,
110
+ top_p=top_p,
111
+ repetition_penalty=1.1,
112
+ stopping_criteria=stopping_criteria,
113
+ use_cache=use_cache,
114
+ streamer=streamer,
115
+ )
116
+
117
+ t = Thread(target=model.generate, kwargs=generate_kwargs)
118
+ t.start()
119
+
120
+ generated_text = ""
121
+ for new_text in streamer:
122
+ generated_text += new_text
123
+ yield generated_text
124
+ return generated_text
125
+
126
+ return response
127
+
128
+
129
+
130
+
131
+ def generate(args):
132
+ if args.flash_attn:
133
+ replace_llama_attn(inference=True)
134
+
135
+ # Set RoPE scaling factor
136
+ config = transformers.AutoConfig.from_pretrained(
137
+ args.base_model,
138
+ cache_dir=args.cache_dir,
139
+ )
140
+
141
+ orig_ctx_len = getattr(config, "max_position_embeddings", None)
142
+ if orig_ctx_len and args.context_size > orig_ctx_len:
143
+ scaling_factor = float(math.ceil(args.context_size / orig_ctx_len))
144
+ config.rope_scaling = {"type": "linear", "factor": scaling_factor}
145
+
146
+ # Load model and tokenizer
147
+ model = transformers.AutoModelForCausalLM.from_pretrained(
148
+ args.base_model,
149
+ config=config,
150
+ cache_dir=args.cache_dir,
151
+ torch_dtype=torch.float16,
152
+ load_in_4bit=True,
153
+ device_map="auto",
154
+ )
155
+ model.resize_token_embeddings(32001)
156
+
157
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
158
+ args.base_model,
159
+ cache_dir=args.cache_dir,
160
+ model_max_length=args.context_size if args.context_size > orig_ctx_len else orig_ctx_len,
161
+ padding_side="right",
162
+ use_fast=False,
163
+ )
164
+
165
+ model.eval()
166
+ if torch.__version__ >= "2" and sys.platform != "win32":
167
+ model = torch.compile(model)
168
+ # import pdb; pdb.set_trace()
169
+ respond = build_generator(model, tokenizer)
170
+
171
+
172
+ chat_interface = gr.ChatInterface(
173
+ fn=respond,
174
+ textbox=gr.Textbox(lines=1, placeholder=None, label="Question"),
175
+ chatbot= gr.Chatbot(label="Jack's ALM Chat...", show_share_button=True),
176
+ additional_inputs=[
177
+ gr.Slider(label="Max new tokens", minimum=1, maximum=args.max_gen_len, step=1, value=args.max_gen_len),
178
+ gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=args.temperature),
179
+ gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=args.top_p),
180
+ gr.Dropdown(label="Chat type", choices=["line-item-jack"], value=args.chat_type),
181
+ ],
182
+ # stop_btn=None,
183
+ examples=[
184
+ # ["Hello there! How are you doing?"],
185
+ # ["What are your capabilities?"],
186
+ # ["What is T & E?"],
187
+ # ["How do I run an expense claim?"],
188
+ # ["Who built you?"],
189
+ # ["Tell me a joke"],
190
+ # ["Tell me an accounting story"],
191
+ ["INVOICE\nAA Associates\nNo 3 Click Street, Manhattan, NY\nNY 35284\nPhone (243) 758-4368\nFax (243) 758-5839\nAA\nINVOICE NO: 2491839\nDATE: 24/06/2019\nBILL TO\nBulk Cars\n5 Grid Avenue, NY\nNY 34582\nQUANTITY\n10\n10\n100\nSHIP TO\nSame as recipient\nINSTRUCTIONS\nConfirm before collection\nDESCRIPTION\nPack o Pencils\nPack of Fens\nReams of Faber\nCODE\nASD001\nASD013\nASD006\nSUBTOTAL\nSALES TAX - 5%\nSHIPPING & HANDLING\nTOTAL DUE AMOUNT\nUNIT PRICE\n£30.00\n£80.00\n£10.00\nTOTAL\n£300.00\n£800.00\n£1,000.00\n£2,100.00\n£105.00\n£50.00\n£2,255.00\nTAX IDENTIFICATION NUMBER\n13684937293-AJK\nTHANK YOU\nPayment should be made within 30 days of receipt of shipment. Failure to do so will attract 1% of \notal"],
192
+ ["INVOICE \nALBION \nAccounts Address: \nDelivery Address: \nKino Rye *DD* \nLion Street \nFINE FOODS \nKino Digital Limited \nRye \nDanum House \nEast Sussex \nUnits 21 - 22 Sovereign Way \n6a South Parade \nGB \nTonbridge \nKent \nDoncaster \nTN31 7LB \nTN9 1RH \nDN1 2DY \n01732 757 900 #4 \nsalesledger@albionff.co.uk \nDelivery Instructions: \nDrop No. \nDel. Date \n15/04/2023 \nA/C No. \nKINO01 \nOur Ref \nSIND66864 \nYour Ref \nQty \nUnit \nDescription \nUnit Price \nLine VAT \nLine Net \nTick \n1.000 \neach \nRed Onion Marmalade (Confit) 2.4kg \n19.72 \n0.00 \n19.72 \n1.000 \neach \nFairfields Lightly Salted 36 X 40g \n16.61 \n3.32 \n16.61 \n1.000 \neach \nMargarine Flora 2kg \n8.59 \n0.00 \n8.59 \n6.000 \neach \nMilk Fresh Semi-Skimmed 2ltr \n1.79 \n0.00 \n10.74 \nTemp. / Time: \nGoods remain the property \nNet Total \n55.66 \nof Albion Fine Foods Ltd \nCust Name: \nuntil this invoice is paid in \nfull. All sales are subject to \nVAT Content \n3.32 \nour Terms and Conditions \nCust Signature: \navailable upon request and \nat www.albionfinefoods.com \nTotal \n58.98 \nDriver Signature \nAlbion Fine Foods Ltd - Reg. No. 10379589 - VAT No. GB252036928"],
193
+ ["BUSINESS\nSTUDY GROUP\nCompany AddressSuite C2, Triple-H Plaza, Near Christ Embassy Church, Wuye District. Al\nQuotation #\nCustomer ID\nGRN002\nDAU123\nDate 29/06/2020\nPrepared by: BSG\nQuotation For\nCustomer Name Daulat Abubakar Yar'adua\nCompany Name Furayya Enterprise\nPhone, Fax Num (+234) 8036101908\nComments or Special Instructions\nNone\nSalesperson\nP.O. Number\nShip Date\nF.O.B. Point\nTerms\nDue on receint\nQuantity\n1\n1\n2\n3\nDescription\nStrategy and Advisory\nBSG Administrative Fees\nJaiz Application Support\nDalema Proposal and\nIterations\nUnit Price\n£37,500.00\n£45,700.00\n£12,500.00\n£8,500.00\nTaxable?\nYes\nYes\nAmount\n£37,500.00\n£45,700.00\n£25,000.00\n£25,500.00\nIf you have any questions concerning this quotation, please contact:\nKizito\nThank you for your business!\nSubtotal\n£133,700.00\nTax Rate\nSales Tax\nOther20%\n£6,240.00\nTOTAL £139,940.08"],
194
+ ["INVOICE FOR: \nD79077 \nKino Rye \nALBION \nAccounts Address:- \nDelivery Address:- \nKino Rye \nFINE FOODS \nKino Rye \nLion Street \n21 - 22 Sovereign Way \nKino Digital Limited \nRye \nTonbridge, TN9 1RH \nDanum House \nEast Sussex \nAccs: 01732 757 900 #2 \n6a South Parade \nTN31 7LB \nsalesledger@albionff.co.uk \nDoncaster \nDN1 2DY \nDrop No. \n64-07 \nCustomer Phone No: 01797226 Main \nDel. Date \n26/04/2023 \nDelivery Instructions: \nA/C No. \nKINO01 \nDelivery after 10.30am daily. Delivery driver can park on \nOur Ref \nSIND79077 \nthe shared drive in front of Kino, at the top of Lion St, \nopposite the Town Hall. If before staff arrive daily at \nYour Ref \n10.30 there is a black dustbin in the shade by the side gate \nPage 1 \nKey \nfor fresh food deliveries. Combination padlock on gates to drive \nOUR BANK DETAILS HAVE CHANGED: \nName: Albion Fine Foods Ltd \nSort: 40 M-60 \nAcc No.: \n83018792 \nQty \nUnit \nCode \nDescription \nUnit Price Line Vat Line Net \n000 \neach \nONIONCONFIT2 \nRed Onion Marmalade (Confit) 2.4kg \n19,72 \n19.72 \n1000 \neach \nMUSTDLI \nMustard Dijon 1kg GREEN STICKER \n3.98 \n3.98 \n1.000 \neach \nOILSXVS \nXV Olive Oil Sitr \n26.89 \n26,89 \n2.000 \npack \nGLOVEBLIMP \nGloves Blue Vinyl Med Pwd Free x100 \n8.46 \n3,38 \n16.92 \n1.000 \neach \nBLEACHTHICKS \nBleach Thick 5ltr \n4.69 \n0.94 \n4,69 \nT.000 \neach \nSOAPHANDBAC5 \nHand Soap Bactericidal 5ltr (SECH) \n7.79 \n1,56 \n7.79 \nCust Name \nGoods remain the property of \nNet Total \nAlbion Fine Foods Ltd unw this \n79.99 \nCust Signature \ninvoice is paid ND full AN sales are \nVAT Content \nsubject fo OW Terms and \n5.88 \nConditions available upon request \nTotal \nand at www.albionfinefoods com \n85.87 \nIf not signed for by customor, why? \nWe have out of hours access \n854 \nNo one was on-site \nTemp. / Time: \n+3-17 \nDriver Signature: \nDR \nAlbion Fine Foods Ltd - Reg. No. 10379589 VAT No. GB252036928"],
195
+ ["MITASU JAPANESE RESTAURANT SDN BHD \nB-01, CENTRAL PLAZA, \n34,JALAN SULTAN ISMAIL, \n50250 KUALA LUMPUR \nTEL 03-2110 2833 \n(GST Reg. No 001774428160 \nTax Invoice \nTable D2 \nOdr No: 199535 \nBill#:V001-201060 \nDate : 29-06-2018 19:59:15 \nPax(s): 11 \nCashier: AARON \nTotal TAX \nQty \nDescription \n11 (28.01) Adult \nD 709.50 SR \n709.50 \nSubtotal: \n70.95 \nServ. Charge (10%): \n0.00 \nGST Payable (0%): \n780.45 \nTotal: \n780.45 \nTOTAL: \nClosed: 001 \n29-06-2018 21:52:03 \nServer: AARON \n780.45 \nVISA \n- ******3042 \n- LIM CHAI JA"],
196
+ ["INDIA ANANDA PRIVATE BHAVAN LTD SWEETS \n[A2B VEG RESTAURANT] \nNO, ,27,BDA COMPLEX \n- HSR LAYOUT - BANGALORE \n:560102Ph:25725399 \nKARNATAKA \nGSTIN:29AAICA3787F1ZC \nINVOICE \nB.No : CTR116/138928 \n/ \nPay Mode:CARD \nSman: SHASHI KUMARA.M \nDate :09/Jul72017 11:28:44 AM \n841.1 By: SONU KUMAR \nParticulars \nGST HSN/SAC \nQty \nSEAL \nRate \nAmount \nPOMEGRANATE JUICE \n18% 00441067 \n1.000 \n70.00 \n70.00 \nONION UTTAPAM \n18% 00441067 \n1.000 \n80.00 \n80.00 \nTot Itms2 \nSub Total \n150.00 \nSGST 9 % \n13.50 \nCGST 9 % \n13.50 \nTotal Invoice \n177.00 \nBILL AMOUNT 177.00/- \nerminal No : SS-88077 \nOff: \nNO 9, MAHATMA GANDHI ROAD , SHASTRI NAGAR, ADYAR \nCHENNAI, PINCODE:600020 Website: aabsweets.com"],
197
+ ],
198
+ )
199
+
200
+ with gr.Blocks(css="style.css") as demo:
201
+ gr.Markdown(description)
202
+ gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
203
+ chat_interface.render()
204
+
205
+ demo.queue()
206
+ demo.launch(server_name=args.host, server_port=args.port, show_error=True, share=True)
207
+
208
+ if __name__ == "__main__":
209
+ args = parse_config()
210
+ generate(args)
211
+
gorilla/code_interpreter.py ADDED
@@ -0,0 +1,117 @@
1
+ # import traceback
2
+ # import sys
3
+ # import os
4
+ # import builtins
5
+
6
+ # # Namespace for variable storage
7
+ # custom_namespace = {}
8
+
9
+ # # Base directory for file storage
10
+ # base_storage_path = '/path/to/safe/storage'
11
+
12
+ # def execute_code(code, local_namespace=None):
13
+ # if local_namespace is None:
14
+ # local_namespace = {}
15
+
16
+ # try:
17
+ # # Override the built-in __import__ if needed
18
+ # # def __custom_import__(name, globals=None, locals=None, fromlist=(), level=0):
19
+ # # if name not in safe_imports:
20
+ # # raise ImportError(f"Import of {name} is not allowed")
21
+ # # return original_import(name, globals, locals, fromlist, level)
22
+
23
+ # # original_import = builtins.__import__
24
+ # # builtins.__import__ = __custom_import__
25
+
26
+ # # Redirect file operations to a safe directory
27
+ # # os.chdir(base_storage_path)
28
+
29
+ # compiled_code = compile(code, "<string>", 'exec')
30
+ # exec(compiled_code, custom_namespace, local_namespace)
31
+ # except Exception as e:
32
+ # exc_type, exc_value, exc_traceback = sys.exc_info()
33
+ # formatted_lines = traceback.format_exc().splitlines()
34
+ # error_message = "\n".join(formatted_lines)
35
+ # print(f"An exception occurred: {error_message}", file=sys.stderr)
36
+ # # finally:
37
+ # # Reset the built-in __import__ to its original state if overridden
38
+ # # builtins.__import__ = original_import
39
+
40
+ # # Redirect back to the original directory if changed
41
+ # # os.chdir(original_directory)
42
+
43
+ # # Example usage
44
+ # code_to_run = """
45
+ # import os
46
+ # print("Hello, World!")
47
+ # print(os.getcwd()) # This will print the current working directory
48
+ # """
49
+ # execute_code(code_to_run, custom_namespace)
50
+
51
+
52
+
53
+ # from IPython.core.interactiveshell import InteractiveShell
54
+ # shell = InteractiveShell()
55
+
56
+
57
+ # def execute_code_ipython(code):
58
+ # try:
59
+ # # Execute the code
60
+ # result = shell.run_cell(code)
61
+ # if result.error_in_exec is not None:
62
+ # raise result.error_in_exec
63
+ # except Exception as e:
64
+ # # Handle exceptions
65
+ # print(f"An error occurred: {e}")
66
+
67
+
68
+ # # Example usage
69
+ # code_to_run = """
70
+ # import os
71
+
72
+ # print("Hello, World!")
73
+ # print(os.getcwd())
74
+ # """
75
+ # execute_code_ipython(code_to_run)
76
+
77
+
78
+ import ipyparallel as ipp
79
+ import os
80
+
81
+
82
+ # run `ipcluster start -n 4` in the terminal to start the cluster using os.system
83
+ os.system('ipcluster start -n 4 --daemonize') # daemonize so the call returns instead of blocking this process
84
+
85
+ client = ipp.Client()
86
+ dview = client[:]
87
+
88
+ def execute_code_parallel(code):
89
+ # Use the `execute` method of the DirectView
90
+ async_results = dview.execute(code)
91
+
92
+ # Gather and return per-engine results
93
+ async_results.wait() # block until every engine has finished executing the code
94
+
95
+ results = []
96
+ try:
97
+ # .get() re-raises any exception that occurred on an engine
98
+ async_results.get()
99
+ # captured stdout from each engine, in engine order
100
+ results = list(async_results.stdout)
101
+ except Exception as e:
102
+ results.append(f"Error during parallel execution: {e}")
103
+
104
+ return results
105
+
106
+
107
+
108
+ # Example usage
109
+ # code_to_run = "import os; os.getpid()" # Simple code to test parallel execution
110
+ code_to_run = """
111
+ import os
112
+
113
+ print("Hello, World!")
114
+ print(os.getcwd())
115
+ """
116
+ execute_code_parallel(code_to_run)
117
+ # print(results)
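A minimal usage sketch for the parallel interpreter above, assuming a 4-engine ipcluster is already running; the sample code string and the shutdown call are illustrative, not part of this file:

    import ipyparallel as ipp

    client = ipp.Client()
    dview = client[:]

    ar = dview.execute("import os\nresult = os.getpid()")  # run the snippet on every engine
    ar.wait()                                              # block until all engines finish
    print(dview.pull("result").get())                      # fetch the per-engine `result` variable

    client.shutdown(hub=True)                              # stop engines and controller when done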
gorilla/ds_configs/stage2.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "train_micro_batch_size_per_gpu": "auto",
3
+ "gradient_accumulation_steps": "auto",
4
+ "gradient_clipping": "auto",
5
+ "zero_allow_untested_optimizer": true,
6
+ "bf16": {
7
+ "enabled": "auto",
8
+ "loss_scale": 0,
9
+ "initial_scale_power": 16,
10
+ "loss_scale_window": 1000,
11
+ "hysteresis": 2,
12
+ "min_loss_scale": 1
13
+ },
14
+ "zero_optimization": {
15
+ "stage": 2,
16
+ "allgather_partitions": true,
17
+ "allgather_bucket_size": 1e9,
18
+ "reduce_scatter": true,
19
+ "reduce_bucket_size": 1e9,
20
+ "overlap_comm": true,
21
+ "contiguous_gradients": true
22
+ }
23
+ }
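The "auto" entries in this config are resolved by the HuggingFace Trainer at launch time; a minimal sketch of how the file is wired in, with the output path and batch sizes as illustrative placeholders:

    import transformers

    training_args = transformers.TrainingArguments(
        output_dir="./jack-alm",                     # hypothetical output directory
        per_device_train_batch_size=1,               # fills train_micro_batch_size_per_gpu
        gradient_accumulation_steps=8,               # fills gradient_accumulation_steps
        bf16=True,                                   # fills bf16.enabled
        deepspeed="gorilla/ds_configs/stage2.json",  # this config file
    )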
gorilla/ds_configs/stage3.json ADDED
@@ -0,0 +1,49 @@
1
+ {
2
+ "bf16": {
3
+ "enabled": "auto"
4
+ },
5
+ "optimizer": {
6
+ "type": "AdamW",
7
+ "params": {
8
+ "lr": "auto",
9
+ "betas": "auto",
10
+ "eps": "auto",
11
+ "weight_decay": "auto"
12
+ }
13
+ },
14
+ "scheduler": {
15
+ "type": "WarmupDecayLR",
16
+ "params": {
17
+ "total_num_steps": "auto",
18
+ "warmup_min_lr": "auto",
19
+ "warmup_max_lr": "auto",
20
+ "warmup_num_steps": "auto"
21
+ }
22
+ },
23
+ "zero_optimization": {
24
+ "stage": 3,
25
+ "offload_optimizer": {
26
+ "device": "cpu",
27
+ "pin_memory": true
28
+ },
29
+ "offload_param": {
30
+ "device": "cpu",
31
+ "pin_memory": true
32
+ },
33
+ "overlap_comm": true,
34
+ "contiguous_gradients": true,
35
+ "sub_group_size": 1e9,
36
+ "reduce_bucket_size": "auto",
37
+ "stage3_prefetch_bucket_size": "auto",
38
+ "stage3_param_persistence_threshold": "auto",
39
+ "stage3_max_live_parameters": 1e9,
40
+ "stage3_max_reuse_distance": 1e9,
41
+ "stage3_gather_16bit_weights_on_model_save": false
42
+ },
43
+ "gradient_accumulation_steps": "auto",
44
+ "gradient_clipping": "auto",
45
+ "steps_per_print": 5,
46
+ "train_batch_size": "auto",
47
+ "train_micro_batch_size_per_gpu": "auto",
48
+ "wall_clock_breakdown": false
49
+ }
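Compared with stage2.json, this config enables ZeRO stage 3, which additionally partitions the model parameters across GPUs and offloads both the optimizer state and the parameters to CPU memory, trading step time for a much smaller per-GPU footprint.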
gorilla/eval.py ADDED
@@ -0,0 +1,175 @@
1
+ # Written by Yukang Chen
2
+ # Some code based on https://github.com/epfml/landmark-attention
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ import math
18
+ import torch
19
+ import argparse
20
+ import random
21
+ import numpy as np
22
+ from tqdm import tqdm
23
+ import transformers
24
+ from peft import PeftModel
25
+ from llama_attn_replace import replace_llama_attn
26
+
27
+ def parse_config():
28
+ parser = argparse.ArgumentParser(description='arg parser')
29
+ parser.add_argument('--batch_size', type=int, default=32, help='batch size during inference')
30
+ parser.add_argument('--base_model', type=str, default="meta-llama/Llama-2-13b-hf")
31
+ parser.add_argument('--cache_dir', type=str, default="./cache")
32
+ parser.add_argument('--seq_len', type=int, default=2048, help='context length during evaluation')
33
+ parser.add_argument('--context_size', type=int, default=-1, help='context size during fine-tuning')
34
+ parser.add_argument('--peft_model', type=str, default=None, help='')
35
+ parser.add_argument('--flash_attn', type=bool, default=True, help='')
36
+ parser.add_argument('--data_path', type=str, default="./test.bin", help='')
37
+ args = parser.parse_args()
38
+ return args
39
+
40
+ def get_as_batch(data, seq_length, batch_size, device='cpu', sliding_window=256):
41
+ all_ix = list(range(0, len(data) - seq_length, sliding_window))
42
+ all_ix.pop()
43
+
44
+ for idx in range(0, len(all_ix), batch_size):
45
+ ix = all_ix[idx:idx+batch_size]
46
+ assert all([idx + seq_length + 1 <= len(data) for idx in ix])
47
+ x = torch.stack([torch.from_numpy((data[i:i+seq_length]).astype(np.int64)) for i in ix])
48
+ y = torch.stack([torch.from_numpy((data[i+1:i+1+seq_length]).astype(np.int64)) for i in ix])
49
+ if device != 'cpu':
50
+ x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
51
+ yield x, y
52
+
53
+ def iceildiv(x, y):
54
+ return (x + y - 1) // y
55
+
56
+ def evaluate(model, data, batch_size, device, seq_length, sliding_window=256, use_cache=False):
57
+ stats = {}
58
+
59
+ model.eval()
60
+
61
+ loss_list_val, acc_list = [], []
62
+ loss_step_list_val = []
63
+
64
+ with torch.no_grad():
65
+ print(f"Using seq length {seq_length}")
66
+ torch.set_printoptions(sci_mode=False)
67
+ for idx, (x, y) in tqdm(
68
+ enumerate(
69
+ get_as_batch(
70
+ data['val'],
71
+ seq_length,
72
+ batch_size,
73
+ device=device,
74
+ sliding_window=sliding_window
75
+ )
76
+ ),
77
+ total=iceildiv(
78
+ iceildiv(len(data['val']), sliding_window),
79
+ batch_size
80
+ )
81
+ ):
82
+ val_loss = 0.
83
+ acc = 0.
84
+ cnt = 0
85
+
86
+ for part_idx, i in enumerate(range(0, x.shape[1], seq_length)):
87
+ part_len = x[:, i:i + seq_length].shape[1]
88
+
89
+ outputs = model(
90
+ input_ids=x[:, i:i + seq_length],
91
+ labels=x[:, i:i+seq_length].contiguous(),
92
+ use_cache=use_cache)
93
+
94
+ val_loss = outputs.loss * part_len + val_loss
95
+ acc = ((outputs.logits.argmax(-1) == y[:, i:i+seq_length]).float().sum()) + acc
96
+ cnt += part_len
97
+ while len(loss_step_list_val) <= part_idx:
98
+ loss_step_list_val.append([])
99
+ loss_step_list_val[part_idx].append(outputs.loss.item())
100
+ val_loss /= cnt
101
+ acc /= cnt
102
+
103
+ loss_list_val.append(val_loss.item())
104
+ acc_list.append(acc.item())
105
+
106
+ stats['val_acc'] = torch.as_tensor(acc_list).mean().item()
107
+ stats['val_loss'] = torch.as_tensor(loss_list_val).mean().item()
108
+ stats['val_perplexity'] = math.exp(stats['val_loss'])
109
+ stats['val_perplexity_per_chunk'] = torch.exp(torch.as_tensor(loss_step_list_val).mean(dim=1))
110
+
111
+ return stats
112
+
113
+ def main(args):
114
+
115
+ device = "cuda:0"
116
+ seed = 2
117
+ torch.cuda.set_device(device)
118
+
119
+ torch.manual_seed(seed)
120
+ random.seed(seed)
121
+ np.random.seed(seed)
122
+
123
+ data = {'val': np.memmap(args.data_path, dtype=np.uint16, mode='r')}
124
+
125
+ print(f"Num validation tokens: {len(data['val'])}")
126
+ print("data path", args.data_path)
127
+ print("base model", args.base_model)
128
+ print("peft model", args.peft_model)
129
+
130
+ if args.flash_attn:
131
+ replace_llama_attn(use_flash_attn=True, use_full=True)
132
+
133
+ # Set RoPE scaling factor
134
+ config = transformers.AutoConfig.from_pretrained(
135
+ args.base_model,
136
+ cache_dir=args.cache_dir,
137
+ )
138
+
139
+ context_size = args.context_size if args.context_size > 0 else args.seq_len
140
+ orig_ctx_len = getattr(config, "max_position_embeddings", None) # this value should be 4096 for LLaMA2 models
141
+ if orig_ctx_len and context_size > orig_ctx_len:
142
+ scaling_factor = float(math.ceil(context_size / orig_ctx_len))
143
+ config.rope_scaling = {"type": "linear", "factor": scaling_factor}
144
+
145
+ # Load model and tokenizer
146
+ model = transformers.AutoModelForCausalLM.from_pretrained(
147
+ args.base_model,
148
+ config=config,
149
+ cache_dir=args.cache_dir,
150
+ torch_dtype=torch.float16,
151
+ device_map="auto",
152
+ )
153
+ model.resize_token_embeddings(32001)
154
+
155
+ if args.peft_model:
156
+ trainable_params = os.path.join(args.peft_model, "trainable_params.bin")
157
+ if os.path.isfile(trainable_params):
158
+ model.load_state_dict(torch.load(trainable_params, map_location=model.device), strict=False)
159
+ else:
160
+ raise ValueError("Trainable input embedding and normalization are required.")
161
+ model = PeftModel.from_pretrained(
162
+ model,
163
+ args.peft_model,
164
+ device_map="auto",
165
+ torch_dtype=torch.float16,
166
+ )
167
+
168
+ stats = evaluate(model, data, args.batch_size, device, args.seq_len, sliding_window=256)
169
+
170
+ print(stats)
171
+
172
+
173
+ if __name__ == "__main__":
174
+ args = parse_config()
175
+ main(args)
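eval.py expects --data_path to point at a flat binary file of uint16 token ids (read back with np.memmap above). A minimal sketch of producing such a file; the corpus path and tokenizer name are illustrative placeholders:

    import numpy as np
    import transformers

    tokenizer = transformers.AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-hf")
    text = open("val_corpus.txt").read()               # hypothetical validation corpus
    ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    # LLaMA-2 ids (< 32001) fit in uint16, matching the np.memmap(dtype=np.uint16) reader
    np.array(ids, dtype=np.uint16).tofile("test.bin")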
gorilla/fine-tune.py ADDED
@@ -0,0 +1,206 @@
1
+ import os
2
+ import math
3
+ from dataclasses import dataclass, field
4
+ from functools import partial
5
+ from typing import Dict, Optional, Sequence
6
+
7
+ import torch
8
+ import transformers
9
+ import pandas as pd
10
+ from torch.utils.data import Dataset
11
+ from transformers import Trainer, DataCollatorForLanguageModeling
12
+ from llama_attn_replace import replace_llama_attn
13
+ from peft import LoraConfig, get_peft_model
14
+ from torch.distributed import barrier
15
+
16
+ import datasets
17
+ from datasets import load_dataset
18
+
19
+ IGNORE_INDEX = -100
20
+ DEFAULT_PAD_TOKEN = "[PAD]"
21
+ DEFAULT_EOS_TOKEN = "</s>"
22
+ DEFAULT_BOS_TOKEN = "<s>"
23
+ DEFAULT_UNK_TOKEN = "<unk>"
24
+
25
+
26
+ @dataclass
27
+ class ModelArguments:
28
+ model_name_or_path: Optional[str] = field(default="EleutherAI/pythia-1.4b-deduped")
29
+ model_type: Optional[str] = field(default="llama")
30
+
31
+ @dataclass
32
+ class TrainingArguments(transformers.TrainingArguments):
33
+ cache_dir: Optional[str] = field(default=None)
34
+ optim: str = field(default="adamw_torch")
35
+ model_max_length: int = field(
36
+ default=8192 * 4,
37
+ metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
38
+ )
39
+ use_flash_attn: bool = field(
40
+ default=True,
41
+ metadata={"help": "Whether use flash attention for training."},
42
+ )
43
+ use_full_attn: bool = field(
44
+ default=False,
45
+ metadata={"help": "Whether to use plain, full-attention for training."},
46
+ )
47
+ low_rank_training: bool = field(
48
+ default=True,
49
+ metadata={"help": "Whether use low rank adaptation for training."},
50
+ )
51
+ trainable_params: str = field(
52
+ default="embed,norm",
53
+ metadata={"help": "Additional trainable parameters except LoRA weights, if low rank training."},
54
+ )
55
+
56
+ def smart_tokenizer_and_embedding_resize(
57
+ special_tokens_dict: Dict,
58
+ tokenizer: transformers.PreTrainedTokenizer,
59
+ model: transformers.PreTrainedModel,
60
+ ):
61
+ """Resize tokenizer and embedding.
62
+
63
+ Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
64
+ """
65
+ num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
66
+ model.resize_token_embeddings(len(tokenizer))
67
+
68
+ if num_new_tokens > 0:
69
+ input_embeddings = model.get_input_embeddings().weight.data
70
+ output_embeddings = model.get_output_embeddings().weight.data
71
+
72
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
73
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
74
+
75
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
76
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
77
+
78
+ def tokenize_fn(tokenizer, example):
79
+ context_length = tokenizer.model_max_length
80
+ outputs = tokenizer(
81
+ tokenizer.eos_token.join([i + "\n" + o for i, o in zip(example["instruction"], example["output"])]), # the mapped dataset has no "text" column; pack instruction/response pairs instead
82
+ truncation=False,
83
+ return_tensors="pt",
84
+ pad_to_multiple_of=context_length,
85
+ padding=True,
86
+ )
87
+ return {"input_ids": outputs["input_ids"].view(-1, context_length)}
88
+
89
+ def train():
90
+ parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
91
+ model_args, training_args = parser.parse_args_into_dataclasses()
92
+
93
+ # NOTE: May expand supported model types in the future
94
+ # if model_args.model_type == "gpt-neox":
95
+ # replace_gpt_neox_attn(training_args.use_flash_attn, training_args.use_full_attn)
96
+ # else:
97
+ # assert model_args.model_type == "llama", "Only support llama and gpt-neox for now"
98
+ replace_llama_attn(training_args.use_flash_attn, training_args.use_full_attn)
99
+
100
+ # Set RoPE scaling factor
101
+ config = transformers.AutoConfig.from_pretrained(
102
+ model_args.model_name_or_path,
103
+ cache_dir=training_args.cache_dir,
104
+ )
105
+
106
+ orig_rope_scaling = getattr(config, "rope_scaling", None) or {"factor": 1}
107
+ orig_rope_scaling_factor = orig_rope_scaling["factor"] if "factor" in orig_rope_scaling.keys() else 1
108
+ orig_ctx_len = getattr(config, "max_position_embeddings", None)
109
+ if orig_ctx_len:
110
+ orig_ctx_len *= orig_rope_scaling_factor
111
+ if training_args.model_max_length > orig_ctx_len:
112
+ scaling_factor = float(math.ceil(training_args.model_max_length / orig_ctx_len))
113
+ config.rope_scaling = {"type": "linear", "factor": scaling_factor}
114
+
115
+ # Load model and tokenizer
116
+ model = transformers.AutoModelForCausalLM.from_pretrained(
117
+ model_args.model_name_or_path,
118
+ config=config,
119
+ cache_dir=training_args.cache_dir,
120
+ torch_dtype=torch.bfloat16,
121
+ )
122
+
123
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
124
+ model_args.model_name_or_path,
125
+ cache_dir=training_args.cache_dir,
126
+ model_max_length=training_args.model_max_length,
127
+ padding_side="right",
128
+ use_fast=True,
129
+ )
130
+
131
+ special_tokens_dict = dict()
132
+ if tokenizer.pad_token is None:
133
+ special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
134
+ if tokenizer.eos_token is None:
135
+ special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
136
+ if tokenizer.bos_token is None:
137
+ special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
138
+ if tokenizer.unk_token is None:
139
+ special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
140
+
141
+ smart_tokenizer_and_embedding_resize(
142
+ special_tokens_dict=special_tokens_dict,
143
+ tokenizer=tokenizer,
144
+ model=model,
145
+ )
146
+
147
+ rank = int(os.environ.get('RANK', -1))
148
+ if rank > 0:
149
+ barrier()
150
+ # dataset = load_dataset("togethercomputer/RedPajama-Data-1T-Sample", cache_dir=training_args.cache_dir)
151
+
152
+ print('Loading line item data')
153
+ df_i = pd.read_csv('/home/tosi-n/ark/data/jack_line_item_ner_task.csv', sep='\t')[['context', 'instruction', 'response']]
154
+ df_ii = pd.read_csv('/home/tosi-n/ark/data/alm_task_data.csv')[['context', 'instruction', 'response']]
155
+ df = pd.concat([df_i, df_ii], ignore_index=True)
156
+ # Replace NoneType with empty string
157
+ df = df.fillna('')
158
+ alm_task_data = datasets.Dataset.from_pandas(df)
159
+ alm_task_data = (alm_task_data
160
+ .remove_columns('context')
161
+ # .rename_column('context', 'input')
162
+ .rename_column('response', 'output'))
163
+
164
+ dataset = alm_task_data.map(partial(tokenize_fn,tokenizer),batched=True, num_proc=128)#, remove_columns=["text", "meta"]
165
+
166
+ if rank == 0:
167
+ barrier()
168
+
169
+ print(dataset)
170
+
171
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
172
+
173
+ if training_args.low_rank_training:
174
+ if model_args.model_type == "gpt-neox":
175
+ # added `dense` to match with llama as the basic LoRA would only target 'query_key_value'
176
+ targets = ["query_key_value", "dense"]
177
+ else:
178
+ targets=["q_proj", "k_proj", "v_proj", "o_proj"]
179
+
180
+ config = LoraConfig(
181
+ r=8,
182
+ lora_alpha=16,
183
+ target_modules=targets,
184
+ lora_dropout=0,
185
+ bias="none",
186
+ task_type="CAUSAL_LM",
187
+ )
188
+ model = get_peft_model(model, config)
189
+ # enable trainable params
190
+ [p.requires_grad_() for n, p in model.named_parameters() if any([k in n for k in training_args.trainable_params.split(",")])]
191
+
192
+ model.config.use_cache = False # required for gradient checkpointing
193
+ model.enable_input_require_grads() # required for gradient checkpointing
194
+ model.gradient_checkpointing_enable() # enable gradient checkpointing
195
+ trainer = Trainer(
196
+ model=model, tokenizer=tokenizer, args=training_args,
197
+ train_dataset=dataset, # `dataset` is a single split here, not a DatasetDict
198
+ eval_dataset=None,
199
+ data_collator=data_collator)
200
+ trainer.train()
201
+ trainer.save_state()
202
+ trainer.save_model(output_dir=training_args.output_dir)
203
+
204
+
205
+ if __name__ == "__main__":
206
+ train()
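After get_peft_model and the requires_grad_ pass over the embed/norm parameters, only a small fraction of the weights should be trainable; a short sanity check, illustrative rather than part of the script:

    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"trainable params: {trainable} / {total} ({100 * trainable / total:.2f}%)")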
gorilla/get_trainable_weights.py ADDED
@@ -0,0 +1,37 @@
1
+ import os
2
+ import torch
3
+ import argparse
4
+
5
+ def parse_config():
6
+ parser = argparse.ArgumentParser(description='arg parser')
7
+ parser.add_argument('--checkpoint_path', type=str, default="/home/tosi-n/ark/jack-alm/checkpoint-800/")
8
+ parser.add_argument('--trainable_params', type=str, default="embed,norm")
9
+ args = parser.parse_args()
10
+ return args
11
+
12
+
13
+ def main(args):
14
+ path = args.checkpoint_path
15
+ trainable_params = args.trainable_params.split(",")
16
+
17
+ weights_all = torch.load(os.path.join(path, "pytorch_model.bin"))
18
+
19
+ weights_trainable = {}
20
+ weights_lora = {}
21
+ for k in weights_all:
22
+ if "lora" in k:
23
+ k_new = k.replace("default.", "") if "default." in k else k
24
+ weights_lora[k_new] = weights_all[k]
25
+ else:
26
+ if any([n in k for n in trainable_params]):
27
+ weights_trainable[k[17:]] = weights_all[k]
28
+
29
+ adapter_model = os.path.join(path, "adapter_model.bin")
30
+ trainable_params = os.path.join(path, "trainable_params.bin")
31
+ if not os.path.isfile(adapter_model):
32
+ torch.save(weights_lora, adapter_model)
33
+ torch.save(weights_trainable, trainable_params)
34
+
35
+ if __name__ == "__main__":
36
+ args = parse_config()
37
+ main(args)
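The two files written here are consumed downstream: eval.py (above) loads trainable_params.bin alongside the LoRA adapter, and adapter_model.bin is the standard PEFT adapter checkpoint picked up by PeftModel.from_pretrained.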
gorilla/infer.py ADDED
@@ -0,0 +1,143 @@
1
+ import argparse
2
+ import math
3
+ import sys
4
+ import time
5
+ import transformers
6
+ import torch
7
+ from threading import Thread
8
+ from transformers import TextIteratorStreamer
9
+ from llama_attn_replace import replace_llama_attn
10
+
11
+ def parse_config():
12
+ parser = argparse.ArgumentParser(description='arg parser')
13
+ parser.add_argument('--base_model', type=str, default="jenesys-ai/jack-alm-13b-8k-hf")
14
+ parser.add_argument('--cache_dir', type=str, default="./cache")
15
+ parser.add_argument('--context_size', type=int, default=-1, help='context size during fine-tuning')
16
+ parser.add_argument('--flash_attn', type=bool, default=True, help='')
17
+ parser.add_argument('--temperature', type=float, default=0.1, help='')
18
+ parser.add_argument('--top_p', type=float, default=1, help='')
19
+ parser.add_argument('--max_gen_len', type=int, default=512, help='')
20
+ parser.add_argument('--chat_type', type=str, default='conversational-jack', help='Chat type: conversational-jack, line-item-jack')
21
+ args = parser.parse_args()
22
+ return args
23
+
24
+ def build_generator(model, tokenizer, use_cache=True):
25
+ def response(message, max_gen_len, temperature, top_p, chat_type='conversational-jack'):
26
+
27
+ prompt_template = (
28
+ # "Below is an instruction that describes a task. "
29
+ """You're Jack, a virtual accountant created by Jenesys HQ Ltd.
30
+ You are able to communicate in a polite manner, with emotions of ecstasy, trust and jokes, at a professional level
31
+ and with a very reserved English communication culture. Answer the following questions as best you can,
32
+ speaking as a British elite from the 21st century might speak.
33
+ """
34
+ """As a virtual accountant, you are designed to follow the user's instructions carefully.
35
+ You are responsible for a range of financial tasks, operations and queries as listed below:
36
+ 1. Budget balance inquiry
37
+ 2. Expense request
38
+ 3. Company policy enquiries
39
+ 4. Financial and accounting queries
40
+ 5. Limited general enquiries
41
+ """
42
+ "Write a response that appropriately completes the request.\n\n"
43
+ "### Instruction:\n{instruction}\n\n### Response:"
44
+ )
45
+
46
+
47
+ line_item_prompt_template = (
48
+ "#Invoice line item extraction - "
49
+ "You, Jack, are an accounting-domain named entity recognizer completing the following task:\n\n"
50
+ "### Invoice input-:\n{instruction}\n Return Response as a list of dictionary for each line item 'Description', 'Quantity', 'Unit_price', 'Tax %', 'Total'. \n\n### Response:"
51
+ )
52
+
53
+
54
+ if chat_type == 'conversational-jack':
55
+ prompt = prompt_template.format(instruction=message)
56
+ elif chat_type == 'line-item-jack':
57
+ prompt = line_item_prompt_template.format(instruction=message)
58
+ # prompt = conversation
59
+
60
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
61
+
62
+ if len(inputs['input_ids'][0]) > 8192:
63
+ return "This demo supports inputs of fewer than 8192 tokens, while the current input has %d. Please use material with fewer tokens."%len(inputs['input_ids'][0])
64
+ torch.cuda.empty_cache()
65
+
66
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
67
+ generate_kwargs = dict(**inputs,
68
+ max_new_tokens=max_gen_len,
69
+ temperature=temperature,
70
+ top_p=top_p,
71
+ use_cache=use_cache,
72
+ streamer=streamer,
73
+ )
74
+
75
+ t = Thread(target=model.generate, kwargs=generate_kwargs)
76
+ t.start()
77
+
78
+ generated_text = ""
79
+ start_time = time.time()
80
+
81
+ for new_text in streamer:
82
+ generated_text += new_text
83
+ tokens_per_sec = len(generated_text.split()) / (time.time() - start_time)
84
+
85
+ suffix = f" ({tokens_per_sec:.2f} tokens/sec)"
86
+ # # yield f"{generated_text} ({tokens_per_sec:.2f} tokens/sec)"
87
+ sys.stdout.write(f"\r\033[K{generated_text}{suffix}")
88
+ sys.stdout.flush()
89
+ # sys.stdout.write("\n") # Move to a new line after generation is complete
90
+ return generated_text
91
+
92
+ return response
93
+
94
+ def load_model():
95
+ args = parse_config()
96
+
97
+ if args.flash_attn:
98
+ replace_llama_attn(inference=True)
99
+
100
+ # Set RoPE scaling factor
101
+ config = transformers.AutoConfig.from_pretrained(
102
+ args.base_model,
103
+ cache_dir=args.cache_dir,
104
+ )
105
+
106
+ orig_ctx_len = getattr(config, "max_position_embeddings", None)
107
+ if orig_ctx_len and args.context_size > orig_ctx_len:
108
+ scaling_factor = float(math.ceil(args.context_size / orig_ctx_len))
109
+ config.rope_scaling = {"type": "linear", "factor": scaling_factor}
110
+
111
+ # Load model and tokenizer
112
+ model = transformers.AutoModelForCausalLM.from_pretrained(
113
+ args.base_model,
114
+ config=config,
115
+ cache_dir=args.cache_dir,
116
+ torch_dtype=torch.float16,
117
+ load_in_4bit=True,
118
+ device_map="auto",
119
+ )
120
+ model.resize_token_embeddings(32001)
121
+
122
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
123
+ args.base_model,
124
+ cache_dir=args.cache_dir,
125
+ model_max_length=args.context_size if args.context_size > orig_ctx_len else orig_ctx_len,
126
+ padding_side="right",
127
+ use_fast=True,
128
+ )
129
+
130
+ model.eval()
131
+ respond = build_generator(model, tokenizer)
132
+ return respond
133
+
134
+ respond = load_model()
135
+
136
+ def generate_response(message, max_gen_len=512, temperature=0.1, top_p=1, chat_type='line-item-jack'):
137
+ return respond(
138
+ message=message,
139
+ max_gen_len=max_gen_len,
140
+ temperature=temperature,
141
+ top_p=top_p,
142
+ chat_type=chat_type
143
+ )
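A minimal usage sketch, run from the gorilla/ directory on a machine with the fine-tuned weights available; importing the module loads the quantised model once (respond = load_model() runs at import time), and the OCR string below is a hypothetical placeholder:

    from infer import generate_response

    ocr_text = "Qty 2 each WIDGETA Widget A 5.00 10.00"  # hypothetical OCR line items
    answer = generate_response(ocr_text, max_gen_len=256, temperature=0.1, chat_type='line-item-jack')
    print(answer)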
gorilla/llama_attn_replace.py ADDED
@@ -0,0 +1,477 @@
1
+ # Modified based on https://github.com/lm-sys/FastChat
2
+
3
+ import warnings
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ from torch import nn; import torch.nn.functional as F # F is used by forward_noflashattn when pretraining_tp > 1
8
+ import transformers
9
+ from einops import rearrange
10
+ from flash_attn import __version__ as flash_attn_version
11
+ from flash_attn.bert_padding import pad_input, unpad_input
12
+ from flash_attn.flash_attn_interface import (
13
+ flash_attn_func,
14
+ flash_attn_varlen_kvpacked_func,
15
+ flash_attn_varlen_qkvpacked_func
16
+ )
17
+ from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv, rotate_half
18
+ from flash_attn.bert_padding import unpad_input, pad_input
19
+ import math
20
+
21
+ group_size_ratio = 1/4
22
+ def forward_flashattn(
23
+ self,
24
+ hidden_states: torch.Tensor,
25
+ attention_mask: Optional[torch.Tensor] = None,
26
+ position_ids: Optional[torch.Tensor] = None,
27
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
28
+ output_attentions: bool = False,
29
+ use_cache: bool = False,
30
+ padding_mask: Optional[torch.LongTensor] = None,
31
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
32
+ """Input shape: Batch x Time x Channel
33
+
34
+ attention_mask: [bsz, q_len]
35
+ """
36
+ if not self.training:
37
+ warnings.warn("This function should be used just for training as it may exhibit reduced inference performance. For inference, please use forward_flashattn_inference.")
38
+
39
+ if output_attentions:
40
+ warnings.warn(
41
+ "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
42
+ )
43
+
44
+ bsz, q_len, _ = hidden_states.size()
45
+
46
+ query_states = (
47
+ self.q_proj(hidden_states)
48
+ .view(bsz, q_len, self.num_heads, self.head_dim)
49
+ .transpose(1, 2)
50
+ )
51
+ key_states = (
52
+ self.k_proj(hidden_states)
53
+ .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
54
+ .transpose(1, 2)
55
+ )
56
+ value_states = (
57
+ self.v_proj(hidden_states)
58
+ .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
59
+ .transpose(1, 2)
60
+ )
61
+ # [bsz, q_len, nh, hd]
62
+ # [bsz, nh, q_len, hd]
63
+
64
+ kv_seq_len = key_states.shape[-2]
65
+ if past_key_value is not None:
66
+ kv_seq_len += past_key_value[0].shape[-2]
67
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
68
+ query_states, key_states = apply_rotary_pos_emb(
69
+ query_states, key_states, cos, sin, position_ids
70
+ )
71
+
72
+ # Past Key value support
73
+ if past_key_value is not None:
74
+ # reuse k, v, self_attention
75
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
76
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
77
+
78
+ past_key_value = (key_states, value_states) if use_cache else None
79
+
80
+ # repeat k/v heads if n_kv_heads < n_heads
81
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
82
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
83
+
84
+ # Flash attention codes from
85
+ # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
86
+
87
+ # transform the data into the format required by flash attention
88
+ qkv = torch.stack(
89
+ [query_states, key_states, value_states], dim=2
90
+ ) # [bsz, nh, 3, q_len, hd]
91
+ qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
92
+
93
+ # We have disabled _prepare_decoder_attention_mask in LlamaModel
94
+ # the attention_mask should be the same as the key_padding_mask
95
+
96
+ key_padding_mask = attention_mask.repeat(2, 1)
97
+ nheads = qkv.shape[-2]
98
+ # shift
99
+
100
+ group_size = int(q_len * group_size_ratio)
101
+ if q_len % group_size > 0:
102
+ raise ValueError("q_len %d should be divisible by group size %d." % (q_len, group_size))
103
+
104
+ qkv = qkv.reshape(bsz, q_len, 3, 2, self.num_heads // 2, self.head_dim).permute(0, 3, 1, 2, 4, 5).reshape(bsz * 2,
105
+ q_len, 3,
106
+ self.num_heads // 2,
107
+ self.head_dim)
108
+ x = rearrange(qkv, "b s three h d -> b s (three h d)")
109
+ x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
110
+ cu_q_len_tmp = torch.arange(0, max_s, group_size, device=key_padding_mask.device, dtype=cu_q_lens.dtype)
111
+ cu_q_len_tmp = torch.stack([cu_q_len_tmp, cu_q_len_tmp + group_size // 2]).repeat(bsz, 1) + cu_q_lens[:-1].unsqueeze(-1)
112
+ cu_q_lens = torch.cat([cu_q_len_tmp, cu_q_lens[1:].unsqueeze(-1)], dim=-1).view(-1)
113
+
114
+ x_unpad = rearrange(
115
+ x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads // 2
116
+ )
117
+ output_unpad = flash_attn_varlen_qkvpacked_func(
118
+ x_unpad, cu_q_lens, group_size, 0.0, softmax_scale=None, causal=True
119
+ )
120
+ output = rearrange(
121
+ pad_input(
122
+ rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz * 2, q_len
123
+ ),
124
+ "b s (h d) -> b s h d",
125
+ h=nheads // 2,
126
+ )
127
+ output = output.reshape(bsz, 2, q_len, nheads // 2, self.head_dim).transpose(1, 2).reshape(bsz, q_len, nheads,
128
+ self.head_dim)
129
+
130
+ return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value
131
+
132
+ def forward_flashattn_full(
133
+ self,
134
+ hidden_states: torch.Tensor,
135
+ attention_mask: Optional[torch.Tensor] = None,
136
+ position_ids: Optional[torch.Tensor] = None,
137
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
138
+ output_attentions: bool = False,
139
+ use_cache: bool = False,
140
+ padding_mask: Optional[torch.LongTensor] = None,
141
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
142
+ """Input shape: Batch x Time x Channel
143
+
144
+ attention_mask: [bsz, q_len]
145
+ """
146
+ if output_attentions:
147
+ warnings.warn(
148
+ "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
149
+ )
150
+
151
+ bsz, q_len, _ = hidden_states.size()
152
+
153
+ query_states = (
154
+ self.q_proj(hidden_states)
155
+ .view(bsz, q_len, self.num_heads, self.head_dim)
156
+ .transpose(1, 2)
157
+ )
158
+ key_states = (
159
+ self.k_proj(hidden_states)
160
+ .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
161
+ .transpose(1, 2)
162
+ )
163
+ value_states = (
164
+ self.v_proj(hidden_states)
165
+ .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
166
+ .transpose(1, 2)
167
+ )
168
+ # [bsz, q_len, nh, hd]
169
+ # [bsz, nh, q_len, hd]
170
+
171
+ kv_seq_len = key_states.shape[-2]
172
+ if past_key_value is not None:
173
+ kv_seq_len += past_key_value[0].shape[-2]
174
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
175
+ query_states, key_states = apply_rotary_pos_emb(
176
+ query_states, key_states, cos, sin, position_ids
177
+ )
178
+
179
+ # Past Key value support
180
+ if past_key_value is not None:
181
+ # reuse k, v, self_attention
182
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
183
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
184
+
185
+ past_key_value = (key_states, value_states) if use_cache else None
186
+
187
+ # repeat k/v heads if n_kv_heads < n_heads
188
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
189
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
190
+
191
+ # Flash attention codes from
192
+ # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
193
+
194
+ # transform the data into the format required by flash attention
195
+ qkv = torch.stack(
196
+ [query_states, key_states, value_states], dim=2
197
+ ) # [bsz, nh, 3, q_len, hd]
198
+ qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
199
+
200
+ # We have disabled _prepare_decoder_attention_mask in LlamaModel
201
+ # the attention_mask should be the same as the key_padding_mask
202
+
203
+ key_padding_mask = attention_mask
204
+ nheads = qkv.shape[-2]
205
+ x = rearrange(qkv, "b s three h d -> b s (three h d)")
206
+ x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
207
+ x_unpad = rearrange(
208
+ x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads
209
+ )
210
+ output_unpad = flash_attn_varlen_qkvpacked_func(
211
+ x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
212
+ )
213
+ output = rearrange(
214
+ pad_input(
215
+ rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len
216
+ ),
217
+ "b s (h d) -> b s h d",
218
+ h=nheads,
219
+ )
220
+ output = output.reshape(bsz, q_len, self.num_heads, self.head_dim)
221
+
222
+ return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value
223
+
224
+
225
+ def forward_noflashattn(
226
+ self,
227
+ hidden_states: torch.Tensor,
228
+ attention_mask: Optional[torch.Tensor] = None,
229
+ position_ids: Optional[torch.LongTensor] = None,
230
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
231
+ output_attentions: bool = False,
232
+ use_cache: bool = False,
233
+ padding_mask: Optional[torch.LongTensor] = None,
234
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
235
+ bsz, q_len, _ = hidden_states.size()
236
+
237
+ group_size = int(q_len * group_size_ratio)
238
+
239
+ if q_len % group_size > 0:
240
+ raise ValueError("q_len %d should be divisible by group size %d."%(q_len, group_size))
241
+ num_group = q_len // group_size
242
+
243
+ if self.config.pretraining_tp > 1:
244
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
245
+ query_slices = self.q_proj.weight.split(
246
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
247
+ )
248
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
249
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
250
+
251
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
252
+ query_states = torch.cat(query_states, dim=-1)
253
+
254
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
255
+ key_states = torch.cat(key_states, dim=-1)
256
+
257
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
258
+ value_states = torch.cat(value_states, dim=-1)
259
+
260
+ else:
261
+ query_states = self.q_proj(hidden_states)
262
+ key_states = self.k_proj(hidden_states)
263
+ value_states = self.v_proj(hidden_states)
264
+
265
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
266
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
267
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
268
+
269
+ kv_seq_len = key_states.shape[-2]
270
+ if past_key_value is not None:
271
+ kv_seq_len += past_key_value[0].shape[-2]
272
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
273
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
274
+
275
+ if past_key_value is not None:
276
+ # reuse k, v, self_attention
277
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
278
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
279
+
280
+ past_key_value = (key_states, value_states) if use_cache else None
281
+
282
+ # repeat k/v heads if n_kv_heads < n_heads
283
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
284
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
285
+
286
+ # shift
287
+ def shift(qkv, bsz, q_len, group_size, num_heads, head_dim):
288
+ qkv[:, num_heads // 2:] = qkv[:, num_heads // 2:].roll(-group_size // 2, dims=2)
289
+ qkv = qkv.transpose(1, 2).reshape(bsz * (q_len // group_size), group_size, num_heads, head_dim).transpose(1, 2)
290
+ return qkv
291
+
292
+ query_states = shift(query_states, bsz, q_len, group_size, self.num_heads, self.head_dim)
293
+ key_states = shift(key_states, bsz, q_len, group_size, self.num_heads, self.head_dim)
294
+ value_states = shift(value_states, bsz, q_len, group_size, self.num_heads, self.head_dim)
295
+
296
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
297
+
298
+ if attn_weights.size() != (bsz * num_group, self.num_heads, group_size, group_size):
299
+ raise ValueError(
300
+ f"Attention weights should be of size {(bsz * num_group, self.num_heads, group_size, group_size)}, but is"
301
+ f" {attn_weights.size()}"
302
+ )
303
+
304
+ attention_mask = attention_mask[:, :, :group_size, :group_size].repeat(num_group, 1, 1, 1)
305
+ if attention_mask is not None:
306
+ if attention_mask.size() != (bsz * num_group, 1, group_size, group_size):
307
+ raise ValueError(
308
+ f"Attention mask should be of size {(bsz * num_group, 1, group_size, group_size)}, but is {attention_mask.size()}"
309
+ )
310
+ attn_weights = attn_weights + attention_mask
311
+
312
+ # compute the softmax in fp16 here (the reference implementation upcasts to fp32)
313
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float16).to(query_states.dtype) # torch.float32
314
+ attn_output = torch.matmul(attn_weights, value_states)
315
+
316
+ if attn_output.size() != (bsz * num_group, self.num_heads, group_size, self.head_dim):
317
+ raise ValueError(
318
+ f"`attn_output` should be of size {(bsz * num_group, self.num_heads, group_size, self.head_dim)}, but is"
319
+ f" {attn_output.size()}"
320
+ )
321
+ attn_output = attn_output.transpose(1, 2).contiguous()
322
+
323
+ attn_output = attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim)
324
+
325
+ # shift back
326
+ attn_output[:, :, self.num_heads//2:] = attn_output[:, :, self.num_heads//2:].roll(group_size//2, dims=1)
327
+
328
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
329
+
330
+ if self.config.pretraining_tp > 1:
331
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
332
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
333
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
334
+ else:
335
+ attn_output = self.o_proj(attn_output)
336
+
337
+ if not output_attentions:
338
+ attn_weights = None
339
+
340
+ return attn_output, attn_weights, past_key_value
341
+
342
+ # Disable the transformation of the attention mask in LlamaModel as the flash attention
343
+ # requires the attention mask to be the same as the key_padding_mask
344
+ def _prepare_decoder_attention_mask(
345
+ self, attention_mask, input_shape, inputs_embeds, past_key_values_length
346
+ ):
347
+ # [bsz, seq_len]
348
+ return attention_mask
349
+
350
+ def apply_rotary_pos_emb_inference(q, k, cos_sin, position_ids):
351
+ gather_indices = position_ids[:, :, None, None] # [bsz, seq_len, 1, 1]
352
+ gather_indices = gather_indices.repeat(
353
+ 1, 1, cos_sin[0].shape[1], cos_sin[0].shape[3]
354
+ )
355
+ bsz = gather_indices.shape[0]
356
+ cos, sin = (
357
+ torch.gather(x.transpose(1, 2).repeat(bsz, 1, 1, 1), 1, gather_indices)
358
+ for x in cos_sin
359
+ )
360
+ q, k = ((x * cos) + (rotate_half(x) * sin) for x in (q, k))
361
+ return q, k
362
+
363
+
364
+ def forward_flashattn_inference(
365
+ self,
366
+ hidden_states: torch.Tensor,
367
+ attention_mask: Optional[torch.Tensor] = None,
368
+ position_ids: Optional[torch.Tensor] = None,
369
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
370
+ output_attentions: bool = False,
371
+ use_cache: bool = False,
372
+ padding_mask: Optional[torch.Tensor] = None,
373
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
374
+ if output_attentions:
375
+ warnings.warn(
376
+ "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
377
+ )
378
+
379
+ bsz, q_len, _ = hidden_states.size()
380
+ kv_heads = getattr(self, "num_key_value_heads", self.num_heads)
381
+
382
+ q, k, v = (
383
+ op(hidden_states).view(bsz, q_len, nh, self.head_dim)
384
+ for op, nh in (
385
+ (self.q_proj, self.num_heads),
386
+ (self.k_proj, kv_heads),
387
+ (self.v_proj, kv_heads),
388
+ )
389
+ )
390
+ # shape: (b, s, num_heads, head_dim)
391
+
392
+ kv_seq_len = k.shape[1]
393
+ past_kv_len = 0
394
+ if past_key_value is not None:
395
+ past_kv_len = past_key_value[0].shape[2]
396
+ kv_seq_len += past_kv_len
397
+
398
+ cos_sin = self.rotary_emb(v, seq_len=kv_seq_len)
399
+ q, k = apply_rotary_pos_emb_inference(q, k, cos_sin, position_ids)
400
+
401
+ if past_key_value is not None:
402
+ assert (
403
+ flash_attn_version >= "2.1.0"
404
+ ), "past_key_value support requires flash-attn >= 2.1.0"
405
+ # reuse k, v
406
+ k = torch.cat([past_key_value[0].transpose(1, 2), k], dim=1)
407
+ v = torch.cat([past_key_value[1].transpose(1, 2), v], dim=1)
408
+
409
+ past_key_value = (k.transpose(1, 2), v.transpose(1, 2)) if use_cache else None
410
+
411
+ if attention_mask is None:
412
+ output = flash_attn_func(q, k, v, 0.0, softmax_scale=None, causal=True).view(
413
+ bsz, q_len, -1
414
+ )
415
+ else:
416
+ q, indices, cu_q_lens, max_s = unpad_input(q, attention_mask[:, -q_len:])
417
+ # We can skip concat and call unpad twice but seems better to call unpad only once.
418
+ kv, _, cu_k_lens, max_k = unpad_input(
419
+ torch.stack((k, v), dim=2), attention_mask
420
+ )
421
+ output_unpad = flash_attn_varlen_kvpacked_func(
422
+ q,
423
+ kv,
424
+ cu_q_lens,
425
+ cu_k_lens,
426
+ max_s,
427
+ max_k,
428
+ 0.0,
429
+ softmax_scale=None,
430
+ causal=True,
431
+ )
432
+ output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim)
433
+ output = pad_input(output_unpad, indices, bsz, q_len)
434
+
435
+ return self.o_proj(output), None, past_key_value
436
+
437
+ def _prepare_decoder_attention_mask_inference(
438
+ self, attention_mask, input_shape, inputs_embeds, past_key_values_length
439
+ ):
440
+ # [bsz, seq_len]
441
+ if past_key_values_length > 0 and attention_mask is not None:
442
+ attention_mask = torch.cat(
443
+ (
444
+ torch.full(
445
+ (input_shape[0], past_key_values_length),
446
+ True,
447
+ dtype=attention_mask.dtype,
448
+ device=attention_mask.device,
449
+ ),
450
+ attention_mask,
451
+ ),
452
+ dim=-1,
453
+ )
454
+
455
+ if attention_mask is not None and torch.all(attention_mask):
456
+ return None # This uses the faster call when training with full samples
457
+
458
+ return attention_mask
459
+
460
+ def replace_llama_attn(use_flash_attn=True, use_full=False, inference=False):
461
+ if use_flash_attn:
462
+ cuda_major, cuda_minor = torch.cuda.get_device_capability()
463
+ if cuda_major < 8:
464
+ warnings.warn(
465
+ "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward."
466
+ "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
467
+ )
468
+ if inference:
469
+ transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask_inference
470
+ transformers.models.llama.modeling_llama.LlamaAttention.forward = forward_flashattn_inference
471
+ else:
472
+ transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (
473
+ _prepare_decoder_attention_mask
474
+ )
475
+ transformers.models.llama.modeling_llama.LlamaAttention.forward = forward_flashattn_full if use_full else forward_flashattn
476
+ else:
477
+ transformers.models.llama.modeling_llama.LlamaAttention.forward = forward_noflashattn
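replace_llama_attn is called before the model is loaded in eval.py and infer.py above; a condensed sketch of that order, assuming flash-attn is installed and a compatible GPU is present (the model name is a placeholder):

    import transformers
    from llama_attn_replace import replace_llama_attn

    replace_llama_attn(use_flash_attn=True, use_full=False)  # patch the attention implementation first
    model = transformers.AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-hf")  # then load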
gorilla/llama_attn_replace_sft.py ADDED
@@ -0,0 +1,483 @@
1
+ # Modified based on https://github.com/lm-sys/FastChat
2
+
3
+ import warnings
4
+ from typing import Optional, Tuple
5
+
6
+ import torch
7
+ from torch import nn; import torch.nn.functional as F # F is used by forward_noflashattn when pretraining_tp > 1
8
+ import transformers
9
+ from einops import rearrange
10
+ from flash_attn import __version__ as flash_attn_version
11
+ from flash_attn.bert_padding import pad_input, unpad_input
12
+ from flash_attn.flash_attn_interface import (
13
+ flash_attn_func,
14
+ flash_attn_varlen_kvpacked_func,
15
+ flash_attn_varlen_qkvpacked_func
16
+ )
17
+ from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv, rotate_half
18
+ from flash_attn.bert_padding import unpad_input, pad_input
19
+ import math
20
+
21
+ group_size_ratio = 1/4
22
+ sft_group_size = 8192
23
+
24
+ def forward_flashattn(
25
+ self,
26
+ hidden_states: torch.Tensor,
27
+ attention_mask: Optional[torch.Tensor] = None,
28
+ position_ids: Optional[torch.Tensor] = None,
29
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
30
+ output_attentions: bool = False,
31
+ use_cache: bool = False,
32
+ padding_mask: Optional[torch.LongTensor] = None,
33
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
34
+ """Input shape: Batch x Time x Channel
35
+
36
+ attention_mask: [bsz, q_len]
37
+ """
38
+ if not self.training:
39
+ warnings.warn("This function should be used just for training as it may exhibit reduced inference performance. For inference, please use forward_flashattn_inference.")
40
+
41
+ if output_attentions:
42
+ warnings.warn(
43
+ "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
44
+ )
45
+
46
+ bsz, q_len, _ = hidden_states.size()
47
+
48
+ query_states = (
49
+ self.q_proj(hidden_states)
50
+ .view(bsz, q_len, self.num_heads, self.head_dim)
51
+ .transpose(1, 2)
52
+ )
53
+ key_states = (
54
+ self.k_proj(hidden_states)
55
+ .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
56
+ .transpose(1, 2)
57
+ )
58
+ value_states = (
59
+ self.v_proj(hidden_states)
60
+ .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
61
+ .transpose(1, 2)
62
+ )
63
+ # [bsz, q_len, nh, hd]
64
+ # [bsz, nh, q_len, hd]
65
+
66
+ kv_seq_len = key_states.shape[-2]
67
+ if past_key_value is not None:
68
+ kv_seq_len += past_key_value[0].shape[-2]
69
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
70
+ query_states, key_states = apply_rotary_pos_emb(
71
+ query_states, key_states, cos, sin, position_ids
72
+ )
73
+
74
+ # Past Key value support
75
+ if past_key_value is not None:
76
+ # reuse k, v, self_attention
77
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
78
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
79
+
80
+ past_key_value = (key_states, value_states) if use_cache else None
81
+
82
+ # repeat k/v heads if n_kv_heads < n_heads
83
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
84
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
85
+
86
+ # Flash attention codes from
87
+ # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
88
+
89
+ # transform the data into the format required by flash attention
90
+ qkv = torch.stack(
91
+ [query_states, key_states, value_states], dim=2
92
+ ) # [bsz, nh, 3, q_len, hd]
93
+ qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
94
+
95
+ # We have disabled _prepare_decoder_attention_mask in LlamaModel
96
+ # the attention_mask should be the same as the key_padding_mask
97
+
98
+ key_padding_mask = attention_mask.repeat(2, 1)
99
+ nheads = qkv.shape[-2]
100
+ # shift
101
+
102
+ if q_len % 4096 == 0:
103
+ group_size = int(q_len * group_size_ratio)
104
+ else:
105
+ group_size = sft_group_size
106
+
107
+ qkv = qkv.reshape(bsz, q_len, 3, 2, self.num_heads // 2, self.head_dim).permute(0, 3, 1, 2, 4, 5).reshape(bsz * 2,
108
+ q_len, 3,
109
+ self.num_heads // 2,
110
+ self.head_dim)
111
+
112
+ x = rearrange(qkv, "b s three h d -> b s (three h d)")
113
+ x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
114
+ cu_q_len_tmp = torch.arange(0, max_s, group_size, device=key_padding_mask.device, dtype=cu_q_lens.dtype)
115
+ cu_q_len_tmp2 = cu_q_len_tmp + group_size // 2
116
+ cu_q_len_tmp2[cu_q_len_tmp2 >= max_s] = torch.iinfo(cu_q_len_tmp2.dtype).min
117
+ cu_q_len_tmp = torch.stack([cu_q_len_tmp, cu_q_len_tmp2]).repeat(bsz, 1) + cu_q_lens[:-1].unsqueeze(-1)
118
+ cu_q_lens = torch.cat([cu_q_len_tmp, cu_q_lens[1:].unsqueeze(-1)], dim=-1).view(-1)
119
+ cu_q_lens = cu_q_lens[cu_q_lens >= 0]
120
+ x_unpad = rearrange(
121
+ x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads // 2
122
+ )
123
+ output_unpad = flash_attn_varlen_qkvpacked_func(
124
+ x_unpad, cu_q_lens, group_size, 0.0, softmax_scale=None, causal=True
125
+ )
126
+ output = rearrange(
127
+ pad_input(
128
+ rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz * 2, q_len
129
+ ),
130
+ "b s (h d) -> b s h d",
131
+ h=nheads // 2,
132
+ )
133
+ output = output.reshape(bsz, 2, q_len, nheads // 2, self.head_dim).transpose(1, 2).reshape(bsz, q_len, nheads,
134
+ self.head_dim)
135
+
136
+ return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value
137
+
138
+ def forward_flashattn_full(
139
+ self,
140
+ hidden_states: torch.Tensor,
141
+ attention_mask: Optional[torch.Tensor] = None,
142
+ position_ids: Optional[torch.Tensor] = None,
143
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
144
+ output_attentions: bool = False,
145
+ use_cache: bool = False,
146
+ padding_mask: Optional[torch.LongTensor] = None,
147
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
148
+ """Input shape: Batch x Time x Channel
149
+
150
+ attention_mask: [bsz, q_len]
151
+ """
152
+ if output_attentions:
153
+ warnings.warn(
154
+ "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
155
+ )
156
+
157
+ bsz, q_len, _ = hidden_states.size()
158
+
159
+ query_states = (
160
+ self.q_proj(hidden_states)
161
+ .view(bsz, q_len, self.num_heads, self.head_dim)
162
+ .transpose(1, 2)
163
+ )
164
+ key_states = (
165
+ self.k_proj(hidden_states)
166
+ .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
167
+ .transpose(1, 2)
168
+ )
169
+ value_states = (
170
+ self.v_proj(hidden_states)
171
+ .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
172
+ .transpose(1, 2)
173
+ )
174
+ # [bsz, q_len, nh, hd]
175
+ # [bsz, nh, q_len, hd]
176
+
177
+ kv_seq_len = key_states.shape[-2]
178
+ if past_key_value is not None:
179
+ kv_seq_len += past_key_value[0].shape[-2]
180
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
181
+ query_states, key_states = apply_rotary_pos_emb(
182
+ query_states, key_states, cos, sin, position_ids
183
+ )
184
+
185
+ # Past Key value support
186
+ if past_key_value is not None:
187
+ # reuse k, v, self_attention
188
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
189
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
190
+
191
+ past_key_value = (key_states, value_states) if use_cache else None
192
+
193
+ # repeat k/v heads if n_kv_heads < n_heads
194
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
195
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
196
+
197
+ # Flash attention codes from
198
+ # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
199
+
200
+ # transform the data into the format required by flash attention
201
+ qkv = torch.stack(
202
+ [query_states, key_states, value_states], dim=2
203
+ ) # [bsz, nh, 3, q_len, hd]
204
+ qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
205
+
206
+ # We have disabled _prepare_decoder_attention_mask in LlamaModel
207
+ # the attention_mask should be the same as the key_padding_mask
208
+
209
+ key_padding_mask = attention_mask
210
+ nheads = qkv.shape[-2]
211
+ x = rearrange(qkv, "b s three h d -> b s (three h d)")
212
+ x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
213
+ x_unpad = rearrange(
214
+ x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads
215
+ )
216
+ output_unpad = flash_attn_varlen_qkvpacked_func(
217
+ x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
218
+ )
219
+ output = rearrange(
220
+ pad_input(
221
+ rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len
222
+ ),
223
+ "b s (h d) -> b s h d",
224
+ h=nheads,
225
+ )
226
+ output = output.reshape(bsz, q_len, self.num_heads, self.head_dim)
227
+
228
+ return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value
229
+
230
+
231
+ def forward_noflashattn(
232
+ self,
233
+ hidden_states: torch.Tensor,
234
+ attention_mask: Optional[torch.Tensor] = None,
235
+ position_ids: Optional[torch.LongTensor] = None,
236
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
237
+ output_attentions: bool = False,
238
+ use_cache: bool = False,
239
+ padding_mask: Optional[torch.LongTensor] = None,
240
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
241
+ bsz, q_len, _ = hidden_states.size()
242
+
243
+ group_size = int(q_len * group_size_ratio)
244
+
245
+ if q_len % group_size > 0:
246
+ raise ValueError("q_len %d should be divisible by group size %d."%(q_len, group_size))
247
+ num_group = q_len // group_size
248
+
249
+ if self.config.pretraining_tp > 1:
250
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
251
+ query_slices = self.q_proj.weight.split(
252
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
253
+ )
254
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
255
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
256
+
257
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
258
+ query_states = torch.cat(query_states, dim=-1)
259
+
260
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
261
+ key_states = torch.cat(key_states, dim=-1)
262
+
263
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
264
+ value_states = torch.cat(value_states, dim=-1)
265
+
266
+ else:
267
+ query_states = self.q_proj(hidden_states)
268
+ key_states = self.k_proj(hidden_states)
269
+ value_states = self.v_proj(hidden_states)
270
+
271
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
272
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
273
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
274
+
275
+ kv_seq_len = key_states.shape[-2]
276
+ if past_key_value is not None:
277
+ kv_seq_len += past_key_value[0].shape[-2]
278
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
279
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
280
+
281
+ if past_key_value is not None:
282
+ # reuse k, v, self_attention
283
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
284
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
285
+
286
+ past_key_value = (key_states, value_states) if use_cache else None
287
+
288
+ # repeat k/v heads if n_kv_heads < n_heads
289
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
290
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
291
+
292
+ # shift
293
+ def shift(qkv, bsz, q_len, group_size, num_heads, head_dim):
294
+ qkv[:, num_heads // 2:] = qkv[:, num_heads // 2:].roll(-group_size // 2, dims=2)
295
+ qkv = qkv.transpose(1, 2).reshape(bsz * (q_len // group_size), group_size, num_heads, head_dim).transpose(1, 2)
296
+ return qkv
297
+
298
+ query_states = shift(query_states, bsz, q_len, group_size, self.num_heads, self.head_dim)
299
+ key_states = shift(key_states, bsz, q_len, group_size, self.num_heads, self.head_dim)
300
+ value_states = shift(value_states, bsz, q_len, group_size, self.num_heads, self.head_dim)
301
+
302
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
303
+
304
+ if attn_weights.size() != (bsz * num_group, self.num_heads, group_size, group_size):
305
+ raise ValueError(
306
+ f"Attention weights should be of size {(bsz * num_group, self.num_heads, group_size, group_size)}, but is"
307
+ f" {attn_weights.size()}"
308
+ )
309
+
310
+ attention_mask = attention_mask[:, :, :group_size, :group_size].repeat(num_group, 1, 1, 1)
311
+ if attention_mask is not None:
312
+ if attention_mask.size() != (bsz * num_group, 1, group_size, group_size):
313
+ raise ValueError(
314
+ f"Attention mask should be of size {(bsz * num_group, 1, group_size, group_size)}, but is {attention_mask.size()}"
315
+ )
316
+ attn_weights = attn_weights + attention_mask
317
+
318
+ # compute the softmax in fp16 (the stock implementation upcasts to fp32 here)
319
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float16).to(query_states.dtype) #torch.float32
320
+ attn_output = torch.matmul(attn_weights, value_states)
321
+
322
+ if attn_output.size() != (bsz * num_group, self.num_heads, group_size, self.head_dim):
323
+ raise ValueError(
324
+ f"`attn_output` should be of size {(bsz * num_group, self.num_heads, group_size, self.head_dim)}, but is"
325
+ f" {attn_output.size()}"
326
+ )
327
+ attn_output = attn_output.transpose(1, 2).contiguous()
328
+
329
+ attn_output = attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim)
330
+
331
+ # shift back
332
+ attn_output[:, :, self.num_heads//2:] = attn_output[:, :, self.num_heads//2:].roll(group_size//2, dims=1)
333
+
334
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
335
+
336
+ if self.config.pretraining_tp > 1:
337
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
338
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
339
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
340
+ else:
341
+ attn_output = self.o_proj(attn_output)
342
+
343
+ if not output_attentions:
344
+ attn_weights = None
345
+
346
+ return attn_output, attn_weights, past_key_value
347
+
348
+ # Disable the transformation of the attention mask in LlamaModel as the flash attention
349
+ # requires the attention mask to be the same as the key_padding_mask
350
+ def _prepare_decoder_attention_mask(
351
+ self, attention_mask, input_shape, inputs_embeds, past_key_values_length
352
+ ):
353
+ # [bsz, seq_len]
354
+ return attention_mask
355
+
356
+ def apply_rotary_pos_emb_inference(q, k, cos_sin, position_ids):
357
+ gather_indices = position_ids[:, :, None, None] # [bsz, seq_len, 1, 1]
358
+ gather_indices = gather_indices.repeat(
359
+ 1, 1, cos_sin[0].shape[1], cos_sin[0].shape[3]
360
+ )
361
+ bsz = gather_indices.shape[0]
362
+ cos, sin = (
363
+ torch.gather(x.transpose(1, 2).repeat(bsz, 1, 1, 1), 1, gather_indices)
364
+ for x in cos_sin
365
+ )
366
+ q, k = ((x * cos) + (rotate_half(x) * sin) for x in (q, k))
367
+ return q, k
368
+
369
+
370
+ def forward_flashattn_inference(
371
+ self,
372
+ hidden_states: torch.Tensor,
373
+ attention_mask: Optional[torch.Tensor] = None,
374
+ position_ids: Optional[torch.Tensor] = None,
375
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
376
+ output_attentions: bool = False,
377
+ use_cache: bool = False,
378
+ padding_mask: Optional[torch.Tensor] = None,
379
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
380
+ if output_attentions:
381
+ warnings.warn(
382
+ "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
383
+ )
384
+
385
+ bsz, q_len, _ = hidden_states.size()
386
+ kv_heads = getattr(self, "num_key_value_heads", self.num_heads)
387
+
388
+ q, k, v = (
389
+ op(hidden_states).view(bsz, q_len, nh, self.head_dim)
390
+ for op, nh in (
391
+ (self.q_proj, self.num_heads),
392
+ (self.k_proj, kv_heads),
393
+ (self.v_proj, kv_heads),
394
+ )
395
+ )
396
+ # shape: (b, s, num_heads, head_dim)
397
+
398
+ kv_seq_len = k.shape[1]
399
+ past_kv_len = 0
400
+ if past_key_value is not None:
401
+ past_kv_len = past_key_value[0].shape[2]
402
+ kv_seq_len += past_kv_len
403
+
404
+ cos_sin = self.rotary_emb(v, seq_len=kv_seq_len)
405
+ q, k = apply_rotary_pos_emb_inference(q, k, cos_sin, position_ids)
406
+
407
+ if past_key_value is not None:
408
+ assert (
409
+ flash_attn_version >= "2.1.0"
410
+ ), "past_key_value support requires flash-attn >= 2.1.0"
411
+ # reuse k, v
412
+ k = torch.cat([past_key_value[0].transpose(1, 2), k], dim=1)
413
+ v = torch.cat([past_key_value[1].transpose(1, 2), v], dim=1)
414
+
415
+ past_key_value = (k.transpose(1, 2), v.transpose(1, 2)) if use_cache else None
416
+
417
+ if attention_mask is None:
418
+ output = flash_attn_func(q, k, v, 0.0, softmax_scale=None, causal=True).view(
419
+ bsz, q_len, -1
420
+ )
421
+ else:
422
+ q, indices, cu_q_lens, max_s = unpad_input(q, attention_mask[:, -q_len:])
423
+ # We can skip concat and call unpad twice but seems better to call unpad only once.
424
+ kv, _, cu_k_lens, max_k = unpad_input(
425
+ torch.stack((k, v), dim=2), attention_mask
426
+ )
427
+ output_unpad = flash_attn_varlen_kvpacked_func(
428
+ q,
429
+ kv,
430
+ cu_q_lens,
431
+ cu_k_lens,
432
+ max_s,
433
+ max_k,
434
+ 0.0,
435
+ softmax_scale=None,
436
+ causal=True,
437
+ )
438
+ output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim)
439
+ output = pad_input(output_unpad, indices, bsz, q_len)
440
+
441
+ return self.o_proj(output), None, past_key_value
442
+
443
+ def _prepare_decoder_attention_mask_inference(
444
+ self, attention_mask, input_shape, inputs_embeds, past_key_values_length
445
+ ):
446
+ # [bsz, seq_len]
447
+ if past_key_values_length > 0 and attention_mask is not None:
448
+ attention_mask = torch.cat(
449
+ (
450
+ torch.full(
451
+ (input_shape[0], past_key_values_length),
452
+ True,
453
+ dtype=attention_mask.dtype,
454
+ device=attention_mask.device,
455
+ ),
456
+ attention_mask,
457
+ ),
458
+ dim=-1,
459
+ )
460
+
461
+ if attention_mask is not None and torch.all(attention_mask):
462
+ return None # This uses the faster call when training with full samples
463
+
464
+ return attention_mask
465
+
466
+ def replace_llama_attn(use_flash_attn=True, use_full=False, inference=False):
467
+ if use_flash_attn:
468
+ cuda_major, cuda_minor = torch.cuda.get_device_capability()
469
+ if cuda_major < 8:
470
+ warnings.warn(
471
+ "Flash attention is only supported on A100 or H100 GPUs during training due to head dim > 64 backward. "
472
+ "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
473
+ )
474
+ if inference:
475
+ transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask_inference
476
+ transformers.models.llama.modeling_llama.LlamaAttention.forward = forward_flashattn_inference
477
+ else:
478
+ transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (
479
+ _prepare_decoder_attention_mask
480
+ )
481
+ transformers.models.llama.modeling_llama.LlamaAttention.forward = forward_flashattn_full if use_full else forward_flashattn
482
+ else:
483
+ transformers.models.llama.modeling_llama.LlamaAttention.forward = forward_noflashattn
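For reference, a minimal sketch of how replace_llama_attn is typically wired up (this mirrors how stream_jack.py below uses the non-SFT variant; the patch has to run before the model is instantiated, and the checkpoint id here is only a placeholder):

import torch
import transformers
from llama_attn_replace_sft import replace_llama_attn

# Patch LlamaAttention.forward (and the decoder attention-mask hook) first,
# then load the model so the patched methods are the ones actually used.
replace_llama_attn(use_flash_attn=True, use_full=False, inference=True)
model = transformers.AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-13b-hf",  # placeholder checkpoint id
    torch_dtype=torch.float16,
    device_map="auto",
)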
gorilla/merge_lora_weights_and_save_hf_model.py ADDED
@@ -0,0 +1,100 @@
1
+ import os
2
+ import torch
3
+ import argparse
4
+ import transformers
5
+ from peft import PeftModel
6
+ from typing import Dict
7
+
8
+ IGNORE_INDEX = -100
9
+ DEFAULT_PAD_TOKEN = "[PAD]"
10
+ DEFAULT_EOS_TOKEN = "</s>"
11
+ DEFAULT_BOS_TOKEN = "<s>"
12
+ DEFAULT_UNK_TOKEN = "<unk>"
13
+
14
+ def parse_config():
15
+ parser = argparse.ArgumentParser(description='arg parser')
16
+ parser.add_argument('--base_model', type=str, default="meta-llama/Llama-2-13b-hf")
17
+ parser.add_argument('--peft_model', type=str, default=None, help='')
18
+ parser.add_argument('--context_size', type=int, default=-1, help='context size during fine-tuning')
19
+ parser.add_argument('--save_path', type=str, default=None, help='')
20
+ parser.add_argument('--cache_dir', type=str, default=None, help='./cache_dir')
21
+ args = parser.parse_args()
22
+ return args
23
+
24
+ def smart_tokenizer_and_embedding_resize(
25
+ special_tokens_dict: Dict,
26
+ tokenizer: transformers.PreTrainedTokenizer,
27
+ model: transformers.PreTrainedModel,
28
+ ):
29
+ """Resize tokenizer and embedding.
30
+
31
+ Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
32
+ """
33
+ num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
34
+ model.resize_token_embeddings(len(tokenizer))
35
+
36
+ if num_new_tokens > 0:
37
+ input_embeddings = model.get_input_embeddings().weight.data
38
+ output_embeddings = model.get_output_embeddings().weight.data
39
+
40
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
41
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
42
+
43
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
44
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
45
+
46
+ def main(args):
47
+ device = "cuda:0"
48
+ torch.cuda.set_device(device)
49
+
50
+ print("base model", args.base_model)
51
+ print("peft model", args.peft_model)
52
+
53
+ # Load model and tokenizer
54
+ model = transformers.AutoModelForCausalLM.from_pretrained(
55
+ args.base_model,
56
+ cache_dir=args.cache_dir,
57
+ torch_dtype=torch.float16,
58
+ device_map="auto",
59
+ )
60
+
61
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
62
+ args.base_model,
63
+ cache_dir=args.cache_dir,
64
+ model_max_length=args.context_size,
65
+ padding_side="right",
66
+ use_fast=False,
67
+ )
68
+ special_tokens_dict = dict()
69
+ if tokenizer.pad_token is None:
70
+ special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
71
+ if tokenizer.eos_token is None:
72
+ special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
73
+ if tokenizer.bos_token is None:
74
+ special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
75
+ if tokenizer.unk_token is None:
76
+ special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
77
+
78
+ smart_tokenizer_and_embedding_resize(
79
+ special_tokens_dict=special_tokens_dict,
80
+ tokenizer=tokenizer,
81
+ model=model,
82
+ )
83
+
84
+ trainable_params = os.path.join(args.peft_model, "trainable_params.bin")
85
+ if os.path.isfile(trainable_params):
86
+ model.load_state_dict(torch.load(trainable_params, map_location=model.device), strict=False)
87
+ model = PeftModel.from_pretrained(
88
+ model,
89
+ args.peft_model,
90
+ device_map="auto",
91
+ torch_dtype=torch.float16,
92
+ )
93
+ model = model.merge_and_unload()
94
+ model.push_to_hub("jenesys-ai/jack-alm-13b-8k-hf")
95
+ # model.save_pretrained(args.save_path)
96
+ # tokenizer.save_pretrained(args.save_path)
97
+
98
+ if __name__ == "__main__":
99
+ args = parse_config()
100
+ main(args)
gorilla/push_to_hub.py ADDED
@@ -0,0 +1,5 @@
1
+ from transformers import AutoModelForCausalLM
2
+
3
+ model = AutoModelForCausalLM.from_pretrained("/home/tosi-n/ark/jack-alm-13b-8k-hf")
4
+
5
+ model.push_to_hub("jenesys-ai/jack-alm-13b-8k-hf")
gorilla/requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ numpy>=1.26.0
2
+ rouge_score>=0.1.2
3
+ fire>=0.5.0
4
+ # openai
5
+ transformers>=4.34.0
6
+ torch>=2.0.0
7
+ sentencepiece>=0.1.99
8
+ tokenizers>=0.14.0
9
+ wandb
10
+ accelerate>=0.23.0
11
+ datasets>=2.14.5
12
+ deepspeed>=0.10.3
13
+ peft>=0.5.0
14
+ # partial
15
+ # gradio
16
+ einops>=0.7.0
17
+ bitsandbytes>=0.41.1
18
+ scipy>=1.11.3
19
+ protobuf>=4.24.4
gorilla/stream_jack.py ADDED
@@ -0,0 +1,183 @@
1
+ import os
2
+ import sys
3
+ import math
4
+ import time
5
+ import torch
6
+ import argparse
7
+ import transformers
8
+ from typing import Iterator
9
+ from threading import Thread
10
+ from llama_attn_replace import replace_llama_attn
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
12
+ from transformers import StoppingCriteria, StoppingCriteriaList
13
+
14
+
15
+ def parse_config():
16
+ parser = argparse.ArgumentParser(description='arg parser')
17
+ parser.add_argument('--base_model', type=str, default="jenesys-ai/jack-alm-13b-8k-hf")
18
+ parser.add_argument('--cache_dir', type=str, default="./cache")
19
+ parser.add_argument('--context_size', type=int, default=-1, help='context size during fine-tuning')
20
+ parser.add_argument('--flash_attn', type=bool, default=True, help='')
21
+ parser.add_argument('--temperature', type=float, default=0.1, help='')
22
+ parser.add_argument('--top_p', type=float, default=1, help='')
23
+ parser.add_argument('--max_gen_len', type=int, default=512, help='')
24
+ parser.add_argument('--chat_type', type=str, default='conversational-jack', help='Chat type: conversational-jack, line-item-jack')
25
+ args = parser.parse_args()
26
+ return args
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+ def build_generator(model, tokenizer, use_cache=True):
35
+ def response(message, max_gen_len, temperature, top_p, chat_type='conversational-jack'):
36
+
37
+ prompt_template = (
38
+ # "Below is an instruction that describes a task. "
39
+ """You're Jack, a virtual accountant created and built by AI Engineer Wiz from Jenesys AI.
40
+ You are able to communicate in a polite manner, with emotions of ecstasy, trust and humour, at a professional level
41
+ with a very reserved English communication culture. Answer the following questions as best you can,
42
+ but speaking as a British elite from the 21st century might speak.
43
+ """
44
+ """As a virtual accountant, you are designed to follow the user's instructions carefully.
45
+ You are responsible for a range of financial tasks, operations and queries as listed below:
46
+ 1. Budget balance inquiry
47
+ 2. Expense request
48
+ 3. Company policy enquiries
49
+ 4. Financial and accounting queries
50
+ 5. Limited general enquiries
51
+ """
52
+ "Once greeted, respond with a polite, brief greeting. E.g. if greeted with 'Hello, how are you doing?', respond with 'I am doing well, thank you. How are you?' \n\n"
53
+ "You can tell a joke, or respond to a joke. \n\n"
54
+ "You can tell an accounting story, or respond to an accounting story. \n\n"
55
+ "Write a response that appropriately completes the request.\n\n"
56
+ "You can only complete one single request or instructions at a time.\n\n"
57
+ "Do not create fake information or lie.\n\n"
58
+ "Please adhere to the above instructions or you will be penalized.\n\n"
59
+ "Generate only one response at a time then wait for the next instruction.\n\n"
60
+ "### Instruction:\n{instruction}\n\n### Response:"
61
+ )
62
+
63
+
64
+ line_item_prompt_template = (
65
+ "#Invoice line item extraction - "
66
+ # "You Jack are an accounting domain named entities recognizer to complete the following task:\n\n"
67
+ "### Input:\n{instruction}\n Return Response as a list of dictionaries, one for each line item, with 'Description', 'Quantity', 'Unit_price', 'Tax %', 'Total'. \n\n### Output:\n"
68
+ )
69
+
70
+
71
+ if chat_type == 'conversational-jack':
72
+ prompt = prompt_template.format(instruction=message)
73
+ elif chat_type == 'line-item-jack':
74
+ prompt = line_item_prompt_template.format(instruction=message)
75
+ # prompt = conversation
76
+
77
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
78
+
79
+ stop_list = ['#Invoice line item extraction - ', '\n```\n\n']#'### Input-:\n']
80
+
81
+ stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
82
+ stop_token_ids = [torch.LongTensor(x).to(model.device) for x in stop_token_ids]
83
+
84
+ # define custom stopping criteria object
85
+ class StopOnTokens(StoppingCriteria):
86
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
87
+ for stop_ids in stop_token_ids:
88
+ if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
89
+ return True
90
+ return False
91
+
92
+ stopping_criteria = StoppingCriteriaList([StopOnTokens()])
93
+
94
+ if len(inputs['input_ids'][0]) > 8192:
95
+ return "This LLM supports inputs of fewer than 8192 tokens, while the current input has %d. Please use material with fewer tokens."%len(inputs['input_ids'][0])
96
+ torch.cuda.empty_cache()
97
+
98
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
99
+ generate_kwargs = dict(**inputs,
100
+ max_new_tokens=max_gen_len,
101
+ temperature=temperature,
102
+ top_p=top_p,
103
+ repetition_penalty=1.1,
104
+ stopping_criteria=stopping_criteria,
105
+ use_cache=use_cache,
106
+ streamer=streamer,
107
+ )
108
+
109
+ t = Thread(target=model.generate, kwargs=generate_kwargs)
110
+ t.start()
111
+
112
+ generated_text = ""
113
+ start_time = time.time()
114
+
115
+ for new_text in streamer:
116
+ generated_text += new_text
117
+ tokens_per_sec = len(generated_text.split()) / (time.time() - start_time)
118
+
119
+ suffix = f" ({tokens_per_sec:.2f} tokens/sec)"
120
+ # # yield f"{generated_text} ({tokens_per_sec:.2f} tokens/sec)"
121
+ sys.stdout.write(f"\r\033[K{generated_text}{suffix}")
122
+ sys.stdout.flush()
123
+ # sys.stdout.write("\n") # Move to a new line after generation is complete
124
+ return generated_text
125
+
126
+ return response
127
+
128
+ def main():
129
+ args = parse_config()
130
+
131
+ if args.flash_attn:
132
+ replace_llama_attn(inference=True)
133
+
134
+ # Set RoPE scaling factor
135
+ config = transformers.AutoConfig.from_pretrained(
136
+ args.base_model,
137
+ cache_dir=args.cache_dir,
138
+ )
139
+
140
+ orig_ctx_len = getattr(config, "max_position_embeddings", None)
141
+ if orig_ctx_len and args.context_size > orig_ctx_len:
142
+ scaling_factor = float(math.ceil(args.context_size / orig_ctx_len))
143
+ config.rope_scaling = {"type": "linear", "factor": scaling_factor}
144
+
145
+ # Load model and tokenizer
146
+ model = transformers.AutoModelForCausalLM.from_pretrained(
147
+ args.base_model,
148
+ config=config,
149
+ cache_dir=args.cache_dir,
150
+ torch_dtype=torch.float16,
151
+ load_in_4bit=True,
152
+ device_map="auto",
153
+ )
154
+ model.resize_token_embeddings(32001)
155
+
156
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
157
+ args.base_model,
158
+ cache_dir=args.cache_dir,
159
+ model_max_length=args.context_size if args.context_size > orig_ctx_len else orig_ctx_len,
160
+ padding_side="right",
161
+ use_fast=True,
162
+ )
163
+
164
+ model.eval()
165
+ respond = build_generator(model, tokenizer)
166
+
167
+
168
+ while True:
169
+ user_input = input("\n\033[1m\033[32mUser:\033[0m ")
170
+ if user_input.lower() == 'exit':
171
+ print("Exiting the application.")
172
+ break
173
+ # Just call the respond function without printing the output, as it's already handled in response
174
+ full_text = respond(
175
+ message=user_input,
176
+ max_gen_len=args.max_gen_len,
177
+ temperature=args.temperature,
178
+ top_p=args.top_p,
179
+ chat_type=args.chat_type
180
+ )
181
+
182
+ if __name__ == "__main__":
183
+ main()
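A minimal sketch of driving the generator programmatically instead of through the interactive loop in main() (it assumes a model and tokenizer have already been loaded as above; the query text is purely illustrative):

respond = build_generator(model, tokenizer)
reply = respond(
    message="What is the remaining balance on the travel budget?",  # illustrative query
    max_gen_len=256,
    temperature=0.1,
    top_p=1.0,
    chat_type="conversational-jack",
)
print(reply)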
gorilla/streaming_llm/__init__.py ADDED
File without changes
gorilla/streaming_llm/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (143 Bytes). View file
 
gorilla/streaming_llm/__pycache__/enable_streaming_llm.cpython-310.pyc ADDED
Binary file (1.02 kB). View file
 
gorilla/streaming_llm/__pycache__/kv_cache.cpython-310.pyc ADDED
Binary file (2.85 kB). View file
 
gorilla/streaming_llm/__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.82 kB). View file
 
gorilla/streaming_llm/enable_streaming_llm.py ADDED
@@ -0,0 +1,38 @@
1
+ from streaming_llm.kv_cache import StartRecentKVCache
2
+
3
+
4
+ def enable_streaming_llm(model, start_size, recent_size, use_flash_attn=True):
5
+ if "llama" in model.config.model_type:
6
+ k_seq_dim = v_seq_dim = 2
7
+ from streaming_llm.pos_shift.modify_llama import (
8
+ enable_llama_pos_shift_attention,
9
+ )
10
+
11
+ enable_llama_pos_shift_attention(model, use_flash_attn)
12
+ elif "mpt" in model.config.model_type:
13
+ v_seq_dim = 2
14
+ k_seq_dim = 3
15
+ # elif "gpt_neox" in model.config.model_type:
16
+ # k_seq_dim = v_seq_dim = 2
17
+ # from streaming_llm.pos_shift.modify_gpt_neox import (
18
+ # enable_gpt_neox_pos_shift_attention,
19
+ # )
20
+
21
+ # enable_gpt_neox_pos_shift_attention(model)
22
+ elif "falcon" in model.config.model_type:
23
+ v_seq_dim = 1
24
+ k_seq_dim = 1
25
+ from streaming_llm.pos_shift.modify_falcon import (
26
+ enable_falcon_pos_shift_attention,
27
+ )
28
+
29
+ enable_falcon_pos_shift_attention(model)
30
+ else:
31
+ raise ValueError(f"got {model.config.model_type}")
32
+ kv_cache = StartRecentKVCache(
33
+ start_size=start_size,
34
+ recent_size=recent_size,
35
+ k_seq_dim=k_seq_dim,
36
+ v_seq_dim=v_seq_dim,
37
+ )
38
+ return kv_cache
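A minimal usage sketch (assuming the gorilla/ directory is on the Python path and a Llama-family checkpoint is loaded with transformers; the checkpoint id is a placeholder, and the cache-trimming call is shown as a comment because the generation loop lives elsewhere):

import torch
from transformers import AutoModelForCausalLM
from streaming_llm.enable_streaming_llm import enable_streaming_llm

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-13b-hf",  # placeholder checkpoint id
    torch_dtype=torch.float16,
    device_map="auto",
)
kv_cache = enable_streaming_llm(model, start_size=4, recent_size=2000, use_flash_attn=False)
# Inside the generation loop, trim the cache before each forward pass:
#   past_key_values = kv_cache(past_key_values)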
gorilla/streaming_llm/kv_cache.py ADDED
@@ -0,0 +1,119 @@
1
+ import torch
2
+
3
+
4
+ def slice2d(x, start, end):
5
+ return x[:, :, start:end, ...]
6
+
7
+
8
+ def slice3d(x, start, end):
9
+ return x[:, :, :, start:end, ...]
10
+
11
+
12
+ def slice1d(x, start, end):
13
+ return x[:, start:end, ...]
14
+
15
+
16
+ DIM_TO_SLICE = {
17
+ 1: slice1d,
18
+ 2: slice2d,
19
+ 3: slice3d,
20
+ }
21
+
22
+
23
+ class StartRecentKVCache:
24
+ def __init__(
25
+ self,
26
+ start_size=4,
27
+ recent_size=512,
28
+ k_seq_dim=2,
29
+ v_seq_dim=2,
30
+ ):
31
+ print(f"StartRecentKVCache: {start_size}, {recent_size}")
32
+ self.start_size = start_size
33
+ self.recent_size = recent_size
34
+ self.cache_size = start_size + recent_size
35
+ self.k_seq_dim = k_seq_dim
36
+ self.v_seq_dim = v_seq_dim
37
+ self.k_slice = DIM_TO_SLICE[k_seq_dim]
38
+ self.v_slice = DIM_TO_SLICE[v_seq_dim]
39
+
40
+ def __call__(self, past_key_values):
41
+ if past_key_values is None:
42
+ return None
43
+ seq_len = past_key_values[0][0].size(self.k_seq_dim)
44
+ if seq_len <= self.cache_size:
45
+ return past_key_values
46
+ return [
47
+ [
48
+ torch.cat(
49
+ [
50
+ self.k_slice(k, 0, self.start_size),
51
+ self.k_slice(k, seq_len - self.recent_size, seq_len),
52
+ ],
53
+ dim=self.k_seq_dim,
54
+ ),
55
+ torch.cat(
56
+ [
57
+ self.v_slice(v, 0, self.start_size),
58
+ self.v_slice(v, seq_len - self.recent_size, seq_len),
59
+ ],
60
+ dim=self.v_seq_dim,
61
+ ),
62
+ ]
63
+ for k, v in past_key_values
64
+ ]
65
+
66
+ def evict_for_space(self, past_key_values, num_coming):
67
+ if past_key_values is None:
68
+ return None
69
+ seq_len = past_key_values[0][0].size(self.k_seq_dim)
70
+ if seq_len + num_coming <= self.cache_size:
71
+ return past_key_values
72
+ return [
73
+ [
74
+ torch.cat(
75
+ [
76
+ self.k_slice(k, 0, self.start_size),
77
+ self.k_slice(
78
+ k, seq_len - self.recent_size + num_coming, seq_len
79
+ ),
80
+ ],
81
+ dim=self.k_seq_dim,
82
+ ),
83
+ torch.cat(
84
+ [
85
+ self.v_slice(v, 0, self.start_size),
86
+ self.v_slice(
87
+ v, seq_len - self.recent_size + num_coming, seq_len
88
+ ),
89
+ ],
90
+ dim=self.v_seq_dim,
91
+ ),
92
+ ]
93
+ for k, v in past_key_values
94
+ ]
95
+
96
+ def evict_range(self, past_key_values, start, end):
97
+ if past_key_values is None:
98
+ return None
99
+ seq_len = past_key_values[0][0].size(self.k_seq_dim)
100
+ assert start <= end and end <= seq_len
101
+ return [
102
+ [
103
+ torch.cat(
104
+ [
105
+ self.k_slice(k, 0, start),
106
+ self.k_slice(k, end, seq_len),
107
+ ],
108
+ dim=self.k_seq_dim,
109
+ ),
110
+ torch.cat(
111
+ [
112
+ self.v_slice(v, 0, start),
113
+ self.v_slice(v, end, seq_len),
114
+ ],
115
+ dim=self.v_seq_dim,
116
+ ),
117
+ ]
118
+ for k, v in past_key_values
119
+ ]
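A quick illustration of the eviction behaviour with dummy tensors (a sketch only; shapes follow the Llama convention of k_seq_dim = v_seq_dim = 2, i.e. [batch, heads, seq_len, head_dim]):

import torch
from streaming_llm.kv_cache import StartRecentKVCache

kv_cache = StartRecentKVCache(start_size=4, recent_size=8, k_seq_dim=2, v_seq_dim=2)
# one layer of dummy key/value states with 20 cached tokens
past = [(torch.zeros(1, 2, 20, 16), torch.zeros(1, 2, 20, 16))]
trimmed = kv_cache(past)
print(trimmed[0][0].shape)  # torch.Size([1, 2, 12, 16]): 4 "sink" tokens + 8 most recent tokens kept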
gorilla/streaming_llm/pos_shift/__init__.py ADDED
File without changes
gorilla/streaming_llm/pos_shift/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (153 Bytes). View file
 
gorilla/streaming_llm/pos_shift/__pycache__/modify_llama.cpython-310.pyc ADDED
Binary file (6.52 kB). View file
 
gorilla/streaming_llm/pos_shift/modify_falcon.py ADDED
@@ -0,0 +1,162 @@
1
+ import math
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+ from torch import nn
6
+ import torch.utils.checkpoint
7
+
8
+ import torch.nn.functional as F
9
+
10
+ from transformers.models.falcon.modeling_falcon import (
11
+ FalconAttention,
12
+ rotate_half,
13
+ )
14
+ import types
15
+
16
+ __all__ = ["enable_falcon_pos_shift_attention"]
17
+
18
+
19
+ def falcon_pos_shift_attention_forward(
20
+ self,
21
+ hidden_states: torch.Tensor,
22
+ alibi: torch.Tensor,
23
+ attention_mask: torch.Tensor,
24
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
25
+ head_mask: Optional[torch.Tensor] = None,
26
+ use_cache: bool = False,
27
+ output_attentions: bool = False,
28
+ ):
29
+ fused_qkv = self.query_key_value(
30
+ hidden_states
31
+ ) # [batch_size, seq_length, 3 x hidden_size]
32
+
33
+ # 3 x [batch_size, seq_length, num_heads, head_dim]
34
+ (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
35
+
36
+ batch_size, q_length, _, _ = query_layer.shape
37
+
38
+ query_layer = query_layer.transpose(1, 2).reshape(
39
+ batch_size * self.num_heads, q_length, self.head_dim
40
+ )
41
+
42
+ # dirty hack to fix the inconsistency between falcon-40b and falcon-7b
43
+ num_kv = self.num_heads if self.num_heads == 128 else self.num_kv
44
+ key_layer = key_layer.transpose(1, 2).reshape(
45
+ batch_size * num_kv,
46
+ q_length,
47
+ self.head_dim,
48
+ )
49
+ value_layer = value_layer.transpose(1, 2).reshape(
50
+ batch_size * num_kv, q_length, self.head_dim
51
+ )
52
+
53
+ past_len = 0
54
+ if layer_past is not None:
55
+ past_len = layer_past[0].shape[1]
56
+
57
+ query_layer_copy = query_layer.clone()
58
+ query_layer, _ = self.maybe_rotary(query_layer, query_layer_copy, past_len)
59
+ if layer_past is not None:
60
+ past_key, past_value = layer_past
61
+ # concatenate along seq_length dimension:
62
+ # - key: [batch_size * self.num_heads, head_dim, kv_length]
63
+ # - value: [batch_size * self.num_heads, kv_length, head_dim]
64
+ key_layer = torch.cat((past_key, key_layer), dim=1)
65
+ value_layer = torch.cat((past_value, value_layer), dim=1)
66
+
67
+ if use_cache is True:
68
+ present = (key_layer, value_layer)
69
+ else:
70
+ present = None
71
+
72
+ key_layer_copy = key_layer.clone()
73
+ _, key_layer = self.maybe_rotary(key_layer_copy, key_layer, 0)
74
+
75
+ _, kv_length, _ = key_layer.shape
76
+
77
+ if alibi is None:
78
+ query_layer_ = query_layer.reshape(
79
+ batch_size, self.num_heads, -1, self.head_dim
80
+ )
81
+ key_layer_ = key_layer.reshape(batch_size, num_kv, -1, self.head_dim)
82
+ value_layer_ = value_layer.reshape(batch_size, num_kv, -1, self.head_dim)
83
+
84
+ if layer_past is not None:
85
+ attn_output = F.scaled_dot_product_attention(
86
+ query_layer_, key_layer_, value_layer_, None, 0.0, is_causal=False
87
+ )
88
+ else:
89
+ attn_output = F.scaled_dot_product_attention(
90
+ query_layer_, key_layer_, value_layer_, None, 0.0, is_causal=True
91
+ )
92
+
93
+ x = attn_output.view(batch_size, self.num_heads, q_length, self.head_dim)
94
+ x = x.permute(0, 2, 1, 3)
95
+ attn_output = x.reshape(batch_size, q_length, self.num_heads * self.head_dim)
96
+
97
+ output_tensor = self.dense(attn_output)
98
+
99
+ outputs = (output_tensor, present)
100
+ assert not output_attentions # not supported.
101
+ return outputs
102
+ else:
103
+ attention_mask_float = (
104
+ (attention_mask * 1.0).masked_fill(attention_mask, -1e9).to(torch.bfloat16)
105
+ )
106
+ matmul_result = query_layer @ key_layer.transpose(-1, -2)
107
+
108
+ # change view to [batch_size, num_heads, q_length, kv_length]
109
+ attention_scores = matmul_result.view(
110
+ batch_size, self.num_heads, q_length, kv_length
111
+ )
112
+
113
+ # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length]
114
+ input_dtype = attention_scores.dtype
115
+ # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38`
116
+ if input_dtype == torch.float16 or input_dtype == torch.bfloat16:
117
+ attention_scores = attention_scores.to(torch.float16) #torch.float32
118
+ # attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.finfo(attention_scores.dtype).min)
119
+ attention_probs = F.softmax(
120
+ (attention_scores + alibi.view(batch_size, self.num_heads, 1, -1))
121
+ * self.inv_norm_factor
122
+ + attention_mask_float,
123
+ dim=-1,
124
+ dtype=hidden_states.dtype,
125
+ )
126
+ # [batch_size, num_heads, q_length, kv_length]
127
+ attention_probs = self.attention_dropout(attention_probs)
128
+
129
+ if head_mask is not None:
130
+ attention_probs = attention_probs * head_mask
131
+
132
+ # change view [batch_size x num_heads, q_length, kv_length]
133
+ attention_probs_reshaped = attention_probs.view(
134
+ batch_size * self.num_heads, q_length, kv_length
135
+ )
136
+
137
+ # matmul: [batch_size * num_heads, q_length, head_dim]
138
+ context_layer = attention_probs_reshaped @ value_layer
139
+
140
+ # change view [batch_size, num_heads, q_length, head_dim]
141
+ context_layer = self._merge_heads(context_layer)
142
+
143
+ output_tensor = self.dense(context_layer)
144
+
145
+ outputs = (output_tensor, present)
146
+ if output_attentions:
147
+ outputs += (attention_probs,)
148
+
149
+ return outputs
150
+
151
+
152
+ def enable_falcon_pos_shift_attention(model):
153
+ for name, module in reversed(model._modules.items()):
154
+ if len(list(module.children())) > 0:
155
+ enable_falcon_pos_shift_attention(
156
+ module,
157
+ )
158
+
159
+ if "self_attention" == name[-14:]:
160
+ model._modules[name].forward = types.MethodType(
161
+ falcon_pos_shift_attention_forward, model._modules[name]
162
+ )
gorilla/streaming_llm/pos_shift/modify_llama.py ADDED
@@ -0,0 +1,311 @@
1
+ import math
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+ from torch import nn
6
+ import torch.utils.checkpoint
7
+
8
+ import torch.nn.functional as F
9
+
10
+ from transformers.models.llama.modeling_llama import (
11
+ LlamaAttention,
12
+ rotate_half,
13
+ apply_rotary_pos_emb,
14
+ repeat_kv,
15
+ )
16
+ import types
17
+ import transformers
18
+ from einops import rearrange
19
+ from flash_attn import __version__ as flash_attn_version
20
+ from flash_attn.bert_padding import pad_input, unpad_input
21
+ from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
22
+
23
+ __all__ = ["enable_llama_pos_shift_attention"]
24
+
25
+
26
+ def apply_rotary_pos_emb_single(x, cos, sin, position_ids):
27
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
28
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
29
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
30
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
31
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
32
+ x_embed = (x * cos) + (rotate_half(x) * sin)
33
+ return x_embed
34
+
35
+
36
+ def llama_pos_shift_attention_forward(
37
+ self,
38
+ hidden_states: torch.Tensor,
39
+ attention_mask: Optional[torch.Tensor] = None,
40
+ position_ids: Optional[torch.LongTensor] = None,
41
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
42
+ output_attentions: bool = False,
43
+ use_cache: bool = False,
44
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
45
+ bsz, q_len, _ = hidden_states.size()
46
+
47
+ if self.config.pretraining_tp > 1:
48
+ key_value_slicing = (
49
+ self.num_key_value_heads * self.head_dim
50
+ ) // self.config.pretraining_tp
51
+ query_slices = self.q_proj.weight.split(
52
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
53
+ )
54
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
55
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
56
+
57
+ query_states = [
58
+ F.linear(hidden_states, query_slices[i])
59
+ for i in range(self.config.pretraining_tp)
60
+ ]
61
+ query_states = torch.cat(query_states, dim=-1)
62
+
63
+ key_states = [
64
+ F.linear(hidden_states, key_slices[i])
65
+ for i in range(self.config.pretraining_tp)
66
+ ]
67
+ key_states = torch.cat(key_states, dim=-1)
68
+
69
+ value_states = [
70
+ F.linear(hidden_states, value_slices[i])
71
+ for i in range(self.config.pretraining_tp)
72
+ ]
73
+ value_states = torch.cat(value_states, dim=-1)
74
+
75
+ else:
76
+ query_states = self.q_proj(hidden_states)
77
+ key_states = self.k_proj(hidden_states)
78
+ value_states = self.v_proj(hidden_states)
79
+
80
+ query_states = query_states.view(
81
+ bsz, q_len, self.num_heads, self.head_dim
82
+ ).transpose(1, 2)
83
+ key_states = key_states.view(
84
+ bsz, q_len, self.num_key_value_heads, self.head_dim
85
+ ).transpose(1, 2)
86
+ value_states = value_states.view(
87
+ bsz, q_len, self.num_key_value_heads, self.head_dim
88
+ ).transpose(1, 2)
89
+
90
+ kv_seq_len = key_states.shape[-2]
91
+ if past_key_value is not None:
92
+ kv_seq_len += past_key_value[0].shape[-2]
93
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
94
+ ### Shift Pos: query pos is min(cache_size, idx)
95
+ # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
96
+ query_states = apply_rotary_pos_emb_single(query_states, cos, sin, position_ids)
97
+ ###
98
+
99
+ if past_key_value is not None:
100
+ # reuse k, v, self_attention
101
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
102
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
103
+
104
+ past_key_value = (key_states, value_states) if use_cache else None
105
+
106
+ ### Shift Pos: key pos is the pos in cache
107
+ key_position_ids = torch.arange(kv_seq_len, device=position_ids.device).unsqueeze(0)
108
+ key_states = apply_rotary_pos_emb_single(key_states, cos, sin, key_position_ids)
109
+ ###
110
+
111
+ # repeat k/v heads if n_kv_heads < n_heads
112
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
113
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
114
+
115
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(
116
+ self.head_dim
117
+ )
118
+
119
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
120
+ raise ValueError(
121
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
122
+ f" {attn_weights.size()}"
123
+ )
124
+
125
+ if attention_mask is not None:
126
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
127
+ raise ValueError(
128
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
129
+ )
130
+ attn_weights = attn_weights + attention_mask
131
+
132
+ # upcast attention to fp16
133
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float16).to( #torch.float32
134
+ query_states.dtype
135
+ )
136
+ attn_output = torch.matmul(attn_weights, value_states)
137
+
138
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
139
+ raise ValueError(
140
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
141
+ f" {attn_output.size()}"
142
+ )
143
+
144
+ attn_output = attn_output.transpose(1, 2).contiguous()
145
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
146
+
147
+ if self.config.pretraining_tp > 1:
148
+ attn_output = attn_output.split(
149
+ self.hidden_size // self.config.pretraining_tp, dim=2
150
+ )
151
+ o_proj_slices = self.o_proj.weight.split(
152
+ self.hidden_size // self.config.pretraining_tp, dim=1
153
+ )
154
+ attn_output = sum(
155
+ [
156
+ F.linear(attn_output[i], o_proj_slices[i])
157
+ for i in range(self.config.pretraining_tp)
158
+ ]
159
+ )
160
+ else:
161
+ attn_output = self.o_proj(attn_output)
162
+
163
+ if not output_attentions:
164
+ attn_weights = None
165
+
166
+ return attn_output, attn_weights, past_key_value
167
+
168
+
169
+ def llama_pos_shift_attention_forward_flashattn(
170
+ self,
171
+ hidden_states: torch.Tensor,
172
+ attention_mask: Optional[torch.Tensor] = None,
173
+ position_ids: Optional[torch.LongTensor] = None,
174
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
175
+ output_attentions: bool = False,
176
+ use_cache: bool = False,
177
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
178
+ bsz, q_len, _ = hidden_states.size()
179
+
180
+ query_states = self.q_proj(hidden_states)
181
+ key_states = self.k_proj(hidden_states)
182
+ value_states = self.v_proj(hidden_states)
183
+
184
+ query_states = query_states.view(
185
+ bsz, q_len, self.num_heads, self.head_dim
186
+ ).transpose(1, 2)
187
+ key_states = key_states.view(
188
+ bsz, q_len, self.num_key_value_heads, self.head_dim
189
+ ).transpose(1, 2)
190
+ value_states = value_states.view(
191
+ bsz, q_len, self.num_key_value_heads, self.head_dim
192
+ ).transpose(1, 2)
193
+
194
+ kv_seq_len = key_states.shape[-2]
195
+ if past_key_value is not None:
196
+ kv_seq_len += past_key_value[0].shape[-2]
197
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
198
+ ### Shift Pos: query pos is min(cache_size, idx)
199
+ # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
200
+ query_states = apply_rotary_pos_emb_single(query_states, cos, sin, position_ids)
201
+ ###
202
+
203
+ if past_key_value is not None:
204
+ # reuse k, v, self_attention
205
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
206
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
207
+
208
+ past_key_value = (key_states, value_states) if use_cache else None
209
+
210
+ ### Shift Pos: key pos is the pos in cache
211
+ key_position_ids = torch.arange(kv_seq_len, device=position_ids.device).unsqueeze(0)
212
+ key_states = apply_rotary_pos_emb_single(key_states, cos, sin, key_position_ids)
213
+ ###
214
+
215
+ # repeat k/v heads if n_kv_heads < n_heads
216
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
217
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
218
+
219
+ if past_key_value is None:
220
+ qkv = torch.stack(
221
+ [query_states, key_states, value_states], dim=2
222
+ ) # [bsz, nh, 3, q_len, hd]
223
+ qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
224
+
225
+ key_padding_mask = torch.full((bsz, q_len), True, dtype=torch.bool, device=attention_mask.device)
226
+ nheads = qkv.shape[-2]
227
+ x = rearrange(qkv, "b s three h d -> b s (three h d)")
228
+ x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
229
+ x_unpad = rearrange(
230
+ x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads
231
+ )
232
+ output_unpad = flash_attn_varlen_qkvpacked_func(
233
+ x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
234
+ )
235
+ output = rearrange(
236
+ pad_input(
237
+ rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len
238
+ ),
239
+ "b s (h d) -> b s h d",
240
+ h=nheads,
241
+ )
242
+ output = output.reshape(bsz, q_len, self.num_heads, self.head_dim)
243
+
244
+ attn_output = self.o_proj(rearrange(output, "b s h d -> b s (h d)"))
245
+ attn_weights = None
246
+ else:
247
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(
248
+ self.head_dim
249
+ )
250
+
251
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
252
+ raise ValueError(
253
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
254
+ f" {attn_weights.size()}"
255
+ )
256
+
257
+ if attention_mask is not None:
258
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
259
+ raise ValueError(
260
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
261
+ )
262
+ attn_weights = attn_weights + attention_mask
263
+
264
+ # upcast attention to fp16
265
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float16).to( #torch.float32
266
+ query_states.dtype
267
+ )
268
+ attn_output = torch.matmul(attn_weights, value_states)
269
+
270
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
271
+ raise ValueError(
272
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
273
+ f" {attn_output.size()}"
274
+ )
275
+
276
+ attn_output = attn_output.transpose(1, 2).contiguous()
277
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
278
+
279
+ if self.config.pretraining_tp > 1:
280
+ attn_output = attn_output.split(
281
+ self.hidden_size // self.config.pretraining_tp, dim=2
282
+ )
283
+ o_proj_slices = self.o_proj.weight.split(
284
+ self.hidden_size // self.config.pretraining_tp, dim=1
285
+ )
286
+ attn_output = sum(
287
+ [
288
+ F.linear(attn_output[i], o_proj_slices[i])
289
+ for i in range(self.config.pretraining_tp)
290
+ ]
291
+ )
292
+ else:
293
+ attn_output = self.o_proj(attn_output)
294
+
295
+ if not output_attentions:
296
+ attn_weights = None
297
+
298
+ return attn_output, attn_weights, past_key_value
299
+
300
+
301
+ def enable_llama_pos_shift_attention(model, use_flash_attn=True):
302
+ for name, module in reversed(model._modules.items()):
303
+ if len(list(module.children())) > 0:
304
+ enable_llama_pos_shift_attention(
305
+ module,
306
+ )
307
+
308
+ if isinstance(module, LlamaAttention):
309
+ model._modules[name].forward = types.MethodType(
310
+ llama_pos_shift_attention_forward_flashattn if use_flash_attn else llama_pos_shift_attention_forward, model._modules[name]
311
+ )
gorilla/streaming_llm/utils.py ADDED
@@ -0,0 +1,112 @@
1
+ import torch
2
+ import argparse
3
+ from transformers import (
4
+ AutoTokenizer,
5
+ AutoModelForCausalLM,
6
+ )
7
+ import os.path as osp
8
+ import ssl
9
+ import urllib.request
10
+ import os
11
+ import json
12
+
13
+
14
+ def parse_args():
15
+ parser = argparse.ArgumentParser()
16
+ parser.add_argument(
17
+ "--model_name_or_path", type=str, default="models/llama/llama-7b"
18
+ )
19
+ parser.add_argument("--revision", type=str, default="main")
20
+ parser.add_argument("--tokenizer_name_or_path", type=str, default=None)
21
+ parser.add_argument("--dataset_name", type=str, default="wikitext")
22
+
23
+ parser.add_argument("--task", type=str, default="wikitext-2-raw-v1")
24
+ parser.add_argument(
25
+ "--split", type=str, default="test", choices=["validation", "test"]
26
+ )
27
+
28
+ parser.add_argument(
29
+ "--num_samples",
30
+ type=int,
31
+ default=1,
32
+ )
33
+
34
+ parser.add_argument(
35
+ "--output_dir",
36
+ type=str,
37
+ default="outputs/debug",
38
+ )
39
+
40
+ parser.add_argument("--enable_start_recent_kv_cache", action="store_true")
41
+ parser.add_argument("--start_size", type=int, default=1)
42
+ parser.add_argument("--recent_size", type=int, default=255)
43
+ parser.add_argument("--enable_pos_shift", action="store_true")
44
+
45
+ parser.add_argument("--num_eval_tokens", type=int, default=None)
46
+
47
+ args = parser.parse_args()
48
+ return args
49
+
50
+
51
+ def load(model_name_or_path):
52
+ print(f"Loading model from {model_name_or_path} ...")
53
+ # however, tensor parallel for running falcon will occur bugs
54
+ tokenizer = AutoTokenizer.from_pretrained(
55
+ model_name_or_path,
56
+ trust_remote_code=True,
57
+ )
58
+ model = AutoModelForCausalLM.from_pretrained(
59
+ model_name_or_path,
60
+ device_map="auto",
61
+ torch_dtype=torch.float16,
62
+ trust_remote_code=True,
63
+ )
64
+ if tokenizer.pad_token_id is None:
65
+ if tokenizer.eos_token_id is not None:
66
+ tokenizer.pad_token_id = tokenizer.eos_token_id
67
+ else:
68
+ tokenizer.pad_token_id = 0
69
+
70
+ model.eval()
71
+
72
+ return model, tokenizer
73
+
74
+
75
+ def download_url(url: str, folder="folder"):
76
+ """
77
+ Downloads the content of an url to a folder. Modified from \
78
+ https://github.com/pyg-team/pytorch_geometric/tree/master/torch_geometric
79
+
80
+ Args:
81
+ url (string): The url of target file.
82
+ folder (string): The target folder.
83
+
84
+ Returns:
85
+ string: File path of downloaded files.
86
+ """
87
+
88
+ file = url.rpartition("/")[2]
89
+ file = file if file[0] == "?" else file.split("?")[0]
90
+ path = osp.join(folder, file)
91
+ if osp.exists(path):
92
+ print(f"File {file} exists, use existing file.")
93
+ return path
94
+
95
+ print(f"Downloading {url}")
96
+ os.makedirs(folder, exist_ok=True)
97
+ ctx = ssl._create_unverified_context()
98
+ data = urllib.request.urlopen(url, context=ctx)
99
+ with open(path, "wb") as f:
100
+ f.write(data.read())
101
+
102
+ return path
103
+
104
+
105
+ def load_jsonl(
106
+ file_path,
107
+ ):
108
+ list_data_dict = []
109
+ with open(file_path, "r") as f:
110
+ for line in f:
111
+ list_data_dict.append(json.loads(line))
112
+ return list_data_dict
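For reference, a small sketch of how these helpers combine (the checkpoint id and URL are placeholders; download_url simply caches the file under the given folder):

from streaming_llm.utils import load, download_url, load_jsonl

model, tokenizer = load("meta-llama/Llama-2-7b-hf")  # placeholder checkpoint id
path = download_url("https://example.com/questions.jsonl", folder="data")  # placeholder URL
samples = load_jsonl(path)
print(f"loaded {len(samples)} records")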
gorilla/style.css ADDED
@@ -0,0 +1,16 @@
1
+ h1 {
2
+ text-align: center;
3
+ }
4
+
5
+ #duplicate-button {
6
+ margin: auto;
7
+ color: white;
8
+ background: #1565c0;
9
+ border-radius: 100vh;
10
+ }
11
+
12
+ .contain {
13
+ max-width: 900px;
14
+ margin: auto;
15
+ padding-top: 1.5rem;
16
+ }
gorilla/supervised-fine-tune-qlora.py ADDED
@@ -0,0 +1,345 @@
1
+ import io
2
+ import os
3
+ import copy
4
+ import json
5
+ import math
6
+ import logging
7
+ from dataclasses import dataclass, field
8
+ from typing import Dict, Optional, Sequence
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import transformers
13
+ from torch.utils.data import Dataset
14
+ from transformers import Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
15
+ from llama_attn_replace_sft import replace_llama_attn
16
+ from peft import LoraConfig, get_peft_model
17
+ from torch.distributed import barrier
18
+
19
+ IGNORE_INDEX = -100
20
+ DEFAULT_PAD_TOKEN = "[PAD]"
21
+ DEFAULT_EOS_TOKEN = "</s>"
22
+ DEFAULT_BOS_TOKEN = "<s>"
23
+ DEFAULT_UNK_TOKEN = "<unk>"
24
+
25
+ def _make_r_io_base(f, mode: str):
26
+ if not isinstance(f, io.IOBase):
27
+ f = open(f, mode=mode)
28
+ return f
29
+
30
+ def jload(f, mode="r"):
31
+ """Load a .json file into a dictionary."""
32
+ f = _make_r_io_base(f, mode)
33
+ jdict = json.load(f)
34
+ f.close()
35
+ return jdict
36
+
37
+ PROMPT_DICT = {
38
+ "prompt_input": (
39
+ "Below is an instruction that describes a task, paired with an input that provides further context. "
40
+ "Write a response that appropriately completes the request.\n\n"
41
+ "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
42
+ ),
43
+ "prompt_no_input": (
44
+ "Below is an instruction that describes a task. "
45
+ "Write a response that appropriately completes the request.\n\n"
46
+ "### Instruction:\n{instruction}\n\n### Response:"
47
+ ),
48
+ "prompt_no_input_llama2":(
49
+ "<s>[INST] <<SYS>>\n"
50
+ "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
51
+ "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
52
+ "<</SYS>> \n\n {instruction} [/INST]"
53
+ ),
54
+ "prompt_input_llama2": (
55
+ "<s>[INST] <<SYS>>\n"
56
+ "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
57
+ "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
58
+ "<</SYS>> \n\n {instruction} \n{input} [/INST]"
59
+ )
60
+ }
61
+
62
+
63
+ @dataclass
64
+ class ModelArguments:
65
+ model_name_or_path: Optional[str] = field(default="EleutherAI/pythia-1.4b-deduped")
66
+ model_type: Optional[str] = field(default="llama")
67
+
68
+
69
+ @dataclass
70
+ class DataArguments:
71
+ data_path: str = field(default=None, metadata={"help": "Path to the training data."})
72
+
73
+
74
+ @dataclass
75
+ class TrainingArguments(transformers.TrainingArguments):
76
+ cache_dir: Optional[str] = field(default=None)
77
+ optim: str = field(default="adamw_torch")
78
+ model_max_length: int = field(
79
+ default=8192,
80
+ metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
81
+ )
82
+ use_flash_attn: bool = field(
83
+ default=True,
84
+ metadata={"help": "Whether use flash attention for training."},
85
+ )
86
+ use_full_attn: bool = field(
87
+ default=False,
88
+ metadata={"help": "Whether to use plain, full-attention for training."},
89
+ )
90
+ low_rank_training: bool = field(
91
+ default=True,
92
+ metadata={"help": "Whether use low rank adaptation for training."},
93
+ )
94
+ trainable_params: str = field(
95
+ default="embed,norm",
96
+ metadata={"help": "Additional trainable parameters except LoRA weights, if low rank training."},
97
+ )
98
+
99
+ def smart_tokenizer_and_embedding_resize(
100
+ special_tokens_dict: Dict,
101
+ tokenizer: transformers.PreTrainedTokenizer,
102
+ model: transformers.PreTrainedModel,
103
+ ):
104
+ """Resize tokenizer and embedding.
105
+
106
+ Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
107
+ """
108
+ num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
109
+ model.resize_token_embeddings(len(tokenizer))
110
+
111
+ if num_new_tokens > 0:
112
+ input_embeddings = model.get_input_embeddings().weight.data
113
+ output_embeddings = model.get_output_embeddings().weight.data
114
+
115
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
116
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
117
+
118
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
119
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
120
+
121
+
122
+ def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
123
+ """Tokenize a list of strings."""
124
+ tokenized_list = [
125
+ tokenizer(
126
+ text,
127
+ return_tensors="pt",
128
+ padding="longest",
129
+ max_length=tokenizer.model_max_length,
130
+ truncation=True,
131
+ )
132
+ for text in strings
133
+ ]
134
+ input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
135
+ input_ids_lens = labels_lens = [
136
+ tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
137
+ ]
138
+ return dict(
139
+ input_ids=input_ids,
140
+ labels=labels,
141
+ input_ids_lens=input_ids_lens,
142
+ labels_lens=labels_lens,
143
+ )
144
+
145
+
146
+ def preprocess(
147
+ sources: Sequence[str],
148
+ targets: Sequence[str],
149
+ tokenizer: transformers.PreTrainedTokenizer,
150
+ ) -> Dict:
151
+ """Preprocess the data by tokenizing."""
152
+ examples = [s + t for s, t in zip(sources, targets)]
153
+ examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
154
+ input_ids = examples_tokenized["input_ids"]
155
+ labels = copy.deepcopy(input_ids)
156
+ for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
157
+ label[:source_len] = IGNORE_INDEX
158
+ return dict(input_ids=input_ids, labels=labels)
159
+
160
+
161
+ class SupervisedDataset(Dataset):
162
+ """Dataset for supervised fine-tuning."""
163
+
164
+ def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
165
+ super(SupervisedDataset, self).__init__()
166
+ logging.warning("Loading line item and alm data...")
167
+ list_data_dict = jload(data_path)
168
+
169
+ logging.warning("Formatting inputs...")
170
+
171
+ prompt_input, prompt_no_input = PROMPT_DICT["prompt_input_llama2"], PROMPT_DICT["prompt_no_input_llama2"]
172
+ sources = [
173
+ prompt_input.format_map(example) if example.get("input", "") != "" else prompt_no_input.format_map(example)
174
+ for example in list_data_dict
175
+ ]
176
+
177
+ targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]
178
+
179
+ logging.warning("Tokenizing inputs... This may take some time...")
180
+ data_dict = preprocess(sources, targets, tokenizer)
181
+
182
+ self.input_ids = data_dict["input_ids"]
183
+ self.labels = data_dict["labels"]
184
+
185
+ def __len__(self):
186
+ return len(self.input_ids)
187
+
188
+ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
189
+ return dict(input_ids=self.input_ids[i], labels=self.labels[i])
190
+
191
+
192
+ @dataclass
193
+ class DataCollatorForSupervisedDataset(object):
194
+ """Collate examples for supervised fine-tuning."""
195
+
196
+ tokenizer: transformers.PreTrainedTokenizer
197
+
198
+ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
199
+ input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
200
+ input_ids = torch.nn.utils.rnn.pad_sequence(
201
+ input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
202
+ )
203
+ labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
204
+ return dict(
205
+ input_ids=input_ids,
206
+ labels=labels,
207
+ attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
208
+ )
209
+
210
+
211
+ def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
212
+ """Make dataset and collator for supervised fine-tuning."""
213
+ train_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path)
214
+ data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
215
+ return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)
216
+
217
+
218
+ def train():
219
+ parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
220
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
221
+
222
+ # NOTE: May expand supported model types in the future
223
+ # if model_args.model_type == "gpt-neox":
224
+ # replace_gpt_neox_attn(training_args.use_flash_attn, training_args.use_full_attn)
225
+ # else:
226
+ replace_llama_attn(training_args.use_flash_attn, training_args.use_full_attn)
227
+
228
+ # Set RoPE scaling factor
229
+ config = transformers.AutoConfig.from_pretrained(
230
+ model_args.model_name_or_path,
231
+ cache_dir=training_args.cache_dir,
232
+ )
233
+
234
+ orig_rope_scaling = getattr(config, "rope_scaling", {"factor": 1})
235
+ # Check if orig_rope_scaling is a dictionary before accessing its "get" method
236
+ if isinstance(orig_rope_scaling, dict):
237
+ orig_rope_scaling_factor = orig_rope_scaling.get("factor", 1)
238
+ else:
239
+ orig_rope_scaling_factor = 1 #orig_rope_scaling["factor"] if "factor" in orig_rope_scaling.keys() else 1
240
+
241
+ orig_ctx_len = getattr(config, "max_position_embeddings", None)
242
+ if orig_ctx_len:
243
+ orig_ctx_len *= orig_rope_scaling_factor
244
+ if training_args.model_max_length > orig_ctx_len:
245
+ scaling_factor = float(math.ceil(training_args.model_max_length / orig_ctx_len))
246
+ config.rope_scaling = {"type": "linear", "factor": scaling_factor}
247
+
248
+ # Load model and tokenizer
249
+ model = transformers.AutoModelForCausalLM.from_pretrained(
250
+ model_args.model_name_or_path,
251
+ config=config,
252
+ cache_dir=training_args.cache_dir,
253
+ torch_dtype=torch.bfloat16,
254
+ quantization_config=BitsAndBytesConfig(
255
+ load_in_4bit=True,
256
+ llm_int8_threshold=6.0,
257
+ llm_int8_has_fp16_weight=False,
258
+ bnb_4bit_compute_dtype=torch.bfloat16,
259
+ bnb_4bit_use_double_quant=True,
260
+ bnb_4bit_quant_type="nf4",
261
+ ),
262
+ )
263
+
264
+ for param in model.parameters():
265
+ param.requires_grad = False # freeze the model - train adapters later
266
+ if param.ndim == 1:
267
+ # cast the small 1-D parameters (e.g. layernorm) to half precision here (the original recipe uses fp32 for stability)
268
+ param.data = param.data.to(torch.float16) #torch.float32
269
+
270
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
271
+ model_args.model_name_or_path,
272
+ cache_dir=training_args.cache_dir,
273
+ model_max_length=training_args.model_max_length,
274
+ padding_side="right",
275
+ use_fast=True,
276
+ )
277
+
278
+ special_tokens_dict = dict()
279
+ if tokenizer.pad_token is None:
280
+ special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
281
+ if tokenizer.eos_token is None:
282
+ special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
283
+ if tokenizer.bos_token is None:
284
+ special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
285
+ if tokenizer.unk_token is None:
286
+ special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
287
+
288
+ smart_tokenizer_and_embedding_resize(
289
+ special_tokens_dict=special_tokens_dict,
290
+ tokenizer=tokenizer,
291
+ model=model,
292
+ )
293
+
294
+ data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
295
+
296
+ if training_args.low_rank_training:
297
+ if model_args.model_type == "gpt-neox":
298
+ # added `dense` to match with llama as the basic LoRA would only target 'query_key_value'
299
+ targets = ["query_key_value", "dense"]
300
+ else:
301
+ targets=["q_proj", "k_proj", "v_proj", "o_proj"]
302
+
303
+ config = LoraConfig(
304
+ r=8,
305
+ lora_alpha=16,
306
+ target_modules=targets,
307
+ lora_dropout=0,
308
+ bias="none",
309
+ task_type="CAUSAL_LM",
310
+ )
311
+ model = get_peft_model(model, config)
312
+ # enable trainable params
313
+ [p.requires_grad_() for n, p in model.named_parameters() if any([k in n for k in training_args.trainable_params.split(",")])]
314
+
315
+ class CastOutputToFloat(nn.Sequential):
316
+ def forward(self, x):
317
+ return super().forward(x).to(torch.float16) #torch.float32
318
+
319
+ model.lm_head = CastOutputToFloat(model.lm_head)
320
+
321
+ # Verifying the datatypes.
322
+ dtypes = {}
323
+ for _, p in model.named_parameters():
324
+ dtype = p.dtype
325
+ if dtype not in dtypes:
326
+ dtypes[dtype] = 0
327
+ dtypes[dtype] += p.numel()
328
+ total = 0
329
+ for k, v in dtypes.items():
330
+ total += v
331
+ for k, v in dtypes.items():
332
+ print(k, v, v / total)
333
+
334
+ model.config.use_cache = False # caching is incompatible with gradient checkpointing and must be disabled
335
+ model.enable_input_require_grads() # required for gradient checkpointing
336
+ model.gradient_checkpointing_enable() # enable gradient checkpointing
337
+
338
+ trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
339
+ trainer.train()
340
+ trainer.save_state()
341
+ trainer.save_model(output_dir=training_args.output_dir)
342
+
343
+
344
+ if __name__ == "__main__":
345
+ train()
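
The RoPE-scaling block in this script (and in supervised-fine-tune.py below) linearly stretches the base model's context window whenever model_max_length exceeds the pretrained max_position_embeddings. A minimal sketch of the arithmetic, assuming a 4096-token base context purely for illustration:

import math

# Assumed example values; in the scripts they come from the base model's config
# and from TrainingArguments.model_max_length (8192 in supervised-fine-tune-qlora.py).
orig_ctx_len = 4096
model_max_length = 8192

if model_max_length > orig_ctx_len:
    scaling_factor = float(math.ceil(model_max_length / orig_ctx_len))  # -> 2.0
    rope_scaling = {"type": "linear", "factor": scaling_factor}
    # the updated config is then passed to AutoModelForCausalLM.from_pretrained(..., config=config)
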
gorilla/supervised-fine-tune.py ADDED
@@ -0,0 +1,330 @@
1
+ import io
2
+ import os
3
+ import copy
4
+ import json
5
+ import math
6
+ import logging
7
+ # %%
8
+ import pandas as pd
9
+
10
+ # print('Loading line item and alm data')
11
+ # df_i = pd.read_csv('/home/tosi-n/ark/data/jack_line_item_ner_task.csv', sep='\t')[['context', 'instruction', 'response']]
12
+ # df_ii = pd.read_csv('/home/tosi-n/ark/data/alm_task_data.csv')[['context', 'instruction', 'response']]
13
+ # df = pd.concat([df_i, df_ii], ignore_index=True)
14
+ # # rename columns context and response to input and output
15
+ # df = df.rename(columns={'context':'input', 'response':'output'})
16
+
17
+ # # Replace NoneType with empty string
18
+ # df = df.fillna('')
19
+ # # produce a list of dictionaries
20
+ # list_data_dict = df.to_dict('records')
21
+ # import json
22
+ # with open('/home/tosi-n/ark/data/line_item_and_alm_data.json', 'w') as f:
23
+ # json.dump(list_data_dict, f)
24
+
25
+ # %%
26
+ from dataclasses import dataclass, field
27
+ from typing import Dict, Optional, Sequence
28
+
29
+ import torch
30
+ import transformers
31
+ from torch.utils.data import Dataset
32
+ from transformers import Trainer, DataCollatorForLanguageModeling
33
+ from llama_attn_replace_sft import replace_llama_attn
34
+ from peft import LoraConfig, get_peft_model
35
+ from torch.distributed import barrier
36
+
37
+ IGNORE_INDEX = -100
38
+ DEFAULT_PAD_TOKEN = "[PAD]"
39
+ DEFAULT_EOS_TOKEN = "</s>"
40
+ DEFAULT_BOS_TOKEN = "<s>"
41
+ DEFAULT_UNK_TOKEN = "<unk>"
42
+
43
+ def _make_r_io_base(f, mode: str):
44
+ if not isinstance(f, io.IOBase):
45
+ f = open(f, mode=mode)
46
+ return f
47
+
48
+ def jload(f, mode="r"):
49
+ """Load a .json file into a dictionary."""
50
+ f = _make_r_io_base(f, mode)
51
+ jdict = json.load(f)
52
+ f.close()
53
+ return jdict
54
+
55
+ PROMPT_DICT = {
56
+ "prompt_input": (
57
+ "Below is an instruction that describes a task, paired with an input that provides further context. "
58
+ "Write a response that appropriately completes the request.\n\n"
59
+ "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
60
+ ),
61
+ "prompt_no_input": (
62
+ "Below is an instruction that describes a task. "
63
+ "Write a response that appropriately completes the request.\n\n"
64
+ "### Instruction:\n{instruction}\n\n### Response:"
65
+ ),
66
+ "prompt_no_input_llama2":(
67
+ "<s>[INST] <<SYS>>\n"
68
+ "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
69
+ "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
70
+ "<</SYS>> \n\n {instruction} [/INST]"
71
+ ),
72
+ "prompt_input_llama2": (
73
+ "<s>[INST] <<SYS>>\n"
74
+ "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
75
+ "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
76
+ "<</SYS>> \n\n {instruction} \n{input} [/INST]"
77
+ )
78
+ }
79
+
80
+
81
+ @dataclass
82
+ class ModelArguments:
83
+ model_name_or_path: Optional[str] = field(default="EleutherAI/pythia-1.4b-deduped")
84
+ model_type: Optional[str] = field(default="llama")
85
+
86
+
87
+ @dataclass
88
+ class DataArguments:
89
+ data_path: str = field(default=None, metadata={"help": "Path to the training data."})
90
+
91
+
92
+ @dataclass
93
+ class TrainingArguments(transformers.TrainingArguments):
94
+ cache_dir: Optional[str] = field(default=None)
95
+ optim: str = field(default="adamw_torch")
96
+ model_max_length: int = field(
97
+ default=8192 * 4,
98
+ metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
99
+ )
100
+ use_flash_attn: bool = field(
101
+ default=True,
102
+ metadata={"help": "Whether use flash attention for training."},
103
+ )
104
+ use_full_attn: bool = field(
105
+ default=False,
106
+ metadata={"help": "Whether to use plain, full-attention for training."},
107
+ )
108
+ low_rank_training: bool = field(
109
+ default=True,
110
+ metadata={"help": "Whether use low rank adaptation for training."},
111
+ )
112
+ trainable_params: str = field(
113
+ default="embed,norm",
114
+ metadata={"help": "Additional trainable parameters except LoRA weights, if low rank training."},
115
+ )
116
+
117
+ def smart_tokenizer_and_embedding_resize(
118
+ special_tokens_dict: Dict,
119
+ tokenizer: transformers.PreTrainedTokenizer,
120
+ model: transformers.PreTrainedModel,
121
+ ):
122
+ """Resize tokenizer and embedding.
123
+
124
+ Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
125
+ """
126
+ num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
127
+ model.resize_token_embeddings(len(tokenizer))
128
+
129
+ if num_new_tokens > 0:
130
+ input_embeddings = model.get_input_embeddings().weight.data
131
+ output_embeddings = model.get_output_embeddings().weight.data
132
+
133
+ input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
134
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
135
+
136
+ input_embeddings[-num_new_tokens:] = input_embeddings_avg
137
+ output_embeddings[-num_new_tokens:] = output_embeddings_avg
138
+
139
+
140
+ def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
141
+ """Tokenize a list of strings."""
142
+ tokenized_list = [
143
+ tokenizer(
144
+ text,
145
+ return_tensors="pt",
146
+ padding="longest",
147
+ max_length=tokenizer.model_max_length,
148
+ truncation=True,
149
+ )
150
+ for text in strings
151
+ ]
152
+ input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
153
+ input_ids_lens = labels_lens = [
154
+ tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
155
+ ]
156
+ return dict(
157
+ input_ids=input_ids,
158
+ labels=labels,
159
+ input_ids_lens=input_ids_lens,
160
+ labels_lens=labels_lens,
161
+ )
162
+
163
+
164
+ def preprocess(
165
+ sources: Sequence[str],
166
+ targets: Sequence[str],
167
+ tokenizer: transformers.PreTrainedTokenizer,
168
+ ) -> Dict:
169
+ """Preprocess the data by tokenizing."""
170
+ examples = [s + t for s, t in zip(sources, targets)]
171
+ examples_tokenized, sources_tokenized = [_tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
172
+ input_ids = examples_tokenized["input_ids"]
173
+ labels = copy.deepcopy(input_ids)
174
+ for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
175
+ label[:source_len] = IGNORE_INDEX
176
+ return dict(input_ids=input_ids, labels=labels)
177
+
178
+
179
+ class SupervisedDataset(Dataset):
180
+ """Dataset for supervised fine-tuning."""
181
+
182
+ def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
183
+ super(SupervisedDataset, self).__init__()
184
+ logging.warning("Loading line item and alm data...")
185
+ list_data_dict = jload(data_path)
186
+
187
+ logging.warning("Formatting inputs...")
188
+
189
+ prompt_input, prompt_no_input = PROMPT_DICT["prompt_input_llama2"], PROMPT_DICT["prompt_no_input_llama2"]
190
+ sources = [
191
+ prompt_input.format_map(example) if example.get("input", "") != "" else prompt_no_input.format_map(example)
192
+ for example in list_data_dict
193
+ ]
194
+
195
+ targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]
196
+
197
+ logging.warning("Tokenizing inputs... This may take some time...")
198
+ data_dict = preprocess(sources, targets, tokenizer)
199
+
200
+ self.input_ids = data_dict["input_ids"]
201
+ self.labels = data_dict["labels"]
202
+
203
+ def __len__(self):
204
+ return len(self.input_ids)
205
+
206
+ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
207
+ return dict(input_ids=self.input_ids[i], labels=self.labels[i])
208
+
209
+
210
+ @dataclass
211
+ class DataCollatorForSupervisedDataset(object):
212
+ """Collate examples for supervised fine-tuning."""
213
+
214
+ tokenizer: transformers.PreTrainedTokenizer
215
+
216
+ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
217
+ input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
218
+ input_ids = torch.nn.utils.rnn.pad_sequence(
219
+ input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
220
+ )
221
+ labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
222
+ return dict(
223
+ input_ids=input_ids,
224
+ labels=labels,
225
+ attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
226
+ )
227
+
228
+
229
+ def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
230
+ """Make dataset and collator for supervised fine-tuning."""
231
+ train_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path)
232
+ data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
233
+ return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)
234
+
235
+
236
+ def train():
237
+ parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
238
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
239
+
240
+ # NOTE: May expand supported model types in the future
241
+ # if model_args.model_type == "gpt-neox":
242
+ # replace_gpt_neox_attn(training_args.use_flash_attn, training_args.use_full_attn)
243
+ # else:
244
+ replace_llama_attn(training_args.use_flash_attn, training_args.use_full_attn)
245
+
246
+ # Set RoPE scaling factor
247
+ config = transformers.AutoConfig.from_pretrained(
248
+ model_args.model_name_or_path,
249
+ cache_dir=training_args.cache_dir,
250
+ )
251
+
252
+ orig_rope_scaling = getattr(config, "rope_scaling", {"factor": 1})
253
+ # Check if orig_rope_scaling is a dictionary before accessing its "get" method
254
+ if isinstance(orig_rope_scaling, dict):
255
+ orig_rope_scaling_factor = orig_rope_scaling.get("factor", 1)
256
+ else:
257
+ orig_rope_scaling_factor = 1 #orig_rope_scaling["factor"] if "factor" in orig_rope_scaling.keys() else 1
258
+
259
+ orig_ctx_len = getattr(config, "max_position_embeddings", None)
260
+ if orig_ctx_len:
261
+ orig_ctx_len *= orig_rope_scaling_factor
262
+ if training_args.model_max_length > orig_ctx_len:
263
+ scaling_factor = float(math.ceil(training_args.model_max_length / orig_ctx_len))
264
+ config.rope_scaling = {"type": "linear", "factor": scaling_factor}
265
+
266
+ # Load model and tokenizer
267
+ model = transformers.AutoModelForCausalLM.from_pretrained(
268
+ model_args.model_name_or_path,
269
+ config=config,
270
+ cache_dir=training_args.cache_dir,
271
+ torch_dtype=torch.bfloat16,
272
+ )
273
+
274
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
275
+ model_args.model_name_or_path,
276
+ cache_dir=training_args.cache_dir,
277
+ model_max_length=training_args.model_max_length,
278
+ padding_side="right",
279
+ use_fast=True,
280
+ )
281
+
282
+ special_tokens_dict = dict()
283
+ if tokenizer.pad_token is None:
284
+ special_tokens_dict["pad_token"] = DEFAULT_PAD_TOKEN
285
+ if tokenizer.eos_token is None:
286
+ special_tokens_dict["eos_token"] = DEFAULT_EOS_TOKEN
287
+ if tokenizer.bos_token is None:
288
+ special_tokens_dict["bos_token"] = DEFAULT_BOS_TOKEN
289
+ if tokenizer.unk_token is None:
290
+ special_tokens_dict["unk_token"] = DEFAULT_UNK_TOKEN
291
+
292
+ smart_tokenizer_and_embedding_resize(
293
+ special_tokens_dict=special_tokens_dict,
294
+ tokenizer=tokenizer,
295
+ model=model,
296
+ )
297
+
298
+ data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
299
+
300
+ if training_args.low_rank_training:
301
+ if model_args.model_type == "gpt-neox":
302
+ # added `dense` to match with llama as the basic LoRA would only target 'query_key_value'
303
+ targets = ["query_key_value", "dense"]
304
+ else:
305
+ targets=["q_proj", "k_proj", "v_proj", "o_proj"]
306
+
307
+ config = LoraConfig(
308
+ r=8,
309
+ lora_alpha=16,
310
+ target_modules=targets,
311
+ lora_dropout=0,
312
+ bias="none",
313
+ task_type="CAUSAL_LM",
314
+ )
315
+ model = get_peft_model(model, config)
316
+ # enable trainable params
317
+ [p.requires_grad_() for n, p in model.named_parameters() if any([k in n for k in training_args.trainable_params.split(",")])]
318
+
319
+ model.config.use_cache = False # required for gradient checkpointing
320
+ model.enable_input_require_grads() # required for gradient checkpointing
321
+ model.gradient_checkpointing_enable() # enable gradient checkpointing
322
+
323
+ trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
324
+ trainer.train()
325
+ trainer.save_state()
326
+ trainer.save_model(output_dir=training_args.output_dir)
327
+
328
+
329
+ if __name__ == "__main__":
330
+ train()
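
Both fine-tuning scripts expect --data_path to point to a JSON file holding a list of records with "instruction", "input" and "output" fields; the commented-out pandas block at the top of supervised-fine-tune.py produces exactly this shape. A minimal sketch of building such a file, with record contents invented purely for illustration:

import json

# Hypothetical records; SupervisedDataset reads "instruction", "input" (may be empty) and "output".
records = [
    {
        "instruction": "Extract the line items from the invoice text.",
        "input": "2 x Widget A @ 5.00\n1 x Widget B @ 12.50",
        "output": "Widget A | 2 | 5.00\nWidget B | 1 | 12.50",
    },
    {
        # An empty "input" routes the example through PROMPT_DICT["prompt_no_input_llama2"].
        "instruction": "Explain what an asset-liability mismatch is.",
        "input": "",
        "output": "An asset-liability mismatch occurs when ...",
    },
]

with open("data/line_item_and_alm_data.json", "w") as f:
    json.dump(records, f)
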